In [40]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from nltk.classify import SklearnClassifier
from nltk import word_tokenize
from nltk.corpus import stopwords
from random import shuffle
from sklearn.pipeline import Pipeline
import re, string

In [41]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [42]:
# QUESTION 1
def parse_label(label):
    if label == '__label2__':
        return 'real'
    else:
        return 'fake'    

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    return reviewLine[0], reviewLine[8], parse_label(reviewLine[1])


In [52]:
#TODO!!!!!!!
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
# stop = set(stopwords.words('english'))
translator=str.maketrans('','',string.punctuation)
# Input: a string of one review
def preProcess(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.translate(translator)
    text = text.lower()
    tokens = re.split(r"\s+",text)
    return tokens

In [44]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    v = {}
    for t in tokens:
        try:
            featureDict[t] += 1
        except KeyError:            
            featureDict[t] = 1
        try:
            v[t] += (1.0/len(tokens))
        except KeyError:
            v[t] = (1.0/len(tokens))
    return v

In [45]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [55]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    predictions = []
    ground_truth = []
    foldSize = int(len(dataset)/folds)
    
    for i in range(0,len(dataset), foldSize):
        trainData = []
#         trainRawData = []
#         testRawData = []
        trainRawData = dataset[:i] + dataset[i+foldSize:]        
        testRawData = dataset[i:i+foldSize]

        for (_, Text, Label) in trainRawData:
            trainData.append((toFeatureVector(preProcess(Text)),Label))        
        classifier = trainClassifier(trainData)     
                
        predictions.append(predictLabels(testRawData, classifier))
        ground_truth.append([ l[2] for l in testRawData])

    return ground_truth, predictions

In [54]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))
#     return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample[1])))

def flatten(lst):
    for el in lst:
        if isinstance(el, list):
            yield from el
        else:
            yield el

In [56]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
# splitData(0.8)
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
ground_truth, predictions = crossValidate(rawData, 10)
ground_truth = list(flatten(ground_truth))
predictions = list(flatten(predictions))
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
print('Accuracy: ' + str(round(100*accuracy_score(ground_truth, predictions), 2)) + '%')
print('Confusion Matrix: ')
tn, fp, fn, tp = confusion_matrix(ground_truth, predictions).ravel()
print('TP: ' + str(tp))
print('TN: ' + str(tn))
print('FP: ' + str(fp))
print('FP: ' + str(fp))


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Training Classifier...


AttributeError: 'str' object has no attribute 'items'

In [49]:
# predictions = predictLabels(testData, classifier)