In [1]:
import csv
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from nltk.classify import SklearnClassifier
from nltk import word_tokenize
from nltk.corpus import stopwords
from random import shuffle
from sklearn.pipeline import Pipeline
import re, string

In [2]:
# Load the tab-separated review file and append every row to the global lists
def loadData(path, Text=None):
    """Read the review file at ``path`` into ``rawData`` and ``preprocessedData``.

    The first row is treated as a header and skipped. Every following row is
    converted by ``parseReview`` into an (id, text, label) triple, which is
    appended to the global ``rawData``; the same triple with the text replaced
    by ``preProcess(text)`` is appended to the global ``preprocessedData``.

    Args:
        path: location of the tab-separated review file.
        Text: unused; kept only so existing callers keep working.

    Returns:
        None. The results are the appends to the two global lists.
    """
    # The csv module expects files opened with newline='' so that it can do
    # its own line-ending handling.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers
        for line in reader:
            # Distinct local names so the unused ``Text`` parameter is not overwritten.
            (reviewId, reviewText, label) = parseReview(line)
            rawData.append((reviewId, reviewText, label))
            preprocessedData.append((reviewId, preProcess(reviewText), label))
        
def splitData(percentage):
    """Distribute the rows of ``rawData`` over ``trainData`` and ``testData``.

    ``rawData`` is treated as two halves. From each half the first
    ``percentage * len(rawData) / 2`` rows go to ``trainData`` and the
    remaining rows go to ``testData``. Each stored sample is
    ``(id, toFeatureVector(preProcess(text)), label)``.
    NOTE(review): this presumably relies on the file listing one class in the
    first half and the other class in the second half - confirm against the data.

    Args:
        percentage: fraction of the data (0..1) that becomes training data.
    """
    dataSamples = len(rawData)
    halfOfData = int(dataSamples/2)
    trainingSamples = int((percentage*dataSamples)/2)

    trainRows = rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]
    testRows = rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]

    # Training rows are featurized first, then test rows.
    for destination, rows in ((trainData, trainRows), (testData, testRows)):
        for (reviewId, reviewText, label) in rows:
            destination.append((reviewId, toFeatureVector(preProcess(reviewText)), label))

In [3]:
# QUESTION 1
def parse_label(label):
    """Translate a raw dataset label into the class name used for training.

    Returns 'real' for '__label2__' and 'fake' for every other value.
    """
    if label == '__label2__':
        return 'real'
    else:
        return 'fake'

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    """Turn one row of the review file into an (id, text, label) triple.

    Args:
        reviewLine: sequence of column values for one row; column 0 holds the
            id, column 1 the raw label and column 8 the review text.

    Returns:
        A triple of an integer id, the review text as a string, and the label
        string produced by ``parse_label``.

    Raises:
        ValueError: if column 0 is not an integer literal.
        IndexError: if the row has fewer than nine columns.
    """
    # The csv reader yields strings; convert the id so the documented
    # "integer" contract actually holds.
    return int(reviewLine[0]), reviewLine[8], parse_label(reviewLine[1])


In [4]:
#TODO!!!!!!!
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
# stop = set(stopwords.words('english'))
# Translation table that deletes ASCII punctuation. preProcess does not use it:
# the regex there already removes every character in string.punctuation.
# It stays defined in case another cell refers to it.
translator=str.maketrans('','',string.punctuation)
# Input: a string of one review
def preProcess(text):
    """Normalise one review and split it into tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, the text is lower-cased, and the result is split on runs of
    whitespace.

    Args:
        text: the review as a single string.

    Returns:
        A list of non-empty lower-case tokens; an empty list when the text
        holds no letters or digits.
    """
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    # str.split() without arguments drops the empty strings that
    # re.split(r"\s+", ...) produced for leading/trailing whitespace and for
    # empty input; those would otherwise become a bogus '' feature.
    return text.split()

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    """Build a relative-frequency feature vector for one tokenised review.

    Each occurrence of a token adds ``1.0 / len(tokens)`` to that token's
    weight. As a side effect, every occurrence also increments the token's
    count in the global ``featureDict``.

    Args:
        tokens: list of token strings.

    Returns:
        A dictionary mapping each distinct token to its weight; empty when
        ``tokens`` is empty.
    """
    vector = {}
    for token in tokens:
        featureDict[token] = featureDict.get(token, 0) + 1
        vector[token] = vector.get(token, 0.0) + (1.0/len(tokens))
    return vector

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Fit a linear SVM on the given samples and return the trained classifier.

    Args:
        trainData: the labelled samples handed to ``SklearnClassifier.train``;
            the callers in this notebook pass (feature dictionary, label) pairs.

    Returns:
        The value returned by ``SklearnClassifier(...).train(trainData)``.
    """
    print("Training Classifier...")
    steps = [('svc', LinearSVC())]
    wrapped = SklearnClassifier(Pipeline(steps))
    return wrapped.train(trainData)

In [7]:
# QUESTION 3

def crossValidate(dataset, folds):
    """Run k-fold cross-validation and collect the labels for each fold.

    The dataset is shuffled in place, then cut into exactly ``folds``
    contiguous validation folds whose sizes differ by at most one sample.
    For each fold a classifier is trained on the remaining samples and used
    to predict the held-out ones.

    Args:
        dataset: list of (id, feature dictionary, label) samples. The list is
            shuffled in place.
        folds: number of folds; must be at least 2 and at most len(dataset).

    Returns:
        A pair ``(ground_truth, predictions)``. Each is a list with one entry
        per fold; entry k holds the true labels, respectively the predicted
        labels, of fold k in matching order.

    Raises:
        ValueError: if ``folds`` is smaller than 2 or larger than the dataset.
    """
    if folds < 2 or folds > len(dataset):
        raise ValueError(
            "folds must be between 2 and len(dataset) (%d), got %r" % (len(dataset), folds))

    shuffle(dataset)
    predictions = []
    ground_truth = []

    # Spread the remainder over the first folds. Stepping by a fixed fold size
    # would create an extra undersized fold whenever len(dataset) is not
    # divisible by folds.
    baseSize, remainder = divmod(len(dataset), folds)
    start = 0
    for foldIndex in range(folds):
        stop = start + baseSize + (1 if foldIndex < remainder else 0)
        validationFold = dataset[start:stop]
        trainFolds = dataset[:start] + dataset[stop:]

        # The classifier is trained on (features, label) pairs; the id is dropped.
        training_set = [(t[1], t[2]) for t in trainFolds]
        classifier = trainClassifier(training_set)
        predictions.append(predictLabels(validationFold, classifier))
        ground_truth.append([l[2] for l in validationFold])
        start = stop

    return ground_truth, predictions

In [10]:
# Containers for the second experiment: parsed rows, featurized training
# samples, and test samples (nothing in this cell fills testData2).
rawData2, trainData2, testData2 = [], [], []
# Tab-separated review file read by loadData2
reviewPath = 'amazon_reviews.txt'

def loadData2(path, Text=None):
    """Read the review file at ``path`` into ``rawData2`` and ``trainData2``.

    The first row is treated as a header and skipped. Every following row is
    converted by ``parseReview`` into an (id, text, label) triple, which is
    appended to the global ``rawData2``; a (feature dictionary, label) pair
    built with ``toFeatureVector(preProcess(text))`` is appended to the global
    ``trainData2``.

    Args:
        path: location of the tab-separated review file.
        Text: unused; kept only so existing callers keep working.

    Returns:
        None. The results are the appends to the two global lists.
    """
    # The csv module expects files opened with newline='' so that it can do
    # its own line-ending handling.
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers
        for line in reader:
            # Distinct local names so the unused ``Text`` parameter is not overwritten.
            (reviewId, reviewText, label) = parseReview(line)
            rawData2.append((reviewId, reviewText, label))
            trainData2.append((toFeatureVector(preProcess(reviewText)), label))
            
def crossValidate2(dataset, folds):
    """Run k-fold cross-validation and return the accuracy of every fold.

    The dataset is shuffled in place, then cut into exactly ``folds``
    contiguous validation folds whose sizes differ by at most one sample.
    For each fold a classifier is trained on the remaining samples and scored
    on the held-out ones with ``accuracy_score``.

    Args:
        dataset: list of (feature dictionary, label) samples. The list is
            shuffled in place.
        folds: number of folds; must be at least 2 and at most len(dataset).

    Returns:
        A list with one accuracy value per fold.

    Raises:
        ValueError: if ``folds`` is smaller than 2 or larger than the dataset.
    """
    if folds < 2 or folds > len(dataset):
        raise ValueError(
            "folds must be between 2 and len(dataset) (%d), got %r" % (len(dataset), folds))

    shuffle(dataset)
    results = []

    # Spread the remainder over the first folds. Stepping by a fixed fold size
    # would create an extra undersized fold whenever len(dataset) is not
    # divisible by folds.
    baseSize, remainder = divmod(len(dataset), folds)
    start = 0
    for foldIndex in range(folds):
        stop = start + baseSize + (1 if foldIndex < remainder else 0)
        heldOut = dataset[start:stop]
        clf = trainClassifier(dataset[:start] + dataset[stop:])
        pLabel = predictLabels2(heldOut, clf)
        yLabel = [l[1] for l in heldOut]
        results.append(accuracy_score(yLabel, pLabel))
        start = stop
    return results

def predictLabels2(tweetData, classifier):
    """Classify a batch of (feature dictionary, label) samples.

    Only the first element of every sample is handed to the classifier.

    Args:
        tweetData: iterable of samples whose element 0 is the feature set.
        classifier: object providing ``classify_many``.

    Returns:
        Whatever ``classifier.classify_many`` returns for the feature sets.
    """
    featureSets = (sample[0] for sample in tweetData)
    return classifier.classify_many(featureSets)

# Parse the review file; this fills rawData2 and trainData2.
loadData2(reviewPath)
print(
    "Now %d rawData, %d trainData, %d testData"
    % (len(rawData2), len(trainData2), len(testData2)),
    "Preparing training and test data...",
    sep='\n',
)
# 10-fold cross-validation over all featurized samples, then the mean fold accuracy.
cv_results = crossValidate2(trainData2, 10)
print(sum(cv_results) / len(cv_results))

Now 21000 rawData, 21000 trainData, 0 testData
Preparing training and test data...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Classify a batch of samples whose feature vectors are already built.

    Element 1 of every sample is passed to the classifier unchanged, so the
    samples must already carry their feature dictionaries in that position.

    Args:
        reviewSamples: iterable of samples indexable at position 1.
        classifier: object providing ``classify_many``.

    Returns:
        Whatever ``classifier.classify_many`` returns for the feature sets.
    """
    featureSets = []
    for sample in reviewSamples:
        featureSets.append(sample[1])
    return classifier.classify_many(featureSets)
#     return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    """Classify a single raw review.

    Element 1 of the sample is taken as the review text; it is tokenised with
    ``preProcess`` and turned into a feature vector with ``toFeatureVector``
    before being classified.

    Args:
        reviewSample: sample whose element 1 is the raw review text.
        classifier: object providing ``classify``.

    Returns:
        Whatever ``classifier.classify`` returns for the feature vector.
    """
    tokens = preProcess(reviewSample[1])
    features = toFeatureVector(tokens)
    return classifier.classify(features)

def flatten(lst):
    """Yield the items of ``lst`` with one level of list nesting removed.

    Items that are lists are replaced by their own elements; every other item
    (including tuples and lists nested deeper than one level) is yielded
    unchanged.
    """
    for item in lst:
        unwrapped = item if isinstance(item, list) else [item]
        yield from unwrapped

In [9]:
# MAIN

# loading reviews
rawData = []          # the parsed (id, text, label) rows from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the featurized samples selected for training (all of them with splitData(1) below)
testData = []         # the featurized samples held out for testing (none with splitData(1) below)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# percentage=1 sends every sample to trainData; the evaluation below is a
# 10-fold cross-validation over all of it rather than a held-out test set
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(1)

ground_truth, predictions = crossValidate(trainData, 10)
# crossValidate returns one list per fold; merge them into flat label lists
ground_truth = list(flatten(ground_truth))
predictions = list(flatten(predictions))

# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
print('Accuracy: ' + str(round(100*accuracy_score(ground_truth, predictions), 2)) + '%')
print('Confusion Matrix: ')
# Passing the labels explicitly fixes the matrix at 2x2 with 'real' as the
# positive class, so the four-way unpacking cannot fail when one class is
# absent from the labels.
tn, fp, fn, tp = confusion_matrix(ground_truth, predictions, labels=[fakeLabel, realLabel]).ravel()
print('TP: ' + str(tp))
print('TN: ' + str(tn))
print('FP: ' + str(fp))
print('FN: ' + str(fn))


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Now 21000 rawData, 21000 trainData, 0 testData
Training Samples: 
21000
Features: 
46235
Accuracy: 65.06%
Confusion Matrix: 
TP: 7193
TN: 6469
FP: 4031
FN: 3307


In [None]:
# predictions = predictLabels(testData, classifier)

In [None]:
# trainData[0][1]