In [2]:
import csv                               # csv reader
import re                                       # regular expressions
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
from random import shuffle
import numpy as np

In [3]:
from collections import Counter

In [3]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\FiercePC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "DOC_ID":
                continue
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))


def splitData(percentage):
    # A method to split the data between trainData and testData 
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))
        
        

In [5]:


# the output classes
fakeLabel = 'fake'
realLabel = 'real'
labelMap = {'__label1__' : fakeLabel, '__label2__' : realLabel}

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    
    reviewId    = int(reviewLine[0])
    reviewText  = reviewLine[8]
    reviewLabel = labelMap[reviewLine[1]]
    
    return (reviewId, reviewText, reviewLabel)


In [6]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# input: a string of one review
def preProcess(text):
    # should return a list of tokens
    
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    # lowercasing
    tokens = [t.lower() for t in tokens]

    # stopword removal
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop]
    
    # lemmatisation
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(t) for t in tokens]
    
    return tokens

print(preProcess("hello this is the, ehh... presumably, a crying situations!"))

['hello', 'ehh', 'presumably', 'cry', 'situation']


In [7]:
# QUESTION 2

def toFeatureVector(words):
    # return a dictionary 'featureVect' where the keys are the tokens in 'words' and the values are the number of occurrences of the tokens
    # start by using binary values only:
    counts = Counter(words)
    return {w: counts[w]/sum(counts.values()) for w in counts.keys()}#{w: 1.0/len(words) for w in words}

In [8]:
toFeatureVector(["a", "a", "b", "c"])

{'a': 0.5, 'b': 0.25, 'c': 0.25}

In [9]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    return SklearnClassifier(LinearSVC(loss='squared_hinge')).train(trainData)



In [10]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(text, classifier):
    return classifier.classify(toFeatureVector(preProcess(text)))


In [11]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    results = []
    foldSize = int(len(dataset)/folds)
    
    for i in range(0,len(dataset),int(foldSize)):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("Fold start on items %d - %d" % (i, i+foldSize))
        myTestData = dataset[i:i+foldSize]
        myTrainData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(myTrainData)
        y_true = [x[1] for x in myTestData]
        y_pred = predictLabels(myTestData, classifier)
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
        
    print(zip(*results))
    avgResults = [np.mean([x[0] for x in results]),
                   np.mean([x[1] for x in results]),
                   np.mean([x[2] for x in results])
                ]
    return avgResults

In [13]:
# MAIN

# loading reviews
rawData = [] # the filtered data from the dataset file (should be 21000 samples)
trainData = [] # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = [] # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# references to the data files
reviewPath = 'amazon_reviews.txt'

# do the actual stuff
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing the dataset...")
loadData(reviewPath)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing training and test data...")
splitData(0.8)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData


In [14]:
cv_results = crossValidate(trainData, 10)
print(cv_results)

Fold start on items 0 - 1680
Training Classifier...
Fold start on items 1680 - 3360
Training Classifier...
Fold start on items 3360 - 5040
Training Classifier...
Fold start on items 5040 - 6720
Training Classifier...
Fold start on items 6720 - 8400
Training Classifier...
Fold start on items 8400 - 10080
Training Classifier...
Fold start on items 10080 - 11760
Training Classifier...
Fold start on items 11760 - 13440
Training Classifier...
Fold start on items 13440 - 15120
Training Classifier...
Fold start on items 15120 - 16800
Training Classifier...
<zip object at 0x126a2d2d0>
[0.653992757558385, 0.653154761904762, 0.6529181515047029]


In [15]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

({'assortment': 0.09090909090909091, 'really': 0.09090909090909091, 'hershey': 0.09090909090909091, 'best': 0.09090909090909091, 'little': 0.09090909090909091, 'one': 0.09090909090909091, 'always': 0.09090909090909091, 'excited': 0.09090909090909091, 'whenever': 0.09090909090909091, 'holiday': 0.09090909090909091, 'come': 0.09090909090909091}, 'fake')
Training Classifier...
Done training!
Precision: 0.602309
Recall: 0.601905
F Score:0.601511
