In [66]:
import csv
from random import shuffle

import numpy
from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Module-level bag-of-words vectorizer built with min_df=1.
# NOTE(review): no other cell visible in this notebook references `vectorizer` - confirm it is still needed.
vectorizer = CountVectorizer(min_df=1)

In [10]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read a tab-separated review file and append each parsed row to ``rawData``.

    The first row is treated as a header and skipped. Every remaining row is
    converted by ``parseReview`` into an ``(id, text, label)`` triple and
    appended to the module-level ``rawData`` list, which must already exist.

    Args:
        path: location of the tab-separated file.
        Text: ignored; kept only so existing calls that pass it keep working.

    Returns:
        None. The result is the side effect on ``rawData``.
    """
    # The csv module expects files opened with newline='' so that it can do
    # its own newline handling (needed for newlines inside quoted fields).
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the headers
        for line in reader:
            reviewId, reviewText, label = parseReview(line)
            rawData.append((reviewId, reviewText, label))
#             preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    """Fill ``trainData`` and ``testData`` from ``rawData``.

    ``rawData`` is cut into two halves. From the start of each half,
    ``percentage`` of that half's share goes to ``trainData``; the rest of
    each half goes to ``testData``. Every review is run through
    ``preProcess`` and ``toFeatureVector`` before being stored together with
    its label.

    NOTE(review): the half-by-half slicing presumably assumes the file lists
    one class in each half - confirm against the dataset.
    """
    total = len(rawData)
    midpoint = total // 2
    perHalf = int((percentage * total) / 2)

    def featurize(review):
        return toFeatureVector(preProcess(review))

    trainRows = rawData[:perHalf] + rawData[midpoint:midpoint + perHalf]
    for _, review, label in trainRows:
        trainData.append((featurize(review), label))

    testRows = rawData[perHalf:midpoint] + rawData[midpoint + perHalf:]
    for _, review, label in testRows:
        testData.append((featurize(review), label))

In [9]:
# QUESTION 1
def parse_label(label):
    """Map a raw dataset label to a class name.

    Returns 'real' for the exact string '__label2__' and 'fake' for every
    other value.
    """
    return 'real' if label == '__label2__' else 'fake'

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    """Pick the id, review text and class label out of one parsed row.

    Args:
        reviewLine: an indexable row; column 0 is the id, column 1 the raw
            label and column 8 the review text.

    Returns:
        A ``(id, text, label)`` tuple. The id and text are returned exactly
        as found in the row (no type conversion is applied); the label is the
        raw label translated by ``parse_label``.
    """
    reviewId = reviewLine[0]
    reviewText = reviewLine[8]
    label = parse_label(reviewLine[1])
    return reviewId, reviewText, label

In [11]:
from __future__ import unicode_literals

import re, string
import nltk
from nltk.corpus.reader.wordnet import NOUN
from nltk.corpus import wordnet
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stopwords from NLTK, held in a set for fast membership tests.
stop = set(stopwords.words('english'))
# English Snowball stemmer.
# NOTE(review): no active (uncommented) code in the visible cells uses `s`.
s = nltk.stem.SnowballStemmer('english')
# WordNet lemmatizer shared by the preprocessing code.
l = WordNetLemmatizer()
# Translation table that deletes every character listed in string.punctuation.
translator=str.maketrans('','',string.punctuation)

# Input: a string of one review
def preProcess(text):
    """Turn one review string into a list of lemmatised, stopword-free tokens.

    Steps, in order:
      1. Insert a space between a word character and adjacent punctuation so
         that words glued together by punctuation become separate tokens.
      2. Drop every character that is not an ASCII letter, digit or whitespace.
      3. Lower-case, tokenise with ``word_tokenize`` and lemmatise each token.
      4. Remove tokens found in the ``stop`` set.

    Args:
        text: the raw review text.

    Returns:
        A list of token strings.
    """
    # Punctuation is matched with the explicit ranges '!'..'/' and ':'..'?'.
    # The earlier class contained the accidental range "!-?", which also
    # covered the digits 0-9 and therefore split numbers into single digits.
    text = re.sub(r"(\w)([!-/:-?”])", r"\1 \2", text)
    text = re.sub(r"([!-/:-?“])(\w)", r"\1 \2", text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # After the substitution above no punctuation is left; kept as a cheap safety net.
    text = text.translate(translator)
    text = text.lower()
    tokens = [l.lemmatize(t) for t in word_tokenize(text)]
    return [t for t in tokens if t not in stop]

In [119]:
# QUESTION 2
featureDict = {} # Global token -> total occurrence count, updated by toFeatureVector

def toFeatureVector(tokens):
    """Build a relative-frequency feature dictionary for one token list.

    Each distinct token maps to the sum of ``1.0 / len(tokens)`` over its
    occurrences. As a side effect, the global ``featureDict`` counter is
    incremented once per token occurrence.

    Args:
        tokens: a sized iterable of hashable tokens.

    Returns:
        dict mapping token -> accumulated weight; empty for empty input.
    """
    weights = {}
    for token in tokens:
        featureDict[token] = featureDict.get(token, 0) + 1
        weights[token] = weights.get(token, 0.0) + (1.0 / len(tokens))
    return weights

In [67]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Train a linear SVM (hinge loss, L2 penalty, fitted by SGD) on the given samples.

    ``SGDClassifier`` comes from ``sklearn.linear_model`` and has to be
    imported in the notebook's import cell for this function to run.

    Args:
        trainData: list of ``(feature_dict, label)`` pairs, handed unchanged
            to ``SklearnClassifier.train``.

    Returns:
        The value returned by ``SklearnClassifier(...).train(trainData)``.
    """
    print("Training Classifier...")
    svm = SGDClassifier(loss='hinge', penalty='l2')
    pipeline = Pipeline([('svc', svm)])
    return SklearnClassifier(pipeline).train(trainData)

# text_clf_svm = Pipeline([('vect', CountVectorizer()),
# ...                      ('tfidf', TfidfTransformer()),
# ...                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
# ...                                            alpha=1e-3, n_iter=5, random_state=42)),
# ... ])

In [117]:
# QUESTION 3

def crossValidate(dataset, folds):
    """Run k-fold cross-validation and collect labels and predictions per fold.

    The samples are shuffled (on a copy, so the caller's list keeps its
    order), preprocessed once, and then cut into exactly ``folds`` contiguous
    folds whose sizes differ by at most one. For each fold a classifier is
    trained on the remaining folds and used to label the held-out fold.

    Note that ``toFeatureVector`` is applied to every sample, so the global
    ``featureDict`` is updated for the whole dataset.

    Args:
        dataset: sequence of ``(id, text, label)`` triples.
        folds: number of folds; must be between 2 and ``len(dataset)``.

    Returns:
        ``(ground_truth, predictions)``: two lists with one entry per fold,
        holding the true labels and the output of ``predictLabels``.

    Raises:
        ValueError: if ``folds`` is smaller than 2 or larger than the dataset.
    """
    if folds < 2 or folds > len(dataset):
        raise ValueError(
            "folds must be between 2 and the number of samples (%d), got %r"
            % (len(dataset), folds))

    # Shuffle a copy: other code slices the original list by position.
    samples = list(dataset)
    shuffle(samples)
    #preProcess and tokenize once!
    samples = [(sampleId, toFeatureVector(preProcess(text)), label)
               for (sampleId, text, label) in samples]

    total = len(samples)
    predictions = []
    ground_truth = []
    for k in range(folds):
        # Integer boundaries give exactly `folds` folds and spread any
        # remainder instead of producing an extra, undersized fold.
        lo = (k * total) // folds
        hi = ((k + 1) * total) // folds
        validationFold = samples[lo:hi]
        trainFolds = samples[:lo] + samples[hi:]

        training_set = [(features, label) for (_, features, label) in trainFolds]
        classifier = trainClassifier(training_set)
        predictions.append(predictLabels(validationFold, classifier))
        ground_truth.append([sample[2] for sample in validationFold])

    return ground_truth, predictions

In [115]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Classify many samples at once.

    Args:
        reviewSamples: iterable of indexable samples whose element at index 1
            is the feature dictionary.
        classifier: object providing ``classify_many``.

    Returns:
        Whatever ``classifier.classify_many`` returns for the lazily
        extracted feature dictionaries.
    """
    featureDicts = (sample[1] for sample in reviewSamples)
    return classifier.classify_many(featureDicts)

def predictLabel(reviewSample, classifier):
    """Classify a single raw review string.

    The text is passed through ``preProcess`` and ``toFeatureVector`` (which
    also updates the global ``featureDict``) and the resulting feature
    dictionary is handed to ``classifier.classify``.
    """
    tokens = preProcess(reviewSample)
    features = toFeatureVector(tokens)
    return classifier.classify(features)

In [63]:
def flatten(lst):
    """Yield the items of ``lst``, expanding direct ``list`` elements by one level.

    Only elements that are ``list`` instances are expanded; deeper nesting
    and other iterables (tuples, strings, ...) are yielded unchanged.
    """
    for item in lst:
        # Wrap non-list items in a 1-tuple so both cases share one `yield from`.
        yield from (item if isinstance(item, list) else (item,))

In [68]:
# MAIN
# Loads the review file, runs 10-fold cross-validation over it, prints the
# accuracy and a classification report, and reports the elapsed wall time.
import time
start_time = time.time()

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# loading reviews
rawData = []          # (id, text, label) triples filled by loadData (21000 samples in the recorded run)
# preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # only filled by splitData, which this cell never calls, so it stays empty here
testData = []         # only filled by splitData, which this cell never calls, so it stays empty here

# the output classes
# NOTE(review): neither constant is referenced elsewhere in the visible cells.
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# Report the list sizes before loading (all zero at this point)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
# Parse the dataset file into rawData
loadData(reviewPath) 
# Report the list sizes again now that rawData is populated
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')

# 10-fold cross-validation; the per-fold lists are flattened into flat label sequences for scoring
ground_truth, predictions = crossValidate(rawData, 10)
ground_truth = list(flatten(ground_truth))
predictions = list(flatten(predictions))

# Print list sizes and the number of distinct features seen so far
# NOTE(review): "Training Samples" prints len(trainData), which is 0 in this flow because splitData is not called.
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')
# Accuracy as a percentage rounded to two decimals, followed by the per-class report
print('Accuracy: ' + str(round(100*accuracy_score(ground_truth, predictions), 2)))
print(metrics.classification_report(ground_truth, predictions))

# Elapsed wall-clock time for the whole cell
print("--- %s seconds ---" % (time.time() - start_time))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing the dataset...


NameError: name 'crossValidate' is not defined

In [60]:
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(twenty_train.data)
# X_train_counts.shape

In [61]:
# rawData = []
# reviewPath = 'amazon_reviews.txt'
# loadData(reviewPath) 


In [23]:
# Display the current contents of `vector`.
# NOTE(review): among the cells visible here, `vector` is only assigned further down, so this cell depends on out-of-order execution.
vector

['When least you think so, this product will save the day. Just keep it around just in case you need it for something.']

In [59]:
# Scratch experiment: bag-of-words counts and TF-IDF weights for a single review.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# One-document corpus holding the text field (index 1) of the first loaded review
vector = []
vector.append(rawData[0][1])

# Bare expression in the middle of the cell; only the cell's last expression is echoed by default
rawData

# Term counts for the one-document corpus
X_train_counts = count_vect.fit_transform(vector)
X_train_counts.shape

# Re-weight the raw counts with TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

# Dense view of the TF-IDF matrix; this is the value the cell displays
X_train_tfidf.todense()

# X_train_counts.toarray()
# tfidf_matrix =  X_train_tfidf.fit_transform(vector)
# feature_names = X_train_tfidf.get_params

# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(vector)
# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

matrix([[ 0.18569534,  0.18569534,  0.18569534,  0.18569534,  0.18569534,
          0.37139068,  0.37139068,  0.18569534,  0.18569534,  0.18569534,
          0.18569534,  0.18569534,  0.18569534,  0.18569534,  0.18569534,
          0.18569534,  0.18569534,  0.18569534,  0.18569534,  0.37139068]])

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a word-level, unigram TF-IDF vectorizer (English stopwords removed) on `vector`.
# min_df=1 keeps every term, exactly like the previous min_df=0, but is also
# accepted by scikit-learn releases that validate integer min_df as >= 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(vector)
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old method on older installations.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out().tolist()
else:
    feature_names = tf.get_feature_names()

In [44]:
# Show the vocabulary extracted by the TF-IDF vectorizer, then the text it was fitted on
print(feature_names)
print(vector)

['case', 'day', 'just', 'need', 'product', 'save', 'think']
['When least you think so, this product will save the day. Just keep it around just in case you need it for something.']
