In [24]:
import pandas as pd                       
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import numpy as np
import nltk
import abc
import pickle
from nltk.corpus import stopwords
from collections import defaultdict
from collections import Counter
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Untr0nix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Untr0nix\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
class PreProcessor(abc.ABC):
    @abc.abstractmethod
    def apply(self,obj):
        pass

class SimpleTokenizer(PreProcessor):
    def apply(self,text):
        return word_tokenize(text)
    
class Lemmatization(PreProcessor):
    
    def __init__(self,lemmatizer):
        self._lemmatizer = lemmatizer
        self.table = str.maketrans({key: None for key in string.punctuation})
    
    def apply(self,text):
        filtered_tokens = []
        stop_words = set(stopwords.words('english'))
        text = text.translate(self.table)
        for word in text.split():
            if word not in stop_words:
                filtered_tokens.append(self._lemmatizer.lemmatize(word.lower()))
        return filtered_tokens

class AdvancedLemmatization(PreProcessor):
    def __init__(self,lemmatizer):
        self._lemmatizer = lemmatizer
        self.table = str.maketrans({key: None for key in string.punctuation})
    
    def apply(self,text):
        filtered_tokens = []
        lemmatized_tokens = []
        stop_words = set(stopwords.words('english'))
        text = text.translate(self.table)
        for word in text.split():
            if word not in stop_words:
                lemmatized_tokens.append(self._lemmatizer.lemmatize(word.lower()))
            filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens
        return filtered_tokens


In [3]:
class Encoder(abc.ABC):
    
    @abc.abstractmethod
    def encode(self,item):
        pass

class BinaryEncoder(Encoder):
    def __init__(self):
        self._features = Counter()
    def encode(self,tokens): 
        self._features += Counter(tokens)
        return Counter(tokens)
            


In [4]:
class DataSet:
    
    def __init__(self,path,label_column = 'LABEL',delimiter = '\t',mapping = None,transformators = []):
        self._data = pd.read_csv(path,delimiter = delimiter)
        if mapping is not None:
            self._data[label_column] = self._data[label_column].map(mapping)
        self.raw_data = []
        self.preprocessed = []
        self._transformators = transformators
    
    def extract_item(self,position,args):
        return tuple(self._data[args].iloc[position])
    
    def transform(self,args,apply_to):
        self.index = apply_to
        for idx in range(self._data.shape[0]):
            items = self.extract_item(idx,args)
            self.raw_data.append(items)
            transformed = items[apply_to]
            for transformer in self._transformators:
                transformed = transformer.apply(transformed)
            temp_items = list(items)
            temp_items[apply_to] = transformed
            self.preprocessed.append(tuple(temp_items))


            
                        
            

In [27]:
class Classification:
    def __init__(self,dataset,model = LinearSVC,**kwargs):
        self._dataset = dataset
        self._model = SklearnClassifier(Pipeline([('ml_model',model(**kwargs))]))
        self.train_data = []
        self.test_data = []
        
    def split(self,ratio,vectorizer):
        raw_length = len(self._dataset.raw_data)
        middle = int(raw_length/2)
        traning_number = int(ratio*raw_length/2)
        for item in self._dataset.preprocessed[:traning_number] + self._dataset.preprocessed[middle:middle+traning_number:]:
            self.train_data.append((vectorizer.encode(item[self._dataset.index]),item[-1]))
        for item in self._dataset.preprocessed[traning_number:middle] + self._dataset.preprocessed[middle+traning_number:]:
            self.test_data.append((vectorizer.encode(item[self._dataset.index]),item[-1]))
    
    def train(self,train_data):
        return self._model.train(train_data)
    
    def predict(self,review,vectorizer = BinaryEncoder(),tokenizer = SimpleTokenizer()):
        return self._model.classify(vectorizer.encode(tokenizer.apply(review)))
    
    def load(self,path):
        with open(path,'rb') as f:
            self._model = pickle.load(f)
    
    def save(self,path):
        with open(path,'rb') as f:
            pickle.dump(self._model,f)
            
    def predict_many(self,reviews):
        return self._model.classify_many(map(lambda t:t[0],reviews))
    
    def cross_validation(self,folds):
        shuffle(self.train_data)
        self.cv_ = []
        fold = int(len(self.train_data)/folds)
        for idx in range(0,len(self.train_data),fold):
            clf = self.train(self.train_data[:idx] + self.train_data[fold+idx:])
            y_predicted = self.predict_many(self.train_data[idx:idx +fold])
            a = accuracy_score(list(map(lambda d:d[1],self.train_data[idx:idx+fold])),y_predicted)
            (p,r,f,_) = precision_recall_fscore_support(list(map(lambda d : d[1], self.train_data[idx:idx+fold])), y_predicted, average ='macro')
            print(p,r,f)
            self.cv_.append((a,p,r,f))
        self.cv_ = (np.mean(np.array(self.cv_),axis =  0))
        

In [132]:
reader = DataSet('amazon_reviews.txt',mapping = {'__label1__':'FAKE','__label2__':'NOT FAKE'},transformators= [SimpleTokenizer()])
reader.transform(['DOC_ID','REVIEW_TEXT','LABEL'],apply_to = 1)

In [135]:
clf = Classification(reader)
clf.split(0.8,BinaryEncoder())
clf.train(clf.train_data)



<SklearnClassifier(Pipeline(memory=None,
         steps=[('ml_model',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False))>

In [136]:
clf.cross_validation(10)

0.6087931951089846 0.6087979765901461 0.6087953700221765
0.6356770274285568 0.6352553345880859 0.6348798514900102
0.6303119118347354 0.6303623507927709 0.6302993771348202
0.6354958909703435 0.635526636751967 0.6355076574021554
0.6299019607843137 0.629796654176044 0.6296984034960644
0.6024791325022272 0.6023809523809525 0.602285694844712
0.6164687414687415 0.6161417657768022 0.6153535353535353
0.6312948340158097 0.631608783864423 0.6312077562975233
0.6320061847700038 0.6316917702567826 0.6316730523627077
0.619773733398918 0.6185093060180856 0.6182905982905984


In [137]:
clf.cv_

array([0.62416667, 0.62422026, 0.62400715, 0.62379913])

## With Lemmatization and removing all non important word

In [7]:
reader = DataSet('amazon_reviews.txt',mapping = {'__label1__':'FAKE','__label2__':'NOT FAKE'},transformators= [Lemmatization(WordNetLemmatizer())])
reader.transform(['DOC_ID','REVIEW_TEXT','LABEL'],apply_to = 1)

In [8]:
clf_1 = Classification(reader)
clf_1.split(0.8,BinaryEncoder())
clf_1.train(clf_1.train_data)
clf_1.cross_validation(10)



0.6356968978216091 0.6355791908563135 0.6355650202534768
0.605407583775255 0.6049230381060244 0.6046903098658132
0.592062367115521 0.5924313062486124 0.5913863818151907
0.6173167701863354 0.617119070700886 0.6170338130100871
0.6056015353073678 0.6051103368176538 0.605011372047068
0.6184291367687393 0.6182987119013063 0.618267222160976
0.5757824389316821 0.575732492743106 0.5755614021100748
0.591252108045494 0.5911899722523279 0.5904669040114288
0.6067826086956521 0.606597826518291 0.6063957500608759
0.5907653905419312 0.5906597151230151 0.5906915426348769


In [9]:
clf_1.cv_

array([0.60386905, 0.60390968, 0.60376417, 0.60350697])

In [13]:
reader = DataSet('amazon_reviews.txt',mapping = {'__label1__':'FAKE','__label2__':'NOT FAKE'},transformators= [AdvancedLemmatization(WordNetLemmatizer())])
reader.transform(['DOC_ID','REVIEW_TEXT','LABEL'],apply_to = 1)

In [16]:
clf_2 = Classification(reader,model = LinearSVC,C = 0.01)
clf_2.split(0.8,BinaryEncoder())
clf_2.train(clf_2.train_data)
clf_2.cross_validation(10)

0.6341472461254023 0.633290998031125 0.6323148352110263
0.6534419937850231 0.6533758639021797 0.6533940976037911
0.624016588400317 0.6239322175151052 0.6237753991291727
0.6486797527975194 0.6482142857142857 0.6479387389010176
0.6674568879854184 0.6673685046877483 0.6674041601444144
0.6286481204098442 0.6281958782614365 0.6280648672582698
0.6520787963905363 0.651628104737271 0.6514645192795656
0.6398346179368077 0.6400570328165918 0.6392345854004252
0.6452840909090909 0.645165319284222 0.6440395473820268
0.6516922386516624 0.6514300903774588 0.6514388079689573


In [17]:
clf_2.cv_

array([0.64416667, 0.64452803, 0.64426583, 0.64390696])

In [18]:
df = pd.read_csv('amazon_reviews.txt',delimiter = '\t')
df.columns

Index(['DOC_ID', 'LABEL', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY',
       'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE', 'REVIEW_TEXT'],
      dtype='object')

In [22]:
reader = DataSet('amazon_reviews.txt',label_column = 'RATING',transformators = [AdvancedLemmatization(WordNetLemmatizer())])
reader.transform(['DOC_ID','REVIEW_TEXT','RATING'],apply_to = 1)

In [23]:
clf_3 = Classification(reader,model = LinearSVC,C = 0.01)
clf_3.split(0.8,BinaryEncoder())
clf_3.train(clf_3.train_data)
clf_3.cross_validation(10)

0.46515737563048 0.3699837286162926 0.38123755350363986
0.46424732449715894 0.3635583592712607 0.3755071268482704
0.45457913591352145 0.36596005414707333 0.3755332964123859
0.4598371427333176 0.3541408044050809 0.3678055467312412
0.4605971016738466 0.3716027018339407 0.38141595410330076
0.4658195189167049 0.35375705212145875 0.3616779867640376
0.4379074351368887 0.37229046026722046 0.3756596569331819
0.43923881626989497 0.33842346166888115 0.3456987911002357
0.47728855033986156 0.37987982051396496 0.39270951579438984
0.47410618354166745 0.3737015889770465 0.3850461981685275


In [26]:
clf_3.cv_

array([0.61375   , 0.45987786, 0.3643298 , 0.37422916])