In [None]:
import spacy
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Data Collection

In [None]:
def readReviews(path, encoding='utf-8'):
    """Read every review file in ``path`` into a single column of strings.

    Parameters
    ----------
    path : str
        Directory whose regular files each hold the text of one review.
    encoding : str, optional
        Text encoding used to decode the files. Defaults to UTF-8
        (assumes the dataset is UTF-8 encoded -- TODO confirm).

    Returns
    -------
    numpy.ndarray
        String array of shape ``(n_files, 1)``. Row ``i`` holds the complete
        text of the ``i``-th file in sorted filename order. A directory with
        no files yields shape ``(0, 1)``.
    """
    arr = []
    # os.listdir returns names in arbitrary order; sort so row order is reproducible.
    for filename in tqdm(sorted(os.listdir(path))):
        fullpath = os.path.join(path, filename)
        if not os.path.isfile(fullpath):
            # Sub-directories and other non-file entries cannot be opened as text.
            continue
        # Keep the whole file as ONE string. readlines() produced one element
        # per line, so a multi-line or empty file made the rows ragged and the
        # resulting array was no longer (n, 1).
        with open(fullpath, encoding=encoding) as f:
            arr.append([f.read()])

    # NOTE: this is a fixed-width unicode array, so memory grows with the
    # longest review. reshape keeps the (n, 1) layout even when arr is empty.
    return np.array(arr, dtype=str).reshape(-1, 1)

In [None]:
# Positive training reviews: one row per file in the directory.
posPath = 'imdb/train/pos2'
posArr = readReviews(path=posPath)

In [None]:
# Negative training reviews: one row per file in the directory.
negPath = 'imdb/train/neg2'
negArr = readReviews(path=negPath)

In [None]:
# Report how many reviews were loaded for each class (positive, then negative).
for class_arr in (posArr, negArr):
    print(class_arr.shape)

# Stack the positive rows above the negative rows; the label vector built
# later follows the same order.
features = np.concatenate([posArr, negArr], axis=0)
print(features.shape)

In [None]:
# Labels: 1.0 for every positive review, 0.0 for every negative review,
# concatenated in the same order as the feature rows (positives first).
posLabels = np.ones(posArr.shape[0])
negLabels = np.zeros(negArr.shape[0])
labels = np.concatenate([posLabels, negLabels])
print(labels.shape)

In [None]:
# Build the training frame from the review text plus an explicit label column.
# posArr/negArr carry only the text column (the np.insert that would have added
# labels is commented out), so passing them with columns=['review', 'label']
# raises a shape-mismatch error. DataFrame.append was also removed in
# pandas 2.x, hence pd.concat.
reviews_df = pd.concat(
    [
        pd.DataFrame({'review': posArr[:, 0], 'label': 1}),
        pd.DataFrame({'review': negArr[:, 0], 'label': 0}),
    ],
    ignore_index=True,  # one 0..n-1 index instead of two overlapping ranges
)
reviews_df.head()

In [None]:
# Positive test reviews: one row per file in the directory.
posPathTest = 'imdb/test/pos'
posArrTest = readReviews(path=posPathTest)

In [None]:
# Negative test reviews: one row per file in the directory.
negPathTest = 'imdb/test/neg'
negArrTest = readReviews(path=negPathTest)

In [None]:
# Add a label column (1 = positive, 0 = negative) next to the review text.
# The guard on the column count makes the cell safe to re-run: without it,
# every execution inserted another label column. Because the arrays hold
# strings, the inserted labels are stored as the strings '1' and '0'.
if posArrTest.shape[1] == 1:
    posArrTest = np.insert(posArrTest, 1, 1, axis=1)
if negArrTest.shape[1] == 1:
    negArrTest = np.insert(negArrTest, 1, 0, axis=1)

In [None]:
# Assemble the test frame from the text column (column 0) and an explicit
# integer label. Reading only column 0 works however many label columns were
# inserted into the arrays, and gives numeric labels instead of the strings
# '1'/'0' that np.insert stored in the string arrays. DataFrame.append was
# removed in pandas 2.x, hence pd.concat.
test_df = pd.concat(
    [
        pd.DataFrame({'review': posArrTest[:, 0], 'label': 1}),
        pd.DataFrame({'review': negArrTest[:, 0], 'label': 0}),
    ],
    ignore_index=True,  # one 0..n-1 index instead of two overlapping ranges
)
test_df.head()

# Preprocessing

In [None]:
# One-time setup: uncomment the next line to download the spaCy English model loaded below.
#!python -m spacy download en_core_web_sm

In [None]:
# Load spaCy's small English pipeline; it is used for tokenizing and tagging below.
nlp = spacy.load(name='en_core_web_sm')

In [None]:
# Read one sample negative review to experiment with the spaCy pipeline.
# `os` is already imported at the top of the notebook and is not needed here.
# Decode explicitly so the text does not depend on the platform's default
# encoding (assumes the dataset files are UTF-8 -- TODO confirm).
path = 'imdb/train/neg/0_3.txt'
with open(path, encoding='utf-8') as f:
    lines = f.readlines()

In [None]:
# The pipeline is already loaded into `nlp` earlier in the notebook; loading
# the model a second time is slow and changes nothing. Show the active
# pipeline components instead.
nlp.pipe_names

In [None]:
# Run the first line of the sample review through the spaCy pipeline.
first_line = lines[0]
text_sentences = nlp(first_line)

In [None]:
# Flatten the parsed review (`text_sentences`, produced in the previous cell)
# into one record per token, keeping the attributes used for filtering below.
# The bare `lines` expression and the repeated nlp(...) call were removed: the
# former displayed nothing because it was not the cell's last expression, and
# the latter re-parsed the same text.
d = [
    {
        'TEXT': token.text,
        'LEMMA': token.lemma_,
        'POS': token.pos_,
        'ALPHA': token.is_alpha,
        'STOP': token.is_stop,
    }
    for sentence in text_sentences.sents
    for token in sentence
]

# Naming the columns keeps them present even if the parse produced no tokens.
spacy_dataframe = pd.DataFrame(d, columns=['TEXT', 'LEMMA', 'POS', 'ALPHA', 'STOP'])

In [None]:
# Keep only content tokens: drop stop words, proper nouns and punctuation.
keep_mask = ~spacy_dataframe['STOP'] & ~spacy_dataframe['POS'].isin(['PROPN', 'PUNCT'])
spacy_dataframe.loc[keep_mask]

# Modeling

In [None]:
# Peek at the review-text column of the training frame.
reviews_df['review']

In [None]:
# Sanity check: the text column and the label vector should have equal length.
for checked in (features[:, 0], labels):
    print(checked.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the 0.0/1.0 class labels as class ids; `y` is used to stratify the
# train/test split further down.
lbl_enc = LabelEncoder()
y = lbl_enc.fit(labels).transform(labels)

In [None]:
# Show a sample of the encoded labels together with the classes the encoder
# found. A bare `y` above the last line was never displayed, because only a
# cell's final expression is rendered.
y[:10], lbl_enc.classes_

### Option 1: Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the raw review text rebuilt from posArr/negArr rather than reading
# it from `features`: this cell rebinds `features` to the sparse count matrix,
# so `features[:, 0]` is no longer text once a vectorizer cell has run, and
# re-running the cell would fail. Row order (positives, then negatives)
# matches `labels`.
review_texts = np.concatenate((posArr, negArr), axis=0)[:, 0]
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(review_texts)

### Option 2: TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rebuild the raw text from posArr/negArr: by the time this cell runs, the
# count-vectorizer cell has already replaced `features` with a sparse matrix,
# so `features[:, 0]` would hand the vectorizer a sparse column instead of
# strings. Row order (positives, then negatives) matches `labels`.
review_texts = np.concatenate((posArr, negArr), axis=0)[:, 0]

# use_idf / smooth_idf / sublinear_tf are boolean switches; newer scikit-learn
# releases validate parameter types and reject the ints (1) used previously.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')
features = tfv.fit_transform(review_texts)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows, shuffled with a fixed seed and stratified on the
# encoded labels.
split_options = dict(test_size=0.3, shuffle=True, stratify=y, random_state=42)
xtrain, xtest, ytrain, ytest = train_test_split(features, labels, **split_options)

In [None]:
# Split sizes: the training shape is printed, the test shape is the cell result.
train_shape = xtrain.shape
print(train_shape)
xtest.shape

# Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): xtrain/xtest come from whichever vectorizer cell last assigned
# `features` before the split. Confirm that this is the representation this
# model is meant to evaluate.
model1 = MultinomialNB().fit(xtrain, ytrain)
# Per-class probabilities for the held-out rows, one column per class.
pred1 = model1.predict_proba(xtest)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# Column 1 of predict_proba is scored against the labels. Accuracy uses a 0.5
# cut-off; AUC and the ROC curve use the raw scores.
pos_scores = pred1[:, 1]
acc = accuracy_score(ytest, pos_scores > 0.5)
auc = roc_auc_score(ytest, pos_scores)
fpr, tpr, thr = roc_curve(ytest, pos_scores)

In [None]:
# Report the two summary metrics computed in the previous cell.
for metric_name, metric_value in (('Accuracy: ', acc), ('ROC AUC: ', auc)):
    print(metric_name, metric_value)

In [None]:
import matplotlib.pyplot as plt

# ROC curve drawn through the explicit Axes handle.
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr)
# Only y-values are given, so x defaults to 0, 1: the dashed chance diagonal.
ax.plot([0, 1], ls="--")
# Grey guide lines: the left edge (x = 0) and the top edge (y = 1).
ax.plot([0, 0], [1, 0], c=".7")
ax.plot([1, 1], c=".7")
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Use the same 0.5 cut-off as the accuracy above and as the second model's
# confusion matrix. The previous 0.7 threshold meant this matrix matched
# neither the reported accuracy nor the matrix it is compared against.
predicted_positive = pred1[:, 1] > 0.5
confusion_matrix(ytest, predicted_positive)

In [None]:
# Column 1 of the predicted probabilities for every held-out row.
positive_proba = pred1[:, 1]
positive_proba

### Model with TF-IDF

In [None]:
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): this fits on the same xtrain/ytrain as model1, since no new
# split happens in between. For a TF-IDF model, the split must have been made
# from the TF-IDF `features` -- confirm.
model2 = MultinomialNB().fit(xtrain, ytrain)
# Per-class probabilities for the held-out rows, one column per class.
pred2 = model2.predict_proba(xtest)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# Column 1 of predict_proba is scored against the labels. Accuracy uses a 0.5
# cut-off; AUC and the ROC curve use the raw scores.
pos_scores = pred2[:, 1]
acc = accuracy_score(ytest, pos_scores > 0.5)
auc = roc_auc_score(ytest, pos_scores)
fpr, tpr, thr = roc_curve(ytest, pos_scores)

In [None]:
# Report the two summary metrics computed in the previous cell.
for metric_name, metric_value in (('Accuracy: ', acc), ('ROC AUC: ', auc)):
    print(metric_name, metric_value)

In [None]:
import matplotlib.pyplot as plt

# ROC curve drawn through the explicit Axes handle.
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr)
# Only y-values are given, so x defaults to 0, 1: the dashed chance diagonal.
ax.plot([0, 1], ls="--")
# Grey guide lines: the left edge (x = 0) and the top edge (y = 1).
ax.plot([0, 0], [1, 0], c=".7")
ax.plot([1, 1], c=".7")
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix at the 0.5 cut-off on column 1 of the predicted probabilities.
predicted_positive = pred2[:, 1] > .5
confusion_matrix(ytest, predicted_positive)