## Importing libraries

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

## Importing dataset

In [86]:
pos_rev = open('../input/positive.txt', encoding='latin-1').read()
neg_rev = open('../input/negative.txt', encoding='latin-1').read()
pos_rev = pos_rev.split('\n')
neg_rev = neg_rev.split('\n')

In [87]:
len(pos_rev), len(neg_rev)

(5331, 5331)

## Preprocessing data

***Tokenizing Reviews***

In [88]:
tokenizer = TreebankWordTokenizer()
pos_rev_token = [tokenizer.tokenize(rev) for rev in pos_rev]
neg_rev_token = [tokenizer.tokenize(rev) for rev in neg_rev]

***Removing punctuations and Numbers***

In [73]:
pos_rev_token = [[token for token in rev if token.isalpha()] for rev in pos_rev_token]
neg_rev_token = [[token for token in rev if token.isalpha()] for rev in neg_rev_token]

***Stemming***

In [22]:
ps = PorterStemmer()
pos_rev_token = [[ps.stem(token) for token in rev] for rev in pos_rev_token]
neg_rev_token = [[ps.stem(token) for token in rev] for rev in neg_rev_token]

***Removing stopwords***

In [89]:
STOPWORDS = set(stopwords.words('english'))
pos_rev_token = [[token for token in rev if token not in STOPWORDS] for rev in pos_rev_token]
neg_rev_token = [[token for token in rev if token not in STOPWORDS] for rev in neg_rev_token]

***Removing words which have length < 3***

In [90]:
pos_rev_token = [[token for token in rev if len(token)>2] for rev in pos_rev_token]
neg_rev_token = [[token for token in rev if len(token)>2] for rev in neg_rev_token]

## Splitting data into test and train sets

In [91]:
x_train = pos_rev_token[0:5000]+neg_rev_token[0:5000]
y_train = [1 for i in range(5000)]+[0 for i in range(5000)]
x_test = pos_rev_token[5000:]+neg_rev_token[5000:]
y_test = [1 for i in range(len(pos_rev_token)-5000)]+[0 for i in range(len(neg_rev_token)-5000)]

In [92]:
len(x_train), len(y_train), len(x_test), len(y_test)

(10000, 10000, 662, 662)

***Shuffling the dataset***

In [93]:
train_df = pd.DataFrame({'text':x_train, 'rev':y_train})
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = pd.DataFrame({'text':x_test, 'rev':y_test})
test_df = test_df.sample(frac=1).reset_index(drop=True)

## Creating Tfidf Vectors

In [94]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5)
x_train_tfidf = vectorizer.fit_transform([' '.join(rev) for rev in train_df['text']])
x_test_tfidf = vectorizer.transform(' '.join(rev) for rev in test_df['text'])

## Training

***Using SVM***

In [95]:
clf = svm.SVC(gamma='scale')
clf.fit(x_train_tfidf, y_train)
predictions = clf.predict(x_test_tfidf)
print(classification_report(predictions, y_test))


              precision    recall  f1-score   support

           0       0.48      0.51      0.49       309
           1       0.54      0.51      0.53       353

   micro avg       0.51      0.51      0.51       662
   macro avg       0.51      0.51      0.51       662
weighted avg       0.51      0.51      0.51       662



***Using Logistic Regression***

In [96]:
clf = LogisticRegression(penalty='l2', C=1)
clf.fit(x_train_tfidf, y_train)
predictions = clf.predict(x_test_tfidf)
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.47      0.50      0.49       312
           1       0.53      0.50      0.51       350

   micro avg       0.50      0.50      0.50       662
   macro avg       0.50      0.50      0.50       662
weighted avg       0.50      0.50      0.50       662





***Using Naive Bayes Classifier***

In [97]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [98]:
clf = GaussianNB().fit(x_train_tfidf.toarray(), y_train)
predictions = clf.predict(x_test_tfidf.toarray())
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.43      0.51      0.47       275
           1       0.60      0.51      0.55       387

   micro avg       0.51      0.51      0.51       662
   macro avg       0.51      0.51      0.51       662
weighted avg       0.52      0.51      0.51       662



In [99]:
clf = MultinomialNB().fit(x_train_tfidf.toarray(), y_train)
predictions = clf.predict(x_test_tfidf.toarray())
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.50      0.53      0.51       314
           1       0.55      0.53      0.54       348

   micro avg       0.53      0.53      0.53       662
   macro avg       0.53      0.53      0.53       662
weighted avg       0.53      0.53      0.53       662



In [100]:
clf = BernoulliNB().fit(x_train_tfidf.toarray(), y_train)
predictions = clf.predict(x_test_tfidf.toarray())
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.50      0.53      0.51       318
           1       0.54      0.52      0.53       344

   micro avg       0.52      0.52      0.52       662
   macro avg       0.52      0.52      0.52       662
weighted avg       0.52      0.52      0.52       662



#### The accuracy is very poor.