In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import timeit
import warnings

In [4]:
# Load the training reviews, one review per line, stripped of surrounding
# whitespace. Positive reviews are read first, then negative ones, so the
# resulting list is ordered [all positives..., all negatives...].
reviews_train = []
for path in ('positive.txt', 'negative.txt'):
    # `with` guarantees the file handle is closed even if reading fails;
    # a bare open() inside a for-loop leaves closing to the garbage collector.
    with open(path, 'r') as review_file:
        reviews_train.extend(line.strip() for line in review_file)

In [5]:
# Load the held-out test reviews, one review per line, stripped of surrounding
# whitespace. Positive reviews are read first, then negative ones, so the
# resulting list is ordered [all positives..., all negatives...].
reviews_test = []
for path in ('positive_test.txt', 'negative_test.txt'):
    # `with` guarantees the file handle is closed even if reading fails;
    # a bare open() inside a for-loop leaves closing to the garbage collector.
    with open(path, 'r') as review_file:
        reviews_test.extend(line.strip() for line in review_file)

In [6]:
# Bag-of-words encoder configured with binary=True.
cv = CountVectorizer(binary=True)
# Learn the vocabulary from the training reviews and encode them in one step.
X = cv.fit_transform(reviews_train)
# Encode the test reviews with the vocabulary learned from the training set
# only, so no information from the test data reaches the feature space.
X_test = cv.transform(reviews_test)

In [7]:
# Sanity check on the training matrix: (number of reviews, vocabulary size).
n_reviews, n_terms = X.shape
print((n_reviews, n_terms))

(25000, 31402)


In [8]:
# Label vectors for the train and test sets: the first 12500 entries are 1.0
# and the remaining 12500 are 0.0, matching the positive-then-negative order
# in which the review files are read. Two separate arrays are built so that
# mutating one never affects the other.
y = np.repeat([1.0, 0.0], 12500)
y_test = np.repeat([1.0, 0.0], 12500)

In [9]:
# Hold out 25% of the labelled training data as a validation set.
# random_state pins the shuffle: without it every kernel restart produces a
# different split, so validation accuracies (and any hyper-parameter chosen
# from them) cannot be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.75, random_state=42
)

In [49]:
# Sweep the LogisticRegression C parameter and report the validation accuracy
# obtained for each candidate value.
times = []  # initialised here; nothing in this cell appends to it
for c in (0.01, 0.05, 0.25, 0.5, 1, 2, 3, 4):
    lr = LogisticRegression(C=c).fit(X_train, y_train)
    val_accuracy = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, val_accuracy))

Accuracy for C=0.01: 0.8656
Accuracy for C=0.05: 0.87408
Accuracy for C=0.25: 0.87168
Accuracy for C=0.5: 0.86928
Accuracy for C=1: 0.86672
Accuracy for C=2: 0.8656
Accuracy for C=3: 0.864
Accuracy for C=4: 0.8632


In [10]:
# Refit on the complete training set (train + validation rows) with C=0.05,
# then measure accuracy on the separate test reviews.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, y)
test_predictions = final_model.predict(X_test)
print("Final Accuracy: %s" % accuracy_score(y_test, test_predictions))



Final Accuracy: 0.87976


In [56]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# accessor only on releases that predate it.
_get_vocabulary = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names

# Map every vocabulary term to the weight the final model learned for it.
# Casting to built-in str/float keeps the printed tuples in the plain
# ('word', 0.123) form regardless of how numpy renders its scalar types.
feature_to_coef = {
    str(word): float(coef)
    for word, coef in zip(_get_vocabulary(), final_model.coef_[0])
}

# Largest positive weights push predictions towards label 1 (positive review).
print('Best positive predicator words:')
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:5]:
    print(best_positive)

# Most negative weights push predictions towards label 0 (negative review).
print('Best negative predicator words:')
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:5]:
    print(best_negative)

Best positive predicator words:
('excellent', 0.9266022329819438)
('perfect', 0.7513393762433902)
('great', 0.6655088079286111)
('favorite', 0.6501970743548225)
('amazing', 0.6433794749834267)
Best negative predicator words:
('worst', -1.3871248297321959)
('waste', -1.213651247902094)
('awful', -1.0362327120362491)
('poorly', -0.8941649158587862)
('disappointment', -0.8394736092762659)
