In [1]:
import glob
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# The dataset is available here: http://ai.stanford.edu/~amaas/data/sentiment/
# Root of the extracted archive; change this one constant to point at a local copy.
DATA_DIR = '/home/kamlesh/datasets/aclImdb_v1/aclImdb'

# glob returns files in filesystem-dependent order, so sort each listing to keep
# the corpus order (and therefore the seeded train/validation split made later)
# identical from machine to machine.
train_data_p = sorted(glob.glob(DATA_DIR + '/train/pos/*'))
train_data_n = sorted(glob.glob(DATA_DIR + '/train/neg/*'))
test_data_p = sorted(glob.glob(DATA_DIR + '/test/pos/*'))
test_data_n = sorted(glob.glob(DATA_DIR + '/test/neg/*'))

In [3]:
# Raw strings keep the regex escapes (\[, \s, \-, \/) away from Python's own
# string-escape processing, which warns about unrecognised escape sequences.
# Punctuation characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Doubled HTML line breaks, hyphens and slashes, each replaced by one space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def clean_data(data):
    """Normalise one review string before vectorisation.

    Each ``<br /><br />`` pair, hyphen and slash is first replaced by a single
    space; the text is then lower-cased and the punctuation characters matched
    by ``REPLACE_NO_SPACE`` are removed.

    Note that a lone ``<br />`` is not matched as a tag: only its slash is
    replaced by a space.

    Args:
        data: Raw review text.

    Returns:
        The cleaned, lower-cased text.
    """
    data = REPLACE_WITH_SPACE.sub(" ", data)
    data = REPLACE_NO_SPACE.sub("", data.lower())
    return data


In [4]:
def load_reviews(paths):
    """Read and clean every review file in ``paths``, preserving their order.

    Args:
        paths: Iterable of file paths, one review per file.

    Returns:
        A list with the cleaned text of each file, in the same order as
        ``paths``.
    """
    reviews = []
    for path in paths:
        # The context manager closes each handle promptly instead of leaking
        # tens of thousands of open files until garbage collection. The
        # explicit encoding keeps decoding independent of the platform's
        # locale default (the corpus is presumably UTF-8 -- confirm if
        # decoding errors appear).
        with open(path, "r", encoding="utf-8") as handle:
            reviews.append(clean_data(handle.read().strip()))
    return reviews


# Positive reviews come first, then negative ones; the labels built further
# down rely on this ordering.
train = load_reviews(train_data_p) + load_reviews(train_data_n)
test = load_reviews(test_data_p) + load_reviews(test_data_n)

print(len(train))
print(len(test))

25000
25000


In [5]:
# Binary bag-of-words: each feature records only whether a word occurs in the
# review. The vocabulary is learned from the training corpus alone.
cv = CountVectorizer(binary=True)
X_train = cv.fit_transform(train)
X_test = cv.transform(test)

# Labels: 1 = positive, 0 = negative. Derive the counts from the file lists
# (positives were loaded before negatives) rather than hard-coding
# 12500/25000, so the labels stay aligned if the corpus is subsampled.
y_train = [1] * len(train_data_p) + [0] * len(train_data_n)
y_test = [1] * len(test_data_p) + [0] * len(test_data_n)
assert len(y_train) == len(train), "training labels do not line up with documents"
assert len(y_test) == len(test), "test labels do not line up with documents"

In [6]:
# Hold out 20% of the training data for validation.
# NOTE: this rebinds X_train / y_train to the 80% subset, so re-running the
# cell without re-running the vectorisation cell shrinks the data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.80,
    random_state=42,
)

# Sweep the inverse regularisation strength and report validation accuracy.
for c in (0.01, 0.05, 0.1, 0.5, 1):
    model = LogisticRegression(C=c)
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print("validation accuracy for c=%s:%s" % (c, val_accuracy))



validation accuracy for c=0.01:0.8766
validation accuracy for c=0.05:0.8862
validation accuracy for c=0.1:0.8896
validation accuracy for c=0.5:0.8822
validation accuracy for c=1:0.879


In [7]:
# Refit with C=0.1 (the best validation accuracy in the sweep) and score the
# model once on the held-out test set.
best_model = LogisticRegression(C=0.1)
best_model.fit(X_train, y_train)
test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("test accuracy is %s" % test_accuracy)

test accuracy is 0.87904


In [8]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when present and fall back to the old name on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map every vocabulary word to its learned logistic-regression coefficient.
feature_to_coef = dict(zip(feature_names, best_model.coef_[0]))

# Five words with the largest (most positive) coefficients.
for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print (best_positive)
# Five words with the smallest (most negative) coefficients.
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print (best_negative)

('excellent', 1.0140027766045905)
('perfect', 0.8589396465970884)
('great', 0.6868357072134068)
('surprisingly', 0.6834742465586405)
('refreshing', 0.6773943875302683)
('worst', -1.4781310139446902)
('waste', -1.3166634556909393)
('awful', -1.0990847187621628)
('disappointment', -1.0013182244167205)
('poorly', -0.9730142002179379)
