In [44]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/ivan/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Lists of reviews

In [45]:
ids = movie_reviews.fileids()
reviews = [movie_reviews.raw(fileids=[f]) for f in ids]
print('Quantity of reiews: ', len(ids))

with open('Answers 1/ans1.txt', 'w') as file:
    file.write(str(len(ids)))

Quantity of reiews:  2000


Negative and positive id's

In [46]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [movie_reviews.raw(fileids=[f]) for f in negids]
posfeats = [movie_reviews.raw(fileids=[f]) for f in posids]

print('Quantity of negative reviews: ', len(negids))
print('Quantity of positive reviews: ', len(posids))

with open('Answers 1/ans2.txt', 'w') as file:
    partition = len(posids)/len(ids)
    file.write(str(partition))

Quantity of negative reviews:  1000
Quantity of positive reviews:  1000


Classes of reviews (negative = 0, positive = 1)

In [47]:
y = []
for review in ids:
    if review in negids:
        y.append(0)
    else:
        y.append(1)

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score, auc
from sklearn.model_selection import cross_val_score

Simple vectorizer

In [49]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)

In [50]:
quantity_of_features = len(vectorizer.get_feature_names())
print(quantity_of_features)

with open('Answers 1/ans3.txt', 'w') as file:
    file.write(str(quantity_of_features))

39659


Simple classification with accuracy score

In [51]:
clf = LogisticRegression()
pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('clf', LogisticRegression())])
result_accuracy = cross_val_score(pipeline, reviews, y, scoring=make_scorer(accuracy_score))

In [52]:
print(result_accuracy)
with open('Answers 1/ans4.txt', 'w') as file:
    file.write(str(result_accuracy[1]))

[0.81437126 0.84684685 0.84684685]


And ROC AUC score

In [53]:
result_roc_auc = cross_val_score(pipeline, reviews, y, scoring=make_scorer(roc_auc_score))

In [54]:
print(result_roc_auc)
with open('Answers 1/ans5.txt', 'w') as file:
    file.write(str(result_roc_auc[1]))

[0.81437126 0.84684685 0.84684685]


Let's find the most important features (words)

In [55]:
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
coeffs = clf.coef_[0]
mif = [vectorizer.get_feature_names()[list(coeffs).index(i)] for i in sorted(coeffs)[:5]]
print(mif)

with open('Answers 1/ans6.txt', 'w') as file:
    file.write(mif[0] + ' ' + mif[1])

['bad', 'unfortunately', 'worst', 'waste', 'nothing']


Compare CountVectorizer and TfidfVectorizer

In [57]:
count_pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('clf', LogisticRegression())])

count_cross_val_score = cross_val_score(count_pipeline, reviews, y, scoring=make_scorer(accuracy_score), cv=5)
count_result_accuracy = count_cross_val_score.mean()
count_result_std = count_cross_val_score.std()

print("Accuracy for CountVectorizer:", count_result_accuracy)
print("Std for CountVectorizer:", count_result_std)

Accuracy for CountVectorizer: 0.841
Std for CountVectorizer: 0.01677796173556255


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_pipeline = Pipeline([('vect', TfidfVectorizer()), 
                     ('clf', LogisticRegression())])

tfidf_cross_val_score = cross_val_score(tfidf_pipeline, reviews, y, scoring=make_scorer(accuracy_score), cv=5)
tfidf_result_accuracy = tfidf_cross_val_score.mean()
tfidf_result_std = tfidf_cross_val_score.std()

print("Accuracy for TfidfVectorizer:", tfidf_result_accuracy)
print("Std for TfidfVectorizer:", tfidf_result_std)

Accuracy for TfidfVectorizer: 0.8210000000000001
Std for TfidfVectorizer: 0.004062019202317978


In [59]:
with open('Answers 2/ans1.txt', 'w') as file:
    file.write(str(count_result_accuracy) + ' ' + str(count_result_std) + ' ' +
               str(tfidf_result_accuracy) + ' ' + str(tfidf_result_std))

Seems like CountVectorizer is better. Let's try to improve it

In [60]:
min_dfs = [10, 50]
improved_acc = []

for df in min_dfs:
    improved_count_pipeline = Pipeline([('vect', CountVectorizer(min_df=df)), 
                                 ('clf', LogisticRegression())])
    improved_acc.append(cross_val_score(improved_count_pipeline, reviews, 
                                        y, scoring=make_scorer(accuracy_score), cv=5).mean())
    
print("Result with min_dfs=10", improved_acc[0])
print("Result with min_dfs=50", improved_acc[1])

with open('Answers 2/ans2.txt', 'w') as file:
    file.write(str(improved_acc[0]) + ' ' + str(improved_acc[1]))

Result with min_dfs=10 0.8390000000000001
Result with min_dfs=50 0.813


Let's try other classificators

In [70]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')

lr_pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('clf', LogisticRegression())])
sgd_pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('clf', SGDClassifier(random_state=42))])
svc_pipeline = Pipeline([('vect', CountVectorizer()), 
                     ('clf', LinearSVC())])

lr_cross_val_score = cross_val_score(lr_pipeline, reviews, y, scoring=make_scorer(accuracy_score), cv=5)
sgd_cross_val_score = cross_val_score(sgd_pipeline, reviews, y, scoring=make_scorer(accuracy_score), cv=5)
svc_cross_val_score = cross_val_score(svc_pipeline, reviews, y, scoring=make_scorer(accuracy_score), cv=5)

lr_accuracy = lr_cross_val_score.mean()
sgd_accuracy = sgd_cross_val_score.mean()
svc_accuracy = svc_cross_val_score.mean()

print("Accuracy for LinearRegression:", lr_accuracy)
print("Accuracy for SGDClassifier:", sgd_accuracy)
print("Accuracy for LinearSVC:", svc_accuracy)

with open('Answers 2/ans3.txt', 'w') as file:
    file.write(str(min(lr_accuracy, sgd_accuracy, svc_accuracy)))

Accuracy for LinearRegression: 0.841
Accuracy for SGDClassifier: 0.74
Accuracy for LinearSVC: 0.8325000000000001


Try to use stop-words

In [66]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
print(stop_words)

[nltk_data] Downloading package stopwords to /Users/ivan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both

In [69]:
nltk_sw_pipeline = Pipeline([('vect', CountVectorizer(stop_words=stop_words)), 
                     ('clf', LogisticRegression())])
sklearn_sw_pipeline = Pipeline([('vect', CountVectorizer(stop_words="english")), 
                     ('clf', LogisticRegression())])

nltk_sw_accuracy = cross_val_score(nltk_sw_pipeline, reviews, y, 
                                   scoring=make_scorer(accuracy_score), cv=5).mean()
sklearn_sw_accuracy = cross_val_score(sklearn_sw_pipeline, reviews, y, 
                                      scoring=make_scorer(accuracy_score), cv=5).mean()

print("Accuracy with nltk stop words:", nltk_sw_accuracy)
print("Accuracy with sklearn stop words:", sklearn_sw_accuracy)

with open('Answers 2/ans4.txt', 'w') as file:
    file.write(str(nltk_sw_accuracy) + ' ' + str(sklearn_sw_accuracy))

Accuracy with nltk stop words: 0.841
Accuracy with sklearn stop words: 0.8385


Try to add bigrams and n-grams

In [72]:
bigram_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,2), analyzer='char_wb')), 
                     ('clf', LogisticRegression())])
ngram_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(3,5), analyzer='char_wb')), 
                     ('clf', LogisticRegression())])

bigram_accuracy = cross_val_score(bigram_pipeline, reviews, y, scoring=make_scorer(accuracy_score), cv=5).mean()
ngram_accuracy = cross_val_score(ngram_pipeline, reviews, y, scoring=make_scorer(accuracy_score), cv=5).mean()

print("Accuracy with bigrams:", bigram_accuracy)
print("Accuracy with 3-5grams:", ngram_accuracy)

with open('Answers 2/ans5.txt', 'w') as file:
    file.write(str(bigram_accuracy) + ' ' + str(ngram_accuracy))

Accuracy with bigrams: 0.704
Accuracy with 3-5grams: 0.819
