In [1]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

ModuleNotFoundError: No module named 'nltk'

In [None]:
# Read the polarity corpus from the local "txt_sentoken" folder; the raw
# review contents become X and their class labels become y.
movie_data = load_files("txt_sentoken")
X = movie_data.data
y = movie_data.target

In [None]:
'''
=======

Introduction

This README v2.0 (June, 2004) for the v2.0 polarity dataset comes from
the URL http://www.cs.cornell.edu/people/pabo/movie-review-data .

=======

What's New -- June, 2004

This dataset represents an enhancement of the review corpus v1.0
described in README v1.1: it contains more reviews, and labels were
created with an improved rating-extraction system.

=======

Citation Info 

This data was first used in Bo Pang and Lillian Lee,
``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
Based on Minimum Cuts'',  Proceedings of the ACL, 2004.

@InProceedings{Pang+Lee:04a,
  author =       {Bo Pang and Lillian Lee},
  title =        {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle =    "Proceedings of the ACL",
  year =         2004
}

=======

Data Format Summary 

- review_polarity.tar.gz: contains this readme and  data used in
  the experiments described in Pang/Lee ACL 2004.

  Specifically:

  Within the folder "txt_sentoken" are the 2000 processed down-cased
  text files used in Pang/Lee ACL 2004; the names of the two
  subdirectories in that folder, "pos" and "neg", indicate the true
  classification (sentiment) of the component files according to our
  automatic rating classifier (see section "Rating Decision" below).

  File names consist of a cross-validation tag plus the name of the
  original html file.  The ten folds used in the Pang/Lee ACL 2004 paper's
  experiments were:

     fold 1: files tagged cv000 through cv099, in numerical order
     fold 2: files tagged cv100 through cv199, in numerical order     
     ...
     fold 10: files tagged cv900 through cv999, in numerical order

  Hence, the file neg/cv114_19501.txt, for example, was labeled as
  negative, served as a member of fold 2, and was extracted from the
  file 19501.html in polarity_html.zip (see below).

  Each line in each text file corresponds to a single sentence, as
  determined by Adwait Ratnaparkhi's sentence boundary detector
  MXTERMINATOR.
 
  Preliminary steps were taken to remove rating information from the
  text files, but only the rating information upon which the rating
  decision was based is guaranteed to have been removed. Thus, if the
  original review contains several instances of rating information,
  potentially given in different forms, those not recognized as valid
  ratings remain part of the review text.
	
- polarity_html.zip: The original source files from which the
  processed, labeled, and (randomly) selected data in
  review_polarity.tar.gz was derived.

  Specifically:  

  This data consists of unprocessed, unlabeled html files from the
  IMDb archive of the rec.arts.movies.reviews newsgroup,
  http://reviews.imdb.com/Reviews. The files in review_polarity.tar.gz
  represent a processed subset of these files. 

=======

Rating Decision (Appendix A)

This section describes how we determined whether a review was positive
or negative.

The original html files do not have consistent formats -- a review may
not have the author's rating with it, and when it does, the rating can
appear at different places in the file in different forms.  We only
recognize some of the more explicit ratings, which are extracted via a
set of ad-hoc rules.  In essence, a file's classification is determined
based on the first rating we were able to identify.


- In order to obtain more accurate rating decisions, the maximum
	rating must be specified explicitly, both for numerical ratings
	and star ratings.  ("8/10", "four out of five", and "OUT OF
	****: ***" are examples of rating indications we recognize.)

- With a five-star system (or compatible number systems):
	three-and-a-half stars and up are considered positive, 
	two stars and below are considered negative.
- With a four-star system (or compatible number system):
	three stars and up are considered positive, 
	one-and-a-half stars and below are considered negative.  
- With a letter grade system:
	B or above is considered positive,
	C- or below is considered negative.

We attempted to recognize half stars, but they are specified in an
especially free way, which makes them difficult to recognize.  Hence,
we may lose a half star very occasionally; but this only results in 2.5
stars in five star system being categorized as negative, which is 
still reasonable.

'''

In [None]:
# Peek at the first raw review exactly as returned by load_files.
X[0]

In [None]:
# Class label attached to the first review.
y[0]

In [None]:
# Number of labelled reviews in the corpus.
len(y)

In [None]:
# Distinct label values present in the target.
set(y)

In [None]:
# Sum of the labels; if they are encoded 0/1 this is the number of class-1
# reviews — TODO confirm the encoding via movie_data.target_names.
y.sum()

In [None]:
# Preprocessing

from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer needs the WordNet corpus; the only download in the import
# cell is 'stopwords', so fetch it here to keep Restart & Run All working.
nltk.download('wordnet', quiet=True)

lemmatizer = WordNetLemmatizer()
stemmer = lemmatizer  # backward-compatible alias: the object is a lemmatizer, not a stemmer


def preprocess_document(raw, lemmatizer):
    """Normalise one raw review into a space-separated string of lemmas.

    Parameters
    ----------
    raw : bytes or str
        Review content. Bytes are decoded as UTF-8 (undecodable bytes are
        replaced) instead of being passed through ``str()``: the repr of a
        bytes object spells newlines as a backslash followed by ``n``, which
        glued a stray "n" onto the first word of every line and left a
        leading ``b`` token to clean up afterwards.
    lemmatizer : object
        Any object exposing ``lemmatize(word) -> str``.

    Returns
    -------
    str
        Lower-cased lemmas joined by single spaces; isolated single ASCII
        letters and all non-word characters are removed.
    """
    text = raw.decode('utf-8', errors='replace') if isinstance(raw, bytes) else str(raw)

    # Replace every non-word character (punctuation, symbols) with a space.
    text = re.sub(r'\W', ' ', text)

    tokens = text.lower().split()

    # Drop isolated single letters (e.g. the "s" left behind by "it's").
    # Filtering tokens also covers letters at the start or end of the text and
    # runs of consecutive single letters, which the previous regexes missed
    # (the start-of-string pattern escaped the caret, so it never matched).
    tokens = [token for token in tokens if not re.fullmatch(r'[a-z]', token)]

    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Read from movie_data.data rather than X: later cells rebind X to a feature
# matrix, so this keeps the cell safe to re-run.
documents = [preprocess_document(raw, lemmatizer) for raw in movie_data.data]

In [None]:
# First review after cleaning and lemmatization.
documents[0]

### Bag of Words (BoW)

In [None]:
# Bag-of-words counts, capped at 1500 features, with NLTK's English stop
# words excluded. The sparse result is densified into a NumPy array.
# NOTE(review): the vocabulary is fitted on all documents before the
# train/test split, so held-out reviews influence it — confirm this is acceptable.
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(
    max_features=1500,
    stop_words=english_stop_words,
)
bow_counts = vectorizer.fit_transform(documents)
X = bow_counts.toarray()

In [None]:
# Dimensions of the dense bag-of-words matrix (max_features caps the vocabulary at 1500).
X.shape

In [None]:
# Bag-of-words count vector of the first document.
X[0]

In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit logistic regression (C=1) on the training split, then label the
# held-out reviews.
lr = LogisticRegression(C=1).fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Confusion matrix of the logistic-regression predictions on BoW features.
# A title is set so this figure can be told apart from the other models'
# otherwise identical heatmaps.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='PuRd',
            linewidths=.5, linecolor='gray', fmt="d", ax=ax)
ax.set_title('Logistic regression (BoW): confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Per-class report and accuracy are computed from the thresholded predictions.
print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric: feed it the predicted probability of the
# positive class (column 1) rather than hard 0/1 labels, which would reduce
# the ROC curve to a single operating point.
y_score = lr.predict_proba(X_test)[:, 1]
print("Roc-auc: ", roc_auc_score(y_test, y_score))

In [None]:
# Train a 1000-tree random forest (seeded for reproducibility) and label the
# held-out reviews.
rf = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions on BoW features.
# A title is set so this figure can be told apart from the other models'
# otherwise identical heatmaps.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='PuRd',
            linewidths=.5, linecolor='gray', fmt="d", ax=ax)
ax.set_title('Random forest (BoW): confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Per-class report and accuracy are computed from the thresholded predictions.
print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric: feed it the predicted probability of the
# positive class (column 1) rather than hard 0/1 labels, which would reduce
# the ROC curve to a single operating point.
y_score = rf.predict_proba(X_test)[:, 1]
print("Roc-auc: ", roc_auc_score(y_test, y_score))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings: fit on the training split,
# then label the held-out reviews.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of the multinomial naive Bayes predictions on BoW features.
# A title is set so this figure can be told apart from the other models'
# otherwise identical heatmaps.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='PuRd',
            linewidths=.5, linecolor='gray', fmt="d", ax=ax)
ax.set_title('Multinomial naive Bayes (BoW): confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Per-class report and accuracy are computed from the thresholded predictions.
print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric: feed it the predicted probability of the
# positive class (column 1) rather than hard 0/1 labels, which would reduce
# the ROC curve to a single operating point.
y_score = model.predict_proba(X_test)[:, 1]
print("Roc-auc: ", roc_auc_score(y_test, y_score))

### TF-IDF

In [None]:
# TF-IDF weights, capped at 2000 features, with NLTK's English stop words
# excluded. The sparse result is densified into a NumPy array.
# NOTE(review): the vocabulary and IDF weights are fitted on all documents
# before the train/test split, so held-out reviews influence them — confirm
# this is acceptable.
english_stop_words = stopwords.words('english')
tfidfconverter = TfidfVectorizer(
    max_features=2000,
    stop_words=english_stop_words,
)
tfidf_weights = tfidfconverter.fit_transform(documents)
X = tfidf_weights.toarray()

In [None]:
# First 20 TF-IDF weights of the first document.
X[0][:20]

In [None]:
# Re-split with the TF-IDF features: 20% held out, same seed as the
# bag-of-words split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit logistic regression (C=1) on the TF-IDF training split, then label the
# held-out reviews.
lr = LogisticRegression(C=1).fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Confusion matrix of the logistic-regression predictions on TF-IDF features.
# A title is set so this figure can be told apart from the other models'
# otherwise identical heatmaps.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='PuRd',
            linewidths=.5, linecolor='gray', fmt="d", ax=ax)
ax.set_title('Logistic regression (TF-IDF): confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Per-class report and accuracy are computed from the thresholded predictions.
print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric: feed it the predicted probability of the
# positive class (column 1) rather than hard 0/1 labels, which would reduce
# the ROC curve to a single operating point.
y_score = lr.predict_proba(X_test)[:, 1]
print("Roc-auc: ", roc_auc_score(y_test, y_score))

In [None]:
# Train a 500-tree random forest (seeded for reproducibility) on the TF-IDF
# training split and label the held-out reviews.
rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
).fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Confusion matrix of the random-forest predictions on TF-IDF features.
# A title is set so this figure can be told apart from the other models'
# otherwise identical heatmaps.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='PuRd',
            linewidths=.5, linecolor='gray', fmt="d", ax=ax)
ax.set_title('Random forest (TF-IDF): confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Per-class report and accuracy are computed from the thresholded predictions.
print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric: feed it the predicted probability of the
# positive class (column 1) rather than hard 0/1 labels, which would reduce
# the ROC curve to a single operating point.
y_score = rf.predict_proba(X_test)[:, 1]
print("Roc-auc: ", roc_auc_score(y_test, y_score))

In [None]:
# Multinomial naive Bayes with default settings on the TF-IDF training split;
# then label the held-out reviews.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of the multinomial naive Bayes predictions on TF-IDF
# features. A title is set so this figure can be told apart from the other
# models' otherwise identical heatmaps.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='PuRd',
            linewidths=.5, linecolor='gray', fmt="d", ax=ax)
ax.set_title('Multinomial naive Bayes (TF-IDF): confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Per-class report and accuracy are computed from the thresholded predictions.
print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric: feed it the predicted probability of the
# positive class (column 1) rather than hard 0/1 labels, which would reduce
# the ROC curve to a single operating point.
y_score = model.predict_proba(X_test)[:, 1]
print("Roc-auc: ", roc_auc_score(y_test, y_score))

### Exercises
1) Try other classifiers and compare the results;  
2) Find the best hyperparameters using grid search or randomized search (`GridSearchCV` / `RandomizedSearchCV`).