In [None]:
!pip install -r requirements.txt

In [None]:
import nltk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# `IPython.display` is the public import path; importing these names from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS that stretches the `.container` element to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=186el6hPe81eA1YkS_2PU4S4ZbYzhNPjr' -O dataset.csv

# 1. Exploratory Data Analysis

In [None]:
# Read the reviews CSV from the working directory into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

In [None]:
# Histogram of the sentiment column, to eyeball the class balance.
sentiment_column = dataset['sentiment']
sentiment_column.hist()

We are mostly interested in negative comments, so we assign them the label 1 (positive comments get the label 0).

In [None]:
# Encode the target: positive -> 0, negative -> 1.
label_by_sentiment = {'positive': 0, 'negative': 1}

# Guard against re-running the cell: mapping an already encoded column again
# would turn every 0/1 into NaN, because those values are not keys of the dict.
if not dataset['sentiment'].isin(list(label_by_sentiment.values())).all():
    dataset['sentiment'] = dataset['sentiment'].map(label_by_sentiment)
dataset.head()

In [None]:
# Pull the review texts out of the frame as a plain Python list.
review_column = dataset['review']
reviews = review_column.to_list()
reviews[:10]

# 2. Data preprocessing
## 2.1. Remove structures without semantic meaning

In [None]:
import re

In [None]:
# Replace markup such as "<br />" with a space rather than nothing, so the
# words on either side of a tag are not glued together into one token.
# Raw string keeps the pattern free of string-escape processing.
reviews = [re.sub(r"<.+?>", " ", rev) for rev in reviews]
reviews[:2]

In [None]:
# Drop every whitespace-delimited chunk that contains a digit (e.g. "10/10").
# Raw string: "\S" and "\d" are regex escapes, not valid Python string escapes,
# and the non-raw form triggers an invalid-escape warning on newer Pythons.
reviews = [re.sub(r'\S*\d+\S*', "", rev) for rev in reviews]
reviews[:2]

## 2.2. Remove capital letters

In [None]:
# Lower-case every review.
reviews = list(map(str.lower, reviews))
reviews[:2]

## 2.3. Tokenization

In [None]:
# 'punkt' serves older NLTK releases; newer releases look up 'punkt_tab' when
# word_tokenize is called. Requesting both keeps the cell working on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Split each review string into a list of word/punctuation tokens.
reviews = [nltk.word_tokenize(rev) for rev in reviews]
reviews[:2]

## 2.4. Stopwords removal

In [None]:
# Fetch the stopword corpus and load the English list.
nltk.download('stopwords')

stopword_corpus = nltk.corpus.stopwords
stopwords = stopword_corpus.words('english')
stopwords

In [None]:
# Extra contraction fragments to filter out (presumably what the tokenizer
# yields for "we'll", "you're", "I'm" — verify against the tokenized output).
extra_stopwords = ["'ll", "'re", "'m"]

# Append only the missing entries so re-running the cell does not duplicate them.
stopwords.extend([tok for tok in extra_stopwords if tok not in stopwords])

In [None]:
# Show the stopword list after the extension above.
stopwords

In [None]:
# Membership tests on a set are O(1); scanning the stopword list for every
# token of every review is needlessly slow.
stopword_set = set(stopwords)
reviews = [[token for token in rev if token not in stopword_set] for rev in reviews]
reviews[:2]

## 2.5. Stemming

In [None]:
# Porter stemmer instance used for the stemming step below.
stemmer = nltk.stem.PorterStemmer()

In [None]:
# Token lists before stemming, for comparison with the stemmed output.
reviews[:2]

In [None]:
from tqdm import tqdm

In [None]:
# Stem every token of every review; tqdm reports progress over the reviews.
reviews = [list(map(stemmer.stem, rev)) for rev in tqdm(reviews, position=0, leave=True)]

In [None]:
# First review after stemming.
reviews[0]

## 2.6. Joining preprocessed words into full sentences

In [None]:
# Glue each token list back into a single space-separated string.
reviews = list(map(" ".join, reviews))

## 2.7. Punctuation removal

In [None]:
import string

# The punctuation characters that the following cells strip from the reviews.
string.punctuation

In [None]:
# Translation table that deletes every punctuation character: the third
# argument of str.maketrans lists characters to map to None.
punk_table = str.maketrans('', '', string.punctuation)
punk_table

In [None]:
# Apply the deletion table to every review string.
reviews = list(map(lambda rev: rev.translate(punk_table), reviews))
reviews[0]

## 2.8. Putting it all together into TextCleaner

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextCleaner(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalises raw review strings.

    For each input string the steps are, in order: regex-based removal of
    markup and digit-bearing chunks, lower-casing, NLTK word tokenisation,
    stopword filtering, Porter stemming, re-joining with single spaces and
    finally stripping of ASCII punctuation.
    """

    def __init__(self):
        # Raw strings: "\S" / "\d" are regex escapes, not Python string escapes.
        self.patterns = [r"<.+?>", r"\S*\d+\S*"]
        self.stopwords = nltk.corpus.stopwords.words('english') + ["'ll", "'re", "'m"]
        self.punk_table = str.maketrans({key: None for key in string.punctuation})
        self.stemmer = nltk.PorterStemmer()

    def fit(self, X, y=None):
        """No-op: the cleaner learns nothing from data. Returns ``self``."""
        return self

    def transform(self, X):
        """Clean an iterable of raw strings.

        Parameters
        ----------
        X : iterable of str
            Raw review texts.

        Returns
        -------
        list of str
            One cleaned string per input element, in the same order.
        """
        # Set lookup is O(1); the list attribute is kept for compatibility.
        stopword_set = set(self.stopwords)

        cleaned = []
        for rev in X:
            for pattern in self.patterns:
                # Substitute a space (not an empty string) so that words on
                # either side of e.g. "<br />" are not merged into one token.
                rev = re.sub(pattern, ' ', rev)
            tokens = nltk.word_tokenize(rev.lower())
            stems = [self.stemmer.stem(token) for token in tokens if token not in stopword_set]
            cleaned.append(" ".join(stems).translate(self.punk_table))
        return cleaned

In [None]:
# Run the whole cleaning chain on the untouched review column.
text_cleaner = TextCleaner()
raw_reviews = dataset['review'].values.tolist()
cleaned_reviews = text_cleaner.transform(raw_reviews)

In [None]:
# Spot-check the first two cleaned reviews.
cleaned_reviews[:2]

## 2.9. Sentence vectorization using TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# min_df=2 / max_df=0.3 are document-frequency cut-offs for the vocabulary.
tfidf = TfidfVectorizer(max_df=0.3, min_df=2)
tfidf.fit(cleaned_reviews)

reviews_tfidf = tfidf.transform(cleaned_reviews)
reviews_tfidf

In [None]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is the supported replacement and returns an ndarray of the vocabulary terms.
tfidf.get_feature_names_out()

In [None]:
# Largest TF-IDF weight in the first review's row.
first_row = reviews_tfidf.getrow(0)
first_row.toarray().max()

In [None]:
# Visualise the top-left 200 x 500 corner of the TF-IDF matrix as an image.
fig, ax = plt.subplots(figsize=(16, 16))
tfidf_corner = reviews_tfidf.tocsr()[:200, :500].toarray()
ax.imshow(tfidf_corner)
ax.set_xlabel('First 500 tokens')
ax.set_ylabel('First 200 reviews')

In [None]:
# A compressed sparse matrix keeps three arrays (values, column indices, row
# pointers); count all of them, not only the values. Assumes CSR/CSC layout,
# which the `.tocsr()` call in the plotting cell suggests — TODO confirm.
# The unit is megabytes ("MB"); "Mb" would read as megabits.
sparse_nbytes = (reviews_tfidf.data.nbytes
                 + reviews_tfidf.indices.nbytes
                 + reviews_tfidf.indptr.nbytes)
f'{sparse_nbytes / 1e6} MB'

## 2.10. Train-test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the cleaned text first and fit TF-IDF on the training part only, so the
# vocabulary and IDF weights are not influenced by the held-out reviews
# (fitting the vectorizer on all reviews before splitting leaks test data).
reviews_train, reviews_test, y_train, y_test = train_test_split(
    cleaned_reviews, dataset['sentiment'].values.ravel(), test_size=0.1, random_state=42)

# Separate vectorizer so the exploratory `tfidf` object is left untouched.
split_tfidf = TfidfVectorizer(min_df=2, max_df=0.3)
X_train = split_tfidf.fit_transform(reviews_train)
X_test = split_tfidf.transform(reviews_test)

In [None]:
# Display the training feature matrix summary.
X_train

# 3. Modeling 
## 3.1. Training Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline forest with default hyper-parameters. A fixed seed makes the fitted
# model, and therefore the reported scores, reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

## 3.2. Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the data the model was fitted on.
y_pred_train = model.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_pred_train)

In [None]:
# Accuracy on the held-out split.
y_pred_test = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Sentiment was encoded as positive -> 0 and negative -> 1, and the matrix is
# ordered by the `labels` argument, so the first row/column is the
# positive-sentiment class. Spell the sentiment and its label out in the axis
# names so the table cannot be read the wrong way round.
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred_test, labels=[0, 1]),
             columns=['predicted_positive_sentiment (0)', 'predicted_negative_sentiment (1)'],
             index=['real_positive_sentiment (0)', 'real_negative_sentiment (1)'])

## 3.3. Hyperparameter search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Fixed seed: otherwise differences between grid points are confounded with
# run-to-run randomness of the forest, and the search is not repeatable.
model = RandomForestClassifier(random_state=42)

# Candidate hyper-parameters; every combination is evaluated.
params = {
    'n_estimators': [100, 300],
    'max_depth': [30, 60, 120],
    'min_samples_leaf': [2, 4, 8, 16],
}

# 4-fold cross-validated accuracy, run on all available cores.
grid_search = GridSearchCV(model, param_grid=params, scoring='accuracy', cv=4, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Score the best estimator found by the grid search on both splits.
model = grid_search.best_estimator_
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print(model.get_params())
print(f'ACC train set: {train_acc}')
print(f'ACC test set: {test_acc}')

## 3.4. Creating classifier Pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain vectorisation and classification; the forest is built from the
# parameters of the current `model`.
tuned_forest_params = model.get_params()
pipeline_steps = [
    ('vectorizer', TfidfVectorizer(max_df=0.3, min_df=2)),
    ('classifier', RandomForestClassifier(**tuned_forest_params)),
]
classifier_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the final pipeline on every cleaned review and its label.
all_labels = dataset['sentiment'].values.ravel()
classifier_pipeline.fit(cleaned_reviews, all_labels)

In [None]:
raw_samples = ['This was the best movie i have ever seen. It should be in every cinema! I love it! :P ',
              'Terrible stuff, i cant say its good, what a waste of time... 0/10']
# The pipeline was fitted on TextCleaner output, so inference has to go through
# the same cleaning; feeding raw text would not match the learned vocabulary.
cleaned_samples = text_cleaner.transform(raw_samples)
classifier_pipeline.predict_proba(cleaned_samples)

In [None]:
import joblib

# Persist the fitted pipeline to disk.
joblib.dump(value=classifier_pipeline, filename='classifier_pipeline.pkl')

In [None]:
# NOTE: joblib.load unpickles arbitrary objects; only load files you created
# yourself or otherwise trust.
classifier_pipeline_restored = joblib.load('classifier_pipeline.pkl')

# The saved pipeline does not contain the TextCleaner step, so the samples
# must be cleaned the same way as the training data before predicting.
classifier_pipeline_restored.predict_proba(text_cleaner.transform(raw_samples))