## Movies Review - Dataset

You will find two files there (one with positive and one with negative reviews).

### Naive Bayes Implementation
* Implement your own version of the Naive Bayes Classifier. Your task is to build a binary classifier that will perform movie review classification.

### Data Transformation
* Convert each example into a vector. You will likely end up with feature vectors of very high dimensionality. 
* keep only 10 most frequent words and/or remove all words that occur only in a single example). I
* Remove stop words from the data

### Tasks
* split your data into training (70%), development (15%) and test (15%)
* Tune your classifier on the development set and evaluate on the test set. 
* evaluation metric.

Include the details on what kind of pre-processing you performed. 


In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.preprocessing import scale
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
class NBClassifier:
    
    """
    Usage Example:
    
    X = np.array([[1,0,1,1],
                 [1,1,0,0],
                 [1,0,2,1],
                 [0,1,1,1],
                 [0,0,0,0]]);
    y = np.array([0,1,1,1,0,1,0,1])
    x_test = np.array([[1,0,1,0],[0,0,0,1]])
    
    clf = NBClassifier()
    clf.fit(X,y)
    clf.predict(x_test)
    """
    
    def __init__(self, priors=None, probabilities = False):
        self.probs = probabilities
        if priors:
            self.priors = priors
        
    def _priors(self, target):
        n_target = len(target)
        self.priors = dict(Counter(target))
        for k in self.priors.keys():
            self.priors[k] = self.priors[k]/float(n_target)
        return self.priors
    
    def fit(self, X, y):
        self._X=X
        self._y=y
        self._classes = np.unique(self._y)
        features = self._X.shape[1]
        self._likelihoods = {i:defaultdict(list) for i in self._classes} 

        self._classes_proba = self._priors(self._y)

        for cls in self._classes:
            idx_class = np.where(self._y == cls)[0]
            X_subset = self._X[idx_class, :]
            for i in range(X_subset.shape[1]):
                self._likelihoods[cls][i] += list(X_subset[:,i])
                
        for cls in self._classes:
            for i in range(features):
                 self._likelihoods[cls][i] = self._priors(self._likelihoods[cls][i])

    def predict(self, X_pred):
        self.pred = np.empty(shape=(X.shape[0],0),dtype = self._y.dtype)
        results = {}
        for i in range(X_pred.shape[0]):
            for cls in self._classes:
                cls_proba = self._classes_proba[cls]
                for k in range(X_pred[i].shape[0]):
                    _feature_proba = self._likelihoods[cls][k]
                    if X_pred[i][k] in self._classes_proba.keys():
                        cls_proba *= _feature_proba[X_pred[i][k]]
                    else:
                        cls_proba *= 0
                    results[cls] = cls_proba
            if self.probs:
                self.pred = np.append(self.pred, max(results.values()))
            else:
                dd = {v[1]:v[0] for v in results.items()}
                self.pred = np.append(self.pred, dd[max(dd.keys())])
        return self.pred
    

In [3]:
def text_processing(messages, stop_words = ['english'], max_features = None):
    corpus = [re.sub(r"\b\d+\b|![:alpha:]|\d|\W|_|\b\w{1,2}\b", " ", m).lower() for m in messages]
    if max_features is not None:
        vectorizer = CountVectorizer(strip_accents = 'ascii', 
                                     stop_words=stop_words, 
                                     max_features = max_features)
    else:
        vectorizer = CountVectorizer(strip_accents = 'ascii', 
                                     stop_words=stop_words_list)
    vcorpus = vectorizer.fit_transform(corpus)
    labels = vectorizer.get_feature_names()
    return vcorpus.toarray(), labels

def data_preparation(pos_reviews, neg_reviews,file_out = False):
    neg_reviews["class_label"] = 0
    neg_reviews.columns = ["review","class_label"]
    pos_reviews["class_label"] = 1
    pos_reviews.columns = ["review","class_label"]
    reviews = pd.concat([pos_reviews, neg_reviews],axis=0)
    if file_out:
        reviews.to_csv("data/reviews.csv",index=False)
    return reviews

def data_split(X,y,splits = [0.3,0.5]):
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size = splits[0], 
                                                        random_state=1)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, 
                                                    test_size = splits[1], 
                                                    random_state=1)
    return X_train, y_train, X_dev, y_dev, X_test, y_test 

In [4]:
neg_reviews = pd.read_csv("data/neg-rt-polarity.txt", sep="\n", header=None)
pos_reviews = pd.read_csv("data/pos-rt-polarity.txt", sep="\n", header=None)
reviews = data_preparation(pos_reviews, neg_reviews, file_out = False)

In [5]:
X, labels = text_processing(reviews['review'],max_features=100)
y = np.array(reviews['class_label'])

### Data Split: Train (70%), development (15%) and test (15%)

In [6]:
X_train, y_train, X_dev, y_dev, X_test, y_test = data_split(X, y, splits = [0.3,0.5])

### Naive Bayes Implementation

In [7]:
clf = NBClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_dev)
f1_score(y_pred = pred, y_true = y_dev, average='weighted')
# 10 - 0.5373
# 100 - 0.5743

0.5743088768867185

### Scikit-Learn - Multinomial Naive Bayes

#### Model prediction - dev set

In [8]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_dev)
f1_score(y_pred = pred, y_true = y_dev, average='weighted')
# 10 - 0.5487
# 100 - 0.6190

0.61905910063885405

#### Model prediction - test set

In [9]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
f1_score(y_pred = pred, y_true = y_test, average='weighted')
# 10 - 0.5472
# 100 - 0.6185

0.61857126113455219

#### Model Cross-validation 

In [10]:
clf = MultinomialNB(alpha=0.001)
cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()
# 10 - 0.55862072300342358
# 100 - 0.6229

0.62296095908115023