### Basic Imports

In [41]:
import numpy as np
import pandas as pd
import os
import sys


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion


from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, recall_score, precision_score


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import matplotlib.pyplot as plt
%matplotlib inline

### Loading Data

In [42]:
# Directory that holds the review TSV files, relative to this notebook
DATA_PATH = "../data/"


def load_movie_reviews(path=DATA_PATH, train=True):
    """Read one of the movie-review TSV files into a DataFrame.

    path  --> string --> directory containing the TSV files
    train --> bool   --> truthy reads "labeledTrainData.tsv",
                         falsy reads "testData.tsv"

    Returns the DataFrame produced by pd.read_csv, using the first line of
    the file as the header.
    """
    file_name = "labeledTrainData.tsv" if train else "testData.tsv"
    tsv_path = os.path.join(path, file_name)
    # quoting=3 is csv.QUOTE_NONE: double quotes are kept as ordinary characters
    return pd.read_csv(tsv_path, delimiter='\t', header=0, quoting=3)

In [43]:
# Training data: keep the frame exactly as loaded and work on a separate copy
review_train_orig = load_movie_reviews(train=True, path=DATA_PATH)
review_train = review_train_orig.copy()

# Test data: same pattern, untouched original plus a working copy
review_test_orig = load_movie_reviews(train=False, path=DATA_PATH)
review_test = review_test_orig.copy()

In [44]:
# Summarize the test frame: row count, column names, non-null counts and dtypes
review_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id        25000 non-null object
review    25000 non-null object
dtypes: object(2)
memory usage: 390.7+ KB


In [45]:
# Summarize the training frame: row count, column names, non-null counts and dtypes
review_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


***Note:*** the test data set has no 'sentiment' column (only 'id' and 'review'), so we need to remember to add it, deriving each value from the 'id' column.

### Adding 'sentiment' To Test Data

In [46]:
def add_sentiment(df):
    """Add a 0/1 'sentiment' column to df, derived from its 'id' column.

    Each id is split on '_'; the second piece, with surrounding double quotes
    stripped, is parsed as an integer (presumably the reviewer's rating --
    confirm against the data set description). Values >= 5 are labelled 1,
    everything else 0.

    df is modified in place and nothing is returned.
    """
    labels = []
    for id_parts in df['id'].str.split('_'):
        score = int(id_parts[1].strip('"'))
        if score >= 5:
            labels.append(1)
        else:
            labels.append(0)
    df['sentiment'] = labels

# Add the 'sentiment' column to the test data; add_sentiment modifies review_test in place
add_sentiment(review_test)
# Preview the first rows to check the new column
review_test.head()

Unnamed: 0,id,review,sentiment
0,"""12311_10""","""Naturally in a film who's main themes are of ...",1
1,"""8348_2""","""This movie is a disaster within a disaster fi...",0
2,"""5828_4""","""All in all, this is a movie for kids. We saw ...",0
3,"""7186_2""","""Afraid of the Dark left me with the impressio...",0
4,"""12128_7""","""A very accurate depiction of small time mob l...",1


### Extracting Train and Test Output Columns

In [47]:
# Target vectors: the 'sentiment' labels of each data set as plain arrays
y_train = review_train['sentiment'].values
y_test  = review_test['sentiment'].values 

### Custom Classes

In [67]:
class columnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects columns from a DataFrame-like input.

    selected_columns --> column label or list of labels, used as-is to index
                         X in transform(). With a pandas DataFrame, a single
                         label gives a 1-D Series and a list of labels gives
                         a DataFrame.
    """
    def __init__(self, selected_columns):
        self.selected_columns = selected_columns

    def fit(self, X, y=None):
        """Nothing is learned; return self so the step works inside a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of X[self.selected_columns].

        The copy keeps later steps from modifying the caller's data.
        """
        # The original also indexed X with self.columns, an attribute that is
        # never set, so every call raised AttributeError before reaching here.
        return X[self.selected_columns].copy()

In [60]:
class modelScorer():
    """Score an already-fitted classifier on one data set.

    Predictions and the accuracy / recall / precision / confusion-matrix
    metrics are computed once in __init__ and cached on the instance; the
    remaining methods only report or return those cached values, except
    plot_roc(), which queries the model's predict_proba.
    """
    def __init__(self, title, model, y, X):
        """
        title --> string --> describe the model (used for book keeping)
        model --> sklearn model (already fitted; predict is called here)
        y     --> np array --> target output column from feature matrix
        X     --> feature matrix in whatever form model.predict accepts
        """
        self.title = title
        self.model = model
        self.y = y
        self.X = X

        self.y_predict = self.model.predict(self.X)

        # recall/precision use scikit-learn's default settings
        self.confusion_matrx = confusion_matrix(self.y, self.y_predict)
        self.accuracy_scr = accuracy_score(self.y, self.y_predict)
        self.recall_scr = recall_score(self.y, self.y_predict)
        self.precision_scr = precision_score(self.y, self.y_predict)

    def genScoreReport(self):
        """Print the cached accuracy, recall, precision and confusion matrix."""
        print("\t\tSCORE FOR {}".format(self.title))
        print("Accuracy:{0:.2f}".format(self.accuracy_scr))
        print("Recall:{0:.2f}".format(self.recall_scr))
        print("Precision:{0:.2f}".format(self.precision_scr))
        print("\nConfusion Matrix:\n {}".format(self.confusion_matrx))

    def plot_roc(self):
        """Plot the ROC curve for the model on (X, y), with the AUC in the legend."""
        # Score with column 1 of predict_proba (the second class in the model's
        # class ordering). Computed once and reused for both the AUC and the
        # curve instead of running the model twice.
        scores = self.model.predict_proba(self.X)[:, 1]
        AUC = roc_auc_score(self.y, scores)
        fpr, tpr, _ = roc_curve(self.y, scores)

        plt.figure()
        lw = 2
        plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % AUC)
        # Diagonal reference line from (0, 0) to (1, 1)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic:: {}'.format(self.title))
        plt.legend(loc="lower right");

    def getAccuracyScore(self):
        """Return the accuracy computed in __init__."""
        return self.accuracy_scr

    def getRecallScore(self):
        """Return the recall computed in __init__."""
        return self.recall_scr

    def getPrecisionScore(self):
        """Return the precision computed in __init__."""
        return self.precision_scr

    def getConfusionMatrix(self):
        """Return the confusion matrix computed in __init__."""
        return self.confusion_matrx

### Building Preprocessing Pipeline

In [61]:
selected_columns = ['review']
data_preprocess_pipeline = make_pipeline(columnSelector(selected_columns), CountVectorizer())

### Logisitc Regression

In [62]:
logistic_pipeline = make_pipeline(data_preprocess_pipeline, LogisticRegression())

In [65]:
param_grid = [{'logisticregression__C': [0.01, 0.1, 1, 10]}]

clf = GridSearchCV(logistic_pipeline, param_grid)

clf.fit(review_train['review'], y_train)

KeyError: "None of [['review']] are in the [index]"