In [1]:
import os
import math
import datetime
from mlsettings.settings import load_app_config, get_datafolder_path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline 
from numpy import set_printoptions
set_printoptions(precision=4)

pd.set_option('display.width', 200)
pd.set_option('precision', 4)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")


In [2]:
load_app_config()
input_path  = get_datafolder_path()
input_directory = "aclimdb"
input_file ="movie_data.csv"

train_file = os.path.join(input_path, input_directory, input_file)
print(train_file)

Adding D:\DataSource  to system path
Adding D:\MachineLearning  to system path
D:\DataSource\aclimdb\movie_data.csv


In [3]:
dataset = pd.read_csv(train_file, encoding='utf-8')
dataset.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [4]:
dataset.shape

(50000, 2)

In [5]:
dataset.loc[0,'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [6]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [7]:
preprocessor(dataset.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [8]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [9]:
dataset['review'] = dataset['review'].apply(preprocessor)

In [10]:
from nltk.stem.porter  import PorterStemmer

def tokenizer(text):
    return text.split()

porter  = PorterStemmer()
def tokenizer_porter(text):
    return [ porter.stem(word) for word in text.split()]


In [11]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [12]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [46]:
X_train = dataset.loc[:10000,'review'].values
y_train = dataset.loc[:10000 ,'sentiment'].values
X_test  = dataset.loc[10000:20000,'review'].values
y_test = dataset.loc[10000:20000,'sentiment'].values

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [48]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [56]:
#gs_lr_tfidf.fit(X_train, y_train)
#print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
my_stopwords = set(ENGLISH_STOP_WORDS)
my_stopwords.remove("well")
my_stopwords.remove("not")
my_stopwords.add("ve")

#tfidf = TfidfVectorizer(strip_accents=None,lowercase=False, preprocessor=None,
                        #tokenizer = tokenizer_porter,ngram_range =(1,3) ,min_df=4,stop_words =my_stopwords)

tfidf = TfidfVectorizer(ngram_range =(1,3) ,min_df=4,stop_words =my_stopwords)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)


In [58]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(random_state =0)
clf.fit(X_train_tf,y_train)
print(clf) 
y_pred = clf.predict(X_test_tf)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)


In [57]:
print(y_pred.shape)
print(y_train.shape)

(10001,)
(10001,)


In [55]:
clf.score(X_test_tf, y_train)

0.503949605039496