In [1]:
#Machine Learning on Sentiment Analysis
import pyprind
import pandas as pd
import os
import sys



In [2]:
basepath = '/home/pranav/Documents/ScikitAndPytorch/aclImdb'

In [3]:
labels = {'pos':1, 'neg':0} 

pbar = pyprind.ProgBar(50000, stream=sys.stdout)

df = pd.DataFrame()

dataFrameList = []

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        
        for file in sorted(os.listdir(path)):
            infile = open(os.path.join(path, file))
            txt = infile.read()
            dataFrameList.append([txt, labels[l]])
            
            pbar.update()



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [4]:
df = pd.DataFrame(data=dataFrameList, columns=['Review', 'Sentiment'])

In [5]:
df

Unnamed: 0,Review,Sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
49995,"Towards the end of the movie, I felt it was to...",0
49996,This is the kind of movie that my enemies cont...,0
49997,I saw 'Descent' last night at the Stockholm Fi...,0
49998,Some films that you pick up for a pound turn o...,0


In [6]:
import numpy as np
np.random.seed(0)

df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding = 'utf-8')

In [7]:
df.head()

Unnamed: 0,Review,Sentiment
11841,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
19602,OK... so... I really like Kris Kristofferson a...,0
45519,"***SPOILER*** Do not read this, if you think a...",0
25747,hi for all the people who have seen this wonde...,1
42642,"I recently bought the DVD, forgetting just how...",0


In [8]:
#Bag of words model -> converts texts/words into numerical form.
# We create a vocab of unique tokens, from the entire set of documents
# Contruct a feature vector from each document that contains how often each word/unique tokens occur in the
# Particular dataset.

In [9]:
# Example of Bag of Words

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()

# Our example dataset
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet,'
                 'and one and one is two'])

bag = count.fit_transform(docs)

print(count.vocabulary_)


{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [10]:
# Frequency matrix of the words
type((bag.toarray()))

print(pd.DataFrame(data=bag.toarray(), columns=count.get_feature_names_out()))

   and  is  one  shining  sun  sweet  the  two  weather
0    0   1    0        1    1      0    1    0        0
1    0   1    0        0    0      1    1    0        1
2    2   3    2        1    1      1    2    1        1


In [11]:
#Word relevancy -> some words are used to describe both the classes. Those words are kinda useless, 
# becuase they don't give us information to act upon

# Term Frequency-inverse document free (TF-idf) -> technique used to downplay those words
# TF -> term frequency matrix
# idf -> inverse frequency matrix


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, norm = 'l2', smooth_idf=True)

np.set_printoptions(precision=2)

print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [13]:
#Cleaning our dataset

import re

def preprocessor(text):
    #Remove Html
    text = re.sub('<[^>]*>', '', text)
    
    #Finding Emoticons
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    
    #Removing non-word characters. Converted text into lower case. Adding back the emoticons to the
    #end of the list
    text = (re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', ''))
    
    return text
    
    


In [14]:
preprocessor(df.loc[0, 'Review'][-50:])


'and i suggest that you go see it before you judge '

In [15]:
df['Review'] = df['Review'].apply(preprocessor)

In [31]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
#porter.stem -> converts word into its root

def tokenizer(text):
    return text.split()

#for every review, this function returns a list of words, where each word is converted into its Stem
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()] #.split() changes text/str into list of words

In [32]:
#Stop Words -> words that are extremely common, and bear no useful information
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

stop = stopwords.words('english')

[w for w in tokenize_porter('a runner likes running and runs a lot') if w not in stop]

[nltk_data] Downloading package stopwords to /home/pranav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

In [42]:
X = np.array(df['Review'])
y = np.array(df['Sentiment'])

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)


In [43]:
X_train.shape

(25000,)

In [44]:
y_train.shape

(25000,)

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer #TfidfVectorizer combines the vectorizer and transformer in once

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

smallParamGrid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words':[None],
        'vect__tokenizer':[tokenizer, tokenizer_porter],
        'clf__penalty':['l2'],
        'clf__C':[1.0, 10.0]
    },
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words':[stop, None],
        'vect__tokenizer':[tokenizer],
        'vect__use_idf':[False],
        'vect__norm':[None],
        'clf__penalty':['l2'],
        'clf__C':[1.0, 10.0]
    },
]

#First Dictionary, TfidfTransformer default settings:

lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(solver = 'liblinear'))
])

gs_lr_tfidf = GridSearchCV(lr_tfidf, smallParamGrid, scoring='accuracy', cv = 5, verbose = 2, n_jobs=1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   2.8s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   2.8s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   2.7s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   2.7s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   2.8s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.2min




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.3min




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.3min




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.2min




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.1min




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   3.3s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   3.2s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   3.3s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   3.3s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>; total time=   3.3s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.1min




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.2min




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.2min




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.2min




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x7f132fc21000>; total time= 1.2min




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', '



[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', '



[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', '



[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', '



[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', '



[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   7.4s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   8.4s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   7.5s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   9.3s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   7.6s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 



[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 



[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 



[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 



[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 



[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   9.3s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=  10.2s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   9.4s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=  10.2s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7f132fc213f0>, vect__use_idf=False; total time=   8.6s




In [48]:
print(gs_lr_tfidf.best_params_)

{'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f132fc213f0>}


In [51]:
print(gs_lr_tfidf.best_score_)

0.8953599999999999


In [52]:
clf = gs_lr_tfidf.best_estimator_

clf.score(X_test, y_test)

0.89976

In [56]:
# Out of Core learning
# Work with large datasets by fitting the classifier on smaller batches

#Creating a new tokenizer function that removes stop words as well

import numpy as np
import re

from nltk.corpus import stopwords

stop = stopwords.words('english')

def newTokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    
    text = re.sub('[\W]+', " ", text.lower()) + ' '.join(emoticons).replace('-','')
    
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# reads and returns one document at a time

def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
            
#Takes a doc stream (output of stream docs) and a number of documents, determined by size parameter
def getMinibatch(docStream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(docStream)
            docs.append(text)
            y.append(label)
    
    except StopIteration:
        return None, None
    return docs, y


In [59]:
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21, 
                        preprocessor=None,
                        tokenizer=newTokenizer)

clf = SGDClassifier(loss = 'log_loss', random_state=1)
doc_stream = stream_docs(path = "movie_data.csv")

In [60]:
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])

for _ in range(45):
    X_train, y_train = getMinibatch(doc_stream, size=1000)
    
    if not X_train:
        break
    
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:17


In [61]:
X_test, y_test = getMinibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
clf.score(X_test, y_test)

0.8682

In [62]:
clf = clf.partial_fit(X_test, y_test)

In [63]:
#Topic Modelling with LDA(Latent Dirchlet Allocation)

# Assign categories to large amount of documents and dat

# Finds of groups of words that occur together
# LDA requires a bag-words as an input
# LDA decomposes into a document-to-topic matrix, and a word-to-topic matrix 
# Need to define number of topics beforehand

In [64]:
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')

df = df.rename(columns = {"0": "Review", '1': "Sentiment"})

In [65]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)

In [66]:
X = count.fit_transform(df['Review'].values)

In [68]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')

X_topics = lda.fit_transform(X)

In [69]:
n_top_words = 5
 
featureNames = count.get_feature_names_out()

for topicIndex, topic in enumerate(lda.components_):
    print(topicIndex, [featureNames[i] for i in topic.argsort()[:-n_top_words -1:-1]])

0 ['worst', 'minutes', 'awful', 'script', 'stupid']
1 ['family', 'mother', 'father', 'children', 'girl']
2 ['american', 'war', 'dvd', 'music', 'tv']
3 ['human', 'audience', 'cinema', 'art', 'sense']
4 ['police', 'guy', 'car', 'dead', 'murder']
5 ['horror', 'house', 'sex', 'girl', 'woman']
6 ['role', 'performance', 'comedy', 'actor', 'performances']
7 ['series', 'episode', 'war', 'episodes', 'tv']
8 ['book', 'version', 'original', 'read', 'novel']
9 ['action', 'fight', 'guy', 'guys', 'cool']


In [19]:
basepath2 = '/home/pranav/Documents/ScikitAndPytorch/aclImdbsample/'

In [20]:
masterList = []

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath2, s, l)
        
        for file in sorted(os.listdir(path)):
            infile = open(os.path.join(path,file))
            txt = infile.read()
            masterList.append([txt, l])
        

In [21]:
masterList[0]

["I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge.",
 'pos']

In [22]:
df_trial = pd.DataFrame(data=masterList, columns=['review', 'sentiment'])

In [23]:
df_trial['review']

0     I went and saw this movie last night after bei...
1     If you had asked me how the movie was througho...
2     I was looking forward to The Guardian, but whe...
3     I was pleasantly surprised to find this movie ...
4     My boyfriend and I went to watch The Guardian....
5     My yardstick for measuring a movie's watch-abi...
6     How many movies are there that you can think o...
7     This movie was sadly under-promoted but proved...
8     I only went to see this movie because I have a...
9     I was fortunate enough to see this movie on pr...
10    Our family (and the entire sold out sneak prev...
11    I'm a Petty Officer 1st Class (E-6) and have b...
12    I've seen this story before but my kids haven'...
13    Once again Mr. Costner has dragged out a movie...
14    Years ago, when DARLING LILI played on TV, it ...
15    Julie Andrews satirically prods her own goody-...
16    This is a pale imitation of 'Officer and a Gen...
17    It seems ever since 1982, about every two 

In [24]:
sorted(os.listdir(path))[0]

'0_3.txt'

In [25]:
infile = open(os.path.join(path, sorted(os.listdir(path))[0]), 'r', encoding='utf-8')

txt = infile.read()

In [26]:
[txt, 'pos']

["Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",
 'pos']