# Downloading dataset and saving to CSV file

In [1]:
%cd YOUR_TRAINING_DIRECTORY

[Errno 2] No such file or directory: 'YOUR_TRAINING_DIRECTORY'
/home/hafiz031


In [11]:
import pyprind
import pandas as pd
import os

pbar = pyprind.ProgBar(50000) # iteration = 50000 the number of docs we are going to read

labels = {'pos': 1, 'neg': 0}

df = pd.DataFrame()

for s in ('test', 'train'):
	for l in ('pos', 'neg'):
		path = './aclImdb/%s/%s' % (s, l)
		for file in os.listdir(path):
			with open(os.path.join(path, file), 'r') as infile:
				txt = infile.read()
			df = df.append([[txt, labels[l]]], ignore_index = True)
			pbar.update()

df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:08


In [115]:
import numpy as np

np.random.seed(10)

df = df.reindex(np.random.permutation(df.index))
# saving the file to disk
df.to_csv('./movie_data.csv', index = False) 

df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,it s such a shame that because of it s title t...,1
1,i regret every single second of the time i los...,0
2,from hardly alien sounding lasers to an elemen...,0


# Introducing the Bag of Words Model

Bag of words allows us to represent texts as numerical feature vectors.

In [116]:
# Transforming words into feature vector

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# vectorizer is unigram by default
# to make it say bigram initialize if with ngram_range = (2, 2)
count = CountVectorizer()

docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])

# by calling fit_transform() on CountVectorizer() object we constructed the vocabulary of the bag-of-words model (fit).
# and again transformed the sentences in docs into sparse feature vectors (transform).
bag = count.fit_transform(docs)

In [117]:
# here we just have created unigram or 1-gram where each item in the vocabulary represents a single word sequence 
# to view which word occured how many times 
# we see that while creating vocabulary all the words are by default converted to lowercase and punctuations removed
count.vocabulary_

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}

In [118]:
count.get_feature_names() # these names are lexicographically sorted

['and', 'is', 'shining', 'sun', 'sweet', 'the', 'weather']

In [119]:
# just representing the frequency of each word in each sentence
bag.toarray()

array([[0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1],
       [1, 2, 1, 1, 1, 2, 1]])

# Data Cleaning

Before we build our bag-of-words model we have to clean our text data to strip off all unwanted characters.

In [120]:
# let's display the first 50 characters from the first document in the reshuffled movie review
# we probably see many unwanted characters here

df.loc[0, 'review'][-50:]

'go see this film if anything it ll make you smile '

In [121]:
# cleaning

import re

def preprocessor(text):
    # we are trying to remove entire HTML markup
    text = re.sub(r'<[^>]*>', '', text) 
    # we found all emoticons and saved temporarily in a variable
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) 
    # now removing all non words
    # also converting the text to lowercase
    # finally adding the temporarily stored emoticons
    # emoticons will be joined keeping spaces (' ' before .join() means that)  
    # additionally removing the nose character in emoticon
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    
    return text

In [122]:
preprocessor(df.loc[0, 'review'][-50:])

'go see this film if anything it ll make you smile '

In [123]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [125]:
# let's apply the preprocessing to all movie reviews
df['review'] = df['review'].apply(preprocessor)

In [126]:
df['review']

0        it s such a shame that because of it s title t...
1        i regret every single second of the time i los...
2        from hardly alien sounding lasers to an elemen...
3        conventional and superficial claude s portraya...
4        the acting is excellent in this film with some...
                               ...                        
49995    this is not a good movie too preachy in parts ...
49996     fat girls is among the worst films within the...
49997    i rented this thinking it would be pretty good...
49998    john travolta was excellent as michael in the ...
49999    despite reading the initial comments from some...
Name: review, Length: 50000, dtype: object

# Processing Documents into Tokens

Tokenization means splitting the text corpora to individual elements. One way to tokenize documents is to split them into individual words by splitting the cleaned document at its whitespace characters.

In [131]:
def tokenizer(text):
    return text.split()

In [132]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

Another useful technique of tokenization is word stemming. The original algorithm for word stemming is called as the Porter stemmer. Porter stemmer may be the oldest and the simpliest stemming algorithm. Other popular algorithms are the newer Snowball Stemmer (Porter2 or English stemmer) or the Lancaster stemmer (Piace-Husk stemmer), which are faster but also more agressive than the Porter stemmer. All of them are available in the NLTK package.

In [141]:
!pip install nltk



In [142]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [140]:
# stemming can create non-real words...for example thus->thu (it just removed s assuming it is a plural form)
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

While stemming creates unreal word. Lemmatization creates canonical (grammatically correct) forms of individual words- the so called lemmas. But lemmatization is computationally more difficult and expensive compared to stemming and practically it has been observed that stemming and lemmatization have little impact on the performance of text classification.

Another useful operation is to remove the stop-words. Stop-words are the words which are extremely common in all sorts of texts and likely bear no (or only little) useful information that can be used to distinguish between different classes of documents. Examples of stop-words are is, and, has, and, the, like.

Removing stop-words can be useful if we are working with raw or normalized term frequencies rather than tf-idfs (term frequency-inverse document frequency) , which are already downweighting frequently occurring words.

In [145]:
# to remove stop-words from the movie reviews, we will use the set of 127 English stop-words that is available from
# the NLTK library.

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [150]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [155]:
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

# Training a Logistic Regression Model for Document Classification

In [159]:
X_train = df.loc[:25000, 'review'].values # values attribute will convert the contents in df column to list
y_train = df.loc[:25000, 'sentiment'].values

X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [162]:
# we will be using GridSearchCV to find the optimal set of parameters for our logistic regression model using
# 5-fold stratified cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents = None,
                       lowercase = False,
                       preprocessor = None)

param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0]
    },
    
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0]
    }
]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state = 0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring = 'accuracy',
                          cv = 5, verbose = 1, 
                          n_jobs = -1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 23.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 114.0min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 144.4min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd",...
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
           

In [172]:
# showing the best parameters
print('Best parameter set: %s' % gs_lr_tfidf.best_params_)

with open('best_parameters.txt', 'w') as f:
    f.write("Best parameters: \n" + str(gs_lr_tfidf.best_params_) + "\n\n")
    f.write("Best score (CV accuracy): \n" + str(gs_lr_tfidf.best_score_) + "\n\n")
    f.write("Best index: \n" + str(gs_lr_tfidf.best_index_) + "\n\n")

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f3e5ee08320>}


In [169]:
# serialize the model
import pickle
print("[INFO] saving Model...")
f = open('sentiment.model', "wb")
f.write(pickle.dumps(gs_lr_tfidf.best_estimator_))
f.close()
print("[INFO] Model saved!")

[INFO] saving Model...
[INFO] Model saved!


In [176]:
# Evaluation
print('CV accuracy: %.3f' % gs_lr_tfidf.best_score_)

# clf = gs_lr_tfidf.best_estimator_

# or load the saved model
clf = pickle.load(open("sentiment.model", "rb"))

print("Test accuracy: %.3f" % clf.score(X_test, y_test))

CV accuracy: 0.895
Test accuracy: 0.900


In [177]:
# serialize the model
import pickle
print("[INFO] saving Model...")
f = open('sentiment.model', "wb")
f.write(pickle.dumps(gs_lr_tfidf))
f.close()
print("[INFO] Model saved!")

[INFO] saving Model...
[INFO] Model saved!
