Name : Khemraj Gupta

Reg. No. : 20MAI0079

Logistic Regression to Classify Text Documents

In [44]:
# Classify movie review
#1. stemming - tokenize or segment
#2. vectorising

#sentimental analysis - "Movie was exceptionally good" 

In [45]:
#1 Importing the Dataset
import pandas as pd

data = pd.read_csv("data/IMDB Dataset.csv")

data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [46]:
data["review"][10]

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"'

In [47]:
#2 Data Preparation - Regular Expression

In [48]:
#3.1 Stemming the documents --------------- PorterStemmer
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()   

def stemmer_tokenize(text)  :
    return[porter.stem(word) for word in text.split()]

In [49]:
stemmer_tokenize("we love coding and we keep coding and learning")

['we', 'love', 'code', 'and', 'we', 'keep', 'code', 'and', 'learn']

In [50]:
stemmer_tokenize(data["review"][10])[:30]

['phil',
 'the',
 'alien',
 'is',
 'one',
 'of',
 'those',
 'quirki',
 'film',
 'where',
 'the',
 'humour',
 'is',
 'base',
 'around',
 'the',
 'odd',
 'of',
 'everyth',
 'rather',
 'than',
 'actual',
 'punchlines.<br',
 '/><br',
 '/>at',
 'first',
 'it',
 'wa',
 'veri',
 'odd']

In [51]:
#3.2 Stemming the documents --------------- LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer
lancaster = LancasterStemmer()

def lancaster_tokenize(text)  :
    return[lancaster.stem(word) for word in text.split()]

In [52]:
lancaster_tokenize("we love coding and we keep coding and learning")

['we', 'lov', 'cod', 'and', 'we', 'keep', 'cod', 'and', 'learn']

In [53]:
lancaster_tokenize(data["review"][10])[:30]

['phil',
 'the',
 'aly',
 'is',
 'on',
 'of',
 'thos',
 'quirky',
 'film',
 'wher',
 'the',
 'humo',
 'is',
 'bas',
 'around',
 'the',
 'od',
 'of',
 'everyth',
 'rath',
 'than',
 'act',
 'punchlines.<br',
 '/><br',
 '/>at',
 'first',
 'it',
 'was',
 'very',
 'od']

In [54]:
#3.3 Stemming the documents ---------------- SnowballStemmer
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer("english")

def snowball_tokenize(text) :
    return[snowball.stem(word) for word in text.split()]

In [55]:
snowball_tokenize("we love coding and we keep coding and learning")

['we', 'love', 'code', 'and', 'we', 'keep', 'code', 'and', 'learn']

In [56]:
#4 Vectorization of documents
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents = None, lowercase = False, tokenizer = stemmer_tokenize,
                        use_idf = True, norm="l2", smooth_idf = True)
y = data.sentiment.values
X = tfidf.fit_transform(data.review)

In [57]:
y

array(['positive', 'positive', 'positive', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.3, shuffle = False)

import pickle

from sklearn.linear_model import LogisticRegressionCV

LogitCV = LogisticRegressionCV (cv =5, scoring = 'accuracy', max_iter = 350).fit(X_train, y_train)

saved_model = open('saved_model_1.sav','wb')

pickle.dump(LogitCV, saved_model)

saved_model.close()

In [59]:
#5 Run the Saved Model
filename = "saved_model_1.sav"

saved_logCV = pickle.load(open(filename, "rb"))

saved_logCV.score(X_test, y_test)

0.8945333333333333

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid  = TfidfVectorizer(strip_accents = None, lowercase = False, tokenizer = lancaster_tokenize,
                        use_idf = True, norm = "l2", smooth_idf = True)
X1 = tfid.fit_transform(data.review)

In [63]:
#4 Vectorizing the text data 

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(strip_accents = None, lowercase = False, tokenizer = stemmer_tokenize)
y = data.sentiment.values
X = count.fit_transform(data.review)

# Create a vectorizer object
vectorizer = CountVectorizer()

vectorizer.fit(y)

# Printing the identified unique words along with their indices
print("Vocabulary : ", vectorizer.vocabulary_)

# Encode the document
vector = vectorizer.transform(y)

# Summarizing the Encoded Texts
print("Encoded Document is : ")
print(vector.toarray())

Vocabulary :  {'positive': 1, 'negative': 0}
Encoded Document is : 
[[0 1]
 [0 1]
 [0 1]
 ...
 [1 0]
 [1 0]
 [1 0]]
