# Introduction to text classification project

In [2]:
#Load the libraries

import pandas as pd
import numpy as np

In [3]:
# Import the tsv file 
df = pd.read_csv('../TextFiles/moviereviews.tsv' , sep = '\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
# check the length of the file

len(df)

2000

In [5]:
# check is there any null values or not

df.isnull().sum()

label      0
review    35
dtype: int64

In [6]:
# Drop out the null values

df.dropna(inplace = True)
len(df)

1965

# Detect and review empty strings

In [7]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [8]:
# check if there are any empty strings in the review column
blanks = []

for i , lb , rv in df.itertuples():
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)
            
print(len(blanks) , 'blanks' , blanks)

27 blanks [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [9]:
# Drop out all those index values having empty strings
df.drop(blanks , inplace=True)
len(df)

1938

In [12]:
# check for the label column
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [13]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


# Split the data into training and test set

In [14]:
# Now splittng the data into training and test set
from sklearn.model_selection import train_test_split

np.random.seed(0)

x = df['review']
y = df['label']

x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2)

# Build pipelines to vectorize the data then train and fit the model

In [15]:
# Import all the required pipelines and models

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Naive Bayers pipeline
model_nb = Pipeline([('tfidf' , TfidfVectorizer()) , ('model' , MultinomialNB())])

# Support vector pipeline
model_svm = Pipeline([('tfidf' , TfidfVectorizer()) , ('model' , SVC())])

# Random forest classifier pipeline
model_forest = Pipeline([('tfidf' , TfidfVectorizer()) , ('model' , RandomForestClassifier())])

In [16]:
# Fit the naive bayers model
model_nb.fit(x_train , y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [17]:
# Make an instance for predictions
y_preds = model_nb.predict(x_test)

In [18]:
# check for the accuracy score
from sklearn.metrics import accuracy_score

In [19]:
print(accuracy_score(y_test , y_preds))

0.7731958762886598


In [20]:
# Fit the support vector machine model and get the accuracy score
model_svm.fit(x_train , y_train)

y_preds = model_svm.predict(y_test)

print(accuracy_score(y_test , y_preds))

0.48195876288659795


In [21]:
# Fit the random forest classifier model and check for the accuray score
model_forest.fit(x_train , y_train)

y_preds = model_forest.predict(x_test)

print(accuracy_score(y_test , y_preds))

0.8015463917525774


# Playing with stop_words

In [22]:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

In [23]:
# Import the required libraries and import the tsv file
import pandas as pd
import numpy as np

df = pd.read_csv('../TextFiles/moviereviews.tsv' , sep = '\t')
df.isnull().sum()

label      0
review    35
dtype: int64

In [24]:
# Drop out the null values
df.dropna(inplace = True)

In [25]:
# check the length of the file
len(df)

1965

In [26]:
# check for the blank reviews

blanks = []

for i , lb , rv in df.itertuples():
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)
            
print(len(blanks) , 'blanks' , blanks)

27 blanks [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [27]:
# Drop out all those values having the empty strings
df.drop(blanks , inplace = True)

In [28]:
# Again check the length of file
len(df)

1938

In [29]:
# Now split the data into training and test set including the random seed
from sklearn.model_selection import train_test_split

np.random.seed(0)

x = df['review']
y = df['label']

x_train , x_test , y_train , y_test  = train_test_split(x , y , test_size = 0.2)

In [30]:
# Import all the required pipeline and metries
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics import accuracy_score , confusion_matrix  , classification_report

model = Pipeline([('tfidf' , TfidfVectorizer(stop_words=stopwords)) , ('model' , RandomForestClassifier())])

model.fit(x_train , y_train)
y_preds = model.predict(x_test)
print(accuracy_score(y_test , y_preds))

0.7912371134020618


In [31]:
# starting from the base

import pandas as pd
import numpy as np

In [32]:
df = pd.read_csv('../TextFiles/moviereviews.tsv' , sep = '\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [33]:
len(df)

2000

In [34]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [35]:
df.dropna(inplace=True)

In [36]:
len(df)

1965

In [37]:
blanks = []

for i , lb , rv in df.itertuples():
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)
            
print(len(blanks) , 'blanks' , blanks)

27 blanks [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [38]:
df.drop(blanks , inplace = True)

In [39]:
len(df)

1938

In [40]:
from sklearn.model_selection import train_test_split

np.random.seed(0)

x= df['review']
y= df['label']

x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2)

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [42]:
model_nb = Pipeline([('tfidf' , TfidfVectorizer(stop_words=stopwords)) , ('model' , MultinomialNB())])

model_svm = Pipeline([('tfidf' , TfidfVectorizer(stop_words=stopwords)) , ('model' , SVC())])


In [43]:
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
model_nb.fit(x_train , y_train)

y_preds = model_nb.predict(x_test)
print(accuracy_score(y_test , y_preds))

0.8015463917525774


In [44]:
model_svm.fit(x_train , y_train)

y_preds = model_svm.predict(x_test)

print(accuracy_score(y_test , y_preds))

0.845360824742268


### here we see that the accuracy of the support vector machine is incresed by removing the stop words

In [45]:
# Check for the confusion metrics

print(confusion_matrix(y_test , y_preds))

[[156  31]
 [ 29 172]]


In [46]:
print(classification_report(y_test , y_preds))

              precision    recall  f1-score   support

         neg       0.84      0.83      0.84       187
         pos       0.85      0.86      0.85       201

    accuracy                           0.85       388
   macro avg       0.85      0.84      0.85       388
weighted avg       0.85      0.85      0.85       388



# Now its time to check for the random reviews taken

In [58]:
myreview1 = "I love this movie very much. I enjoyed it alot"

In [59]:
print(model_svm.predict([myreview1]))

['pos']


In [60]:
myreview2 = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."

In [61]:
print(model_svm.predict([myreview2]))

['neg']


In the above two examples we see that one review of a movie is positive and the other one is negetive. and we predict it on Support vector machine