In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
mail = pd.read_csv("C:/Users/Sobhan/Machine Learning/NLP/bag_of_words/spam_mail.csv")

In [3]:
mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
mail.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
mail["spam"] = mail["Category"].apply(lambda x: 1 if x == "spam" else 0)

In [6]:
mail.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
mail.shape

(5572, 3)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(mail.Message, mail.spam, test_size = 0.2)

In [9]:
X_train.shape, X_test.shape

((4457,), (1115,))

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:5]

2389    wiskey Brandy Rum Gin Beer Vodka Scotch Shampa...
5199           Ugh my leg hurts. Musta overdid it on mon.
3370    Sorry i've not gone to that place. I.ll do so ...
4585    Noooooooo please. Last thing I need is stress....
3564    Auction round 4. The highest bid is now £54. N...
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:5]

2389    0
5199    0
3370    0
4585    0
3564    1
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<p>Bag Of words</p>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer()


In [17]:
X_train_cv = cv.fit_transform(X_train.values)
X_train_cv

<4457x7740 sparse matrix of type '<class 'numpy.int64'>'
	with 59067 stored elements in Compressed Sparse Row format>

In [18]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
X_train_cv.shape

(4457, 7740)

In [20]:
cv.vocabulary_

{'wiskey': 7544,
 'brandy': 1484,
 'rum': 5854,
 'gin': 3120,
 'beer': 1292,
 'vodka': 7313,
 'scotch': 5957,
 'shampain': 6069,
 'wine': 7528,
 'kudi': 3967,
 'yarasu': 7669,
 'dhina': 2275,
 'vaazhthukkal': 7232,
 'ugh': 7121,
 'my': 4666,
 'leg': 4056,
 'hurts': 3552,
 'musta': 4659,
 'overdid': 5026,
 'it': 3744,
 'on': 4934,
 'mon': 4560,
 'sorry': 6335,
 've': 7255,
 'not': 4827,
 'gone': 3160,
 'to': 6942,
 'that': 6814,
 'place': 5227,
 'll': 4148,
 'do': 2357,
 'so': 6285,
 'tomorrow': 6967,
 'really': 5606,
 'noooooooo': 4811,
 'please': 5249,
 'last': 4010,
 'thing': 6849,
 'need': 4725,
 'is': 3732,
 'stress': 6521,
 'for': 2924,
 'once': 4937,
 'in': 3627,
 'your': 7707,
 'life': 4090,
 'be': 1268,
 'fair': 2741,
 'auction': 1134,
 'round': 5827,
 'the': 6818,
 'highest': 3414,
 'bid': 1346,
 'now': 4838,
 '54': 556,
 'next': 4764,
 'maximum': 4385,
 '71': 613,
 'send': 6017,
 'bids': 1347,
 '10': 254,
 '83383': 669,
 'good': 3164,
 'luck': 4241,
 'or': 4973,
 'remind': 56

In [21]:
cv.get_feature_names()[6814]

'that'

In [22]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
np.where(X_train_np[0]!= 0)

(array([1292, 1484, 2275, 3120, 3967, 5854, 5957, 6069, 7232, 7313, 7528,
        7544, 7669], dtype=int64),)

In [25]:
X_train[:5][3370]

"Sorry i've not gone to that place. I.ll do so tomorrow. Really sorry."

In [26]:
X_train_np[0][6170]

0

Naive Bayes

In [27]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

In [28]:
model.fit(X_train_cv, y_train)

MultinomialNB()

In [29]:
X_test_cv = cv.transform(X_test)

In [30]:
y_pred = model.predict(X_test_cv)

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       962
           1       0.98      0.92      0.95       153

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [32]:
emails = [
    "How's it going? Got any exciting karaoke type activities planned? I'm debating whether to play football this eve. Feeling lazy though."
]

emails_count = cv.transform(emails)
model.predict(emails_count)

array([0], dtype=int64)

In [33]:
#using sklearn pipeline and reduce number of lines of code

from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB())
])

In [34]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [35]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       962
           1       0.98      0.92      0.95       153

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



Exercises

1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
2. print the shape of the data
3. print top 5 datapoints

#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative.
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
#Do the 'train-test' splitting with test size of 20%

Exercise 1

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.

*use CountVectorizer for pre-processing the text.

*use Random Forest as the classifier with estimators as 50 and criterion as entropy.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution1

In [36]:
#import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [37]:
#1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
df = pd.read_csv("movies_sentiment_data.csv")

In [38]:
#2. print the shape of the data
print(df.shape)

(50000, 2)


In [39]:
#3. print top 5 datapoints
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [40]:
#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative
df["Category"] = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)

In [41]:
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
df["Category"].value_counts()

1    25000
0    25000
Name: Category, dtype: int64

In [42]:
#Do the 'train-test' splitting with test size of 20%
X_train, X_test, y_train, y_test = train_test_split(df.review, df.Category, test_size = 0.2)

Solution Exercise 1

In [43]:
#1. create a pipeline object

clf = Pipeline([
    ("vectorizer", CountVectorizer()), #initializing the vectorizer
    ("random_forest", (RandomForestClassifier(n_estimators = 50, criterion = "entropy"))) #using the RandomForest classifier
])

In [44]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('random_forest',
                 RandomForestClassifier(criterion='entropy', n_estimators=50))])

In [45]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [46]:
#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      5023
           1       0.84      0.84      0.84      4977

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



Exercise 2

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

*use CountVectorizer for pre-processing the text.

*use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution Exercise 2

In [47]:
#1. create a pipeline object
clf = Pipeline([
            ("vectorizer", CountVectorizer()),   
            ("KNN", (KNeighborsClassifier(n_neighbors = 10, metric = "euclidean")))   #using the KNN classifier with 10 neighbors 
])

In [48]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('KNN',
                 KNeighborsClassifier(metric='euclidean', n_neighbors=10))])

In [49]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [50]:
#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.65      0.66      5023
           1       0.66      0.68      0.67      4977

    accuracy                           0.66     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.66      0.66      0.66     10000



Exercise 3

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

*use CountVectorizer for pre-processing the text.

*use Multinomial Naive Bayes as the classifier.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution Exercise 3

In [51]:
#1. create a pipeline object
clf = Pipeline([
            ("vectorizer", CountVectorizer()),   
            ("Multi NB", MultinomialNB())   #using the Multinomial Naive Bayes classifier 
])

In [52]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('Multi NB', MultinomialNB())])

In [53]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [54]:
#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      5023
           1       0.86      0.83      0.84      4977

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

