In [65]:
# 1. Import libraries

import matplotlib.pyplot as plt
import nltk 
import numpy as np
import pandas as pd 
import seaborn as sns

from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# 2. Read data

In [61]:
df2 = pd.read_csv("SMSSpamCollection.csv", header=None, sep="\t", names=["target", "text"])
df2

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [24]:
df = pd.read_csv("SMSSpamCollection.csv", sep='delimiter', header=None)
df[['spam', 'text']] = df[0].str.split('\t', expand=True)
df = df.drop(columns=0)
df = df.replace({"ham": 0, "spam": 1})
df

  df = pd.read_csv("SMSSpamCollection.csv", sep='delimiter', header=None)
  df = df.replace({"ham": 0, "spam": 1})


Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will ü b going to esplanade fr home?
5571,0,"Pity, * was in mood for that. So...any other s..."
5572,0,The guy did some bitching but I acted like i'd...


In [3]:
# 3. Split data into training and test sets

In [27]:
X = df["text"]
y = df["spam"]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# 4. Preprocess and vectorize data

In [None]:
X_df = pd.DataFrame(X)

In [None]:
count = CountVectorizer(stop_words='english')
count_vector = count.fit_transform(X_df['text'])

In [41]:
count_vector.shape

(4459, 7463)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(count_vector, y, test_size=0.2, random_state=42)

What else do we do, what do we do with the count matrix?

In [5]:
# 5. Train model

MultinomialNB

In [51]:
model_multi = MultinomialNB()
model_multi.fit(X_train, y_train)

In [52]:
y_pred_multi = model_multi.predict(X_test)

BernoulliNB

In [56]:
model_bern = BernoulliNB()
model_bern.fit(X_train, y_train)

In [57]:
y_pred_bern = model_bern.predict(X_test)

In [54]:
# 6. Evaluate model

In [58]:
print("MultinomialNB")
print("Accuracy:", accuracy_score(y_pred_multi, y_test))

print("\nBernoulliNB")
print("Accuracy:", accuracy_score(y_pred_bern, y_test))

MultinomialNB
Accuracy: 0.97847533632287

BernoulliNB
Accuracy: 0.979372197309417


Kaggle tweets

In [64]:
#2. Read the data

In [3]:
df_tweets = pd.read_csv("kaggle_tweet_sentiments.csv")
df_tweets

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [15]:
df_tweets.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [16]:
df_tweets = df_tweets.dropna()

In [63]:
# 3. Split data into training and test sets

In [17]:
X = df_tweets[["text", "selected_text"]]
y = df_tweets["sentiment"]

In [None]:
count = CountVectorizer(stop_words='english') # there are more params, should dp gridSearch
count_vector = count.fit_transform(X["selected_text"])

In [56]:
count_vector.shape

(27480, 17338)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(count_vector, y, test_size=0.2, random_state=42)

In [58]:
# 5. Train model

MultinomialNB

In [59]:
model_multi = MultinomialNB()
model_multi.fit(X_train, y_train)

In [60]:
y_pred_multi = model_multi.predict(X_test)

BernoulliNB

In [61]:
model_bern = BernoulliNB()
model_bern.fit(X_train, y_train)

In [62]:
y_pred_bern = model_bern.predict(X_test)

In [63]:
# 6. Evaluate model

In [64]:
print("MultinomialNB")
print("Accuracy:", accuracy_score(y_pred_multi, y_test))

print("\nBernoulliNB")
print("Accuracy:", accuracy_score(y_pred_bern, y_test))

MultinomialNB
Accuracy: 0.75382096069869

BernoulliNB
Accuracy: 0.7518195050946143


In [66]:
# Do tfidf

In [69]:
df2 = pd.read_csv("SMSSpamCollection.csv", header=None, sep="\t", names=["target", "text"])
df2 = df2.replace({"ham": 0, "spam": 1})
df2

  df2 = df2.replace({"ham": 0, "spam": 1})


Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [71]:
X = df2["text"]
y = df2["target"]

In [70]:
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df2["text"])

In [72]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

In [82]:
model_multi = MultinomialNB()
model_multi.fit(X_train, y_train)

In [83]:
y_pred_multi = model_multi.predict(X_test)

In [84]:
model_bern = BernoulliNB()
model_bern.fit(X_train, y_train)

In [85]:
y_pred_bern = model_bern.predict(X_test)

In [86]:
print("MultinomialNB")
print("Accuracy:", accuracy_score(y_pred_multi, y_test))

print("\nBernoulliNB")
print("Accuracy:", accuracy_score(y_pred_bern, y_test))

MultinomialNB
Accuracy: 0.979372197309417

BernoulliNB
Accuracy: 0.979372197309417
