In [4]:
# Josh Shell
# Apply BernoulliNB on tweets csv
# Compared with others including binarized MNB


import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb
from sklearn.metrics import accuracy_score

# --- Configuration -----------------------------------------------------------
# Set to an int to get the same train/test split (and therefore the same
# accuracies) on every run. None keeps a fresh random split per run.
RANDOM_STATE = None

# Tokenisation options shared by every vectorizer compared below.
VECTORIZER_OPTIONS = dict(min_df=10, ngram_range=(1, 2), stop_words="english")

# --- Load data ---------------------------------------------------------------
# Feature: the tweet text. Label: the calendar year taken from the 'date' column.
df = pd.read_csv('tweets_01-08-2021.csv')
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year

# 80/20 split, stratified so each year keeps its share in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'],
    df['year'],
    test_size=0.2,
    stratify=df['year'],
    random_state=RANDOM_STATE,
)

print('Size of Training Data ', X_train.shape[0])
print('Size of Test Data ', X_test.shape[0])

# --- Compare feature extractors x Naive Bayes variants -----------------------
# (label used in the printed report, unfitted vectorizer)
vectorizers = [
    ('CountVectorizer',
     text.CountVectorizer(binary=True, **VECTORIZER_OPTIONS)),
    ('without binarizing CountVectorizer',
     text.CountVectorizer(binary=False, **VECTORIZER_OPTIONS)),
    ('TfidfVectorizer',
     text.TfidfVectorizer(**VECTORIZER_OPTIONS)),
]

# NOTE: BernoulliNB thresholds its input at 0 by default (binarize=0.0), so it
# is expected to score the same on all three feature sets; only MultinomialNB
# and ComplementNB actually see the count / tf-idf magnitudes.
model_classes = [nb.BernoulliNB, nb.MultinomialNB, nb.ComplementNB]

for vec_label, countv in vectorizers:
    # Learn the vocabulary on the training split only, then reuse it for test.
    X_train_tf = countv.fit_transform(X_train)
    X_test_tf = countv.transform(X_test)

    for model_cls in model_classes:
        model = model_cls()
        model.fit(X_train_tf, Y_train)
        Y_pred = model.predict(X_test_tf)
        print(f'Accuracy Score for {model_cls.__name__} using {vec_label} - ',
              accuracy_score(Y_test, Y_pred))

print("2nd run")

Size of Training Data  45256
Size of Test Data  11315
Accuracy Score for BernoulliNB using CountVectorizer -  0.5612903225806452
Accuracy Score for MultinomialNB using CountVectorizer -  0.5793194874060981
Accuracy Score for ComplementNB using CountVectorizer -  0.5764030048608042
Accuracy Score for BernoulliNB using without binarizing CountVectorizer -  0.5612903225806452
Accuracy Score for MultinomialNB using without binarizing CountVectorizer -  0.5790543526292532
Accuracy Score for ComplementNB using without binarizing CountVectorizer -  0.573663278833407
Accuracy Score for BernoulliNB using TfidfVectorizer -  0.5612903225806452
Accuracy Score for MultinomialNB using TfidfVectorizer -  0.5643835616438356
Accuracy Score for ComplementNB using TfidfVectorizer -  0.5730446310207689
2nd run


In [6]:
# IR16A.py CS5154/6054 cheng 2021
# Apply BernoulliNB on tweets
# Compared with others including binarized MNB
# Usage: python IR16A.py

import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb
from sklearn.metrics import accuracy_score

# --- Configuration -----------------------------------------------------------
# Set to an int to get the same train/test split (and therefore the same
# accuracies) on every run. None keeps a fresh random split per run.
RANDOM_STATE = None

# Tokenisation options shared by every vectorizer compared below.
VECTORIZER_OPTIONS = dict(min_df=10, ngram_range=(1, 2), stop_words="english")

# --- Load data ---------------------------------------------------------------
# Feature: the tweet text. Label: the calendar year taken from the 'date' column.
df = pd.read_csv('tweets_01-08-2021.csv')
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year

# 80/20 split, stratified so each year keeps its share in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'],
    df['year'],
    test_size=0.2,
    stratify=df['year'],
    random_state=RANDOM_STATE,
)

print('Size of Training Data ', X_train.shape[0])
print('Size of Test Data ', X_test.shape[0])

# --- Compare feature extractors x Naive Bayes variants -----------------------
# (label used in the printed report, unfitted vectorizer)
vectorizers = [
    ('CountVectorizer',
     text.CountVectorizer(binary=True, **VECTORIZER_OPTIONS)),
    ('without binarizing CountVectorizer',
     text.CountVectorizer(binary=False, **VECTORIZER_OPTIONS)),
    ('TfidfVectorizer',
     text.TfidfVectorizer(**VECTORIZER_OPTIONS)),
]

# NOTE: BernoulliNB thresholds its input at 0 by default (binarize=0.0), so it
# is expected to score the same on all three feature sets; only MultinomialNB
# and ComplementNB actually see the count / tf-idf magnitudes.
model_classes = [nb.BernoulliNB, nb.MultinomialNB, nb.ComplementNB]

for vec_label, countv in vectorizers:
    # Learn the vocabulary on the training split only, then reuse it for test.
    X_train_tf = countv.fit_transform(X_train)
    X_test_tf = countv.transform(X_test)

    for model_cls in model_classes:
        model = model_cls()
        model.fit(X_train_tf, Y_train)
        Y_pred = model.predict(X_test_tf)
        print(f'Accuracy Score for {model_cls.__name__} using {vec_label} - ',
              accuracy_score(Y_test, Y_pred))

print("3rd run")

Size of Training Data  45256
Size of Test Data  11315
Accuracy Score for BernoulliNB using CountVectorizer -  0.5497127706584181
Accuracy Score for MultinomialNB using CountVectorizer -  0.5675651789659744
Accuracy Score for ComplementNB using CountVectorizer -  0.5679186920017676
Accuracy Score for BernoulliNB using without binarizing CountVectorizer -  0.5497127706584181
Accuracy Score for MultinomialNB using without binarizing CountVectorizer -  0.5696862571807335
Accuracy Score for ComplementNB using without binarizing CountVectorizer -  0.5656208572691118
Accuracy Score for BernoulliNB using TfidfVectorizer -  0.5497127706584181
Accuracy Score for MultinomialNB using TfidfVectorizer -  0.5558992487847989
Accuracy Score for ComplementNB using TfidfVectorizer -  0.5680070702607158
3rd run
