In [1]:
# import necessary libraries
import pandas as pd 
from sklearn.naive_bayes import MultinomialNB, GaussianNB  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# read the four per-video comment files that make up the training corpus;
# every file is indexed by its COMMENT_ID column
train_files = [
    'Youtube01-Psy.csv',
    'Youtube02-KatyPerry.csv',
    'Youtube03-LMFAO.csv',
    'Youtube04-Eminem.csv',
]
df1, df2, df3, df4 = [
    pd.read_csv(csv_name, index_col='COMMENT_ID') for csv_name in train_files
]

# stack the four frames row-wise into a single training DataFrame
df_train = pd.concat([df1, df2, df3, df4])

# preview the first five rows of the combined training frame
print(df_train.head())

                                                       AUTHOR  \
COMMENT_ID                                                      
LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU         Julius NM   
LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A       adam riyati   
LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  Evgeny Murashkin   
z13jhp0bxqncu512g22wvzkasxmvvzjaz04           ElNino Melendez   
z13fwbwp1oujthgqj04chlngpvzmtt3r3dw                    GsMega   

                                                            DATE  \
COMMENT_ID                                                         
LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU  2013-11-07T06:20:48   
LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A  2013-11-07T12:37:15   
LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  2013-11-08T17:34:21   
z13jhp0bxqncu512g22wvzkasxmvvzjaz04          2013-11-09T08:28:43   
z13fwbwp1oujthgqj04chlngpvzmtt3r3dw          2013-11-10T16:05:38   

                                                                   

In [3]:
# the fifth comment file is kept apart from training and used only for testing
test_file = 'Youtube05-Shakira.csv'
df_test = pd.read_csv(test_file, index_col='COMMENT_ID')

# preview the first five rows of the testing frame
print(df_test.head(5))

                                                                   AUTHOR  \
COMMENT_ID                                                                  
z13lgffb5w3ddx1ul22qy1wxspy5cpkz504                            dharma pal   
z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj                           Tiza Arellano   
z12quxxp2vutflkxv04cihggzt2azl34pms0k  Prìñçeśś Âliś Łøvê Dømíñø Mâđiś™ ﻿   
z12icv3ysqvlwth2c23eddlykyqut5z1h                           Eric Gonzalez   
z133stly3kete3tly22petvwdpmghrlli                           Analena López   

                                                             DATE  \
COMMENT_ID                                                          
z13lgffb5w3ddx1ul22qy1wxspy5cpkz504    2015-05-29T02:30:18.971000   
z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj      2015-05-29T00:14:48.748000   
z12quxxp2vutflkxv04cihggzt2azl34pms0k  2015-05-28T21:00:08.607000   
z12icv3ysqvlwth2c23eddlykyqut5z1h      2015-05-28T20:47:12.193000   
z133stly3kete3tly22petvwdpmghrlli      2015-05

In [4]:
# columns removed from the feature frames: the label itself plus the
# DATE and AUTHOR columns
non_feature_cols = ['CLASS', 'DATE', 'AUTHOR']

# training split: target is the CLASS column, features are what remains
y_train = df_train['CLASS']
X_train = df_train.drop(columns=non_feature_cols)

# testing split, built exactly the same way as the training split
y_test = df_test['CLASS']
X_test = df_test.drop(columns=non_feature_cols)

In [5]:
# convert the raw comment text into TF-IDF features (English stop words
# removed, vocabulary capped at 5000 terms)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# NOTE: X_train / X_test are rebound here from DataFrames to TF-IDF matrices.
# The text is therefore read from df_train / df_test (the same CONTENT column
# the feature frames carry) so that re-running this cell does not fail by
# indexing an already-vectorized matrix with ['CONTENT'].

# learn the vocabulary and IDF weights from the training comments only
X_train = vectorizer.fit_transform(df_train['CONTENT'])

# project the testing comments onto the vocabulary learned from training
X_test = vectorizer.transform(df_test['CONTENT'])

In [6]:
# Multinomial Naive Bayes: create the classifier and fit it on the full
# training set (fit() returns the estimator itself, so the calls can be chained)
nb_multi = MultinomialNB().fit(X_train, y_train)

# Gaussian Naive Bayes: create the classifier, then fit it on the full
# training set; the TF-IDF matrix is converted to a dense array for this model
nb_gauss = GaussianNB()
nb_gauss.fit(X_train.toarray(), y_train)

In [7]:
def report_performance(title, y_true, y_pred, *trailer):
    """Print a heading, the accuracy and the confusion matrix for one model.

    Parameters
    ----------
    title : str
        Heading line printed before the metrics.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    *trailer
        Extra values printed on the same line after the confusion matrix
        (used to emit a blank separator line between two reports).
    """
    print(title)
    print(accuracy_score(y_true, y_pred))  # fraction of correct predictions
    print(confusion_matrix(y_true, y_pred), *trailer)


# predictions and performance evaluation for the Multinomial Naive Bayes;
# the model was fitted on the sparse TF-IDF matrix, so predict on the sparse
# test matrix as well instead of building a dense copy
multi_preds = nb_multi.predict(X_test)
report_performance("Results for Multinomial distribution assumption:",
                   y_test, multi_preds, '\n')

# predictions and performance evaluation for the Gaussian Naive Bayes;
# this model was fitted on a dense array, so densify the test matrix (once)
X_test_dense = X_test.toarray()
gauss_preds = nb_gauss.predict(X_test_dense)
report_performance("Results for Gaussian distribution assumption:",
                   y_test, gauss_preds)

Results for Multinomial distribution assumption:
0.8729729729729729
[[164  32]
 [ 15 159]] 

Results for Gaussian distribution assumption:
0.827027027027027
[[178  18]
 [ 46 128]]
