In [None]:
# Download the YouTube spam collection dataset available from OA 6.15. It is 
# a public set  of comments collected for spam research. It has five datasets
# composed of 1956 real messages extracted from five videos. These five videos
# are popular pop songs that were among  the 10 most viewed of the collection 
# period. All five datasets have the following attributes:  
#
#     • COMMENT_ID: unique id representing the comment  
#     • AUTHOR: author ID  
#     • DATE: date the comment is posted  
#     • CONTENT: the comment  
#     • TAG: for spam 1, otherwise 0.  
#       
# For this exercise use any four of these five datasets to build a spam filter
# and use that  filter to check the accuracy on the remaining dataset. 

In [27]:
import pandas as pd
import re
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

nb_multi = MultinomialNB() 
nb_gauss = GaussianNB()  

# Text cleaning (consider additional steps like stop word removal)
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'[^\w\s]', '', text) 
    return text

def getXy(df):
    df['CONTENT'] = df['CONTENT'].apply(clean_text)

    # Feature extraction using TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    X_transformed = vectorizer.fit_transform(df['CONTENT'])

    # Use the transformed features for training
    X = X_transformed.toarray()
    y = df.CLASS
    
    return X, y

# Driver code
def train_models(df, name):
    X, y = getXy(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Build, train, and test a multinomial NB model
    nb_multi.fit(X, y)
    multi_preds = nb_multi.predict(X_test) 

    # Build, train, and test a gaussian NB model
    nb_gauss.fit(X, y) 
    gauss_preds = nb_gauss.predict(X_test) 
    
    print(f"\n\n{name} dataset")
    
    print("\nResults for multinomial distribution assumption:") 
    print(f"Accuracy: {accuracy_score(y_test, multi_preds)}")  
    print(confusion_matrix(y_test, multi_preds)) 
    
    print("\nResults for Gaussian distribution assumption:") 
    print(f"Accuracy: {accuracy_score(y_test, gauss_preds)}") 
    print(confusion_matrix(y_test, gauss_preds)) 
    
# Train on four of the datasets
df = pd.read_csv("../Datasets/Code_and_Data_(Chapter_06)/Code and Data (Chapter 06)/YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv")
train_models(df, "Katy Perry")

df = pd.read_csv("../Datasets/Code_and_Data_(Chapter_06)/Code and Data (Chapter 06)/YouTube-Spam-Collection-v1/Youtube01-Psy.csv")
train_models(df, "Psy")

df = pd.read_csv("../Datasets/Code_and_Data_(Chapter_06)/Code and Data (Chapter 06)/YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv")
train_models(df, "LMFAO")

df = pd.read_csv("../Datasets/Code_and_Data_(Chapter_06)/Code and Data (Chapter 06)/YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")
train_models(df, "Shakira")

# Now predict on the last one
df = pd.read_csv("../Datasets/Code_and_Data_(Chapter_06)/Code and Data (Chapter 06)/YouTube-Spam-Collection-v1/Youtube04-Eminem.csv")
X, y = getXy(df)
multi_preds = nb_multi.predict(X) 
gauss_preds = nb_gauss.predict(X) 

print(f"\n\nFinal Testing {name} dataset")

print("\nResults for multinomial distribution assumption:") 
print(f"Accuracy: {accuracy_score(y, multi_preds)}")  
print(confusion_matrix(y, multi_preds)) 

print("\nResults for Gaussian distribution assumption:") 
print(f"Accuracy: {accuracy_score(y, gauss_preds)}") 
print(confusion_matrix(y, gauss_preds)) 



Katy Perry dataset

Results for multinomial distribution assumption:
Accuracy: 0.9904761904761905
[[59  0]
 [ 1 45]]

Results for Gaussian distribution assumption:
Accuracy: 1.0
[[59  0]
 [ 0 46]]


Psy dataset

Results for multinomial distribution assumption:
Accuracy: 0.9904761904761905
[[55  1]
 [ 0 49]]

Results for Gaussian distribution assumption:
Accuracy: 1.0
[[56  0]
 [ 0 49]]


LMFAO dataset

Results for multinomial distribution assumption:
Accuracy: 0.9772727272727273
[[56  2]
 [ 1 73]]

Results for Gaussian distribution assumption:
Accuracy: 1.0
[[58  0]
 [ 0 74]]


Shakira dataset

Results for multinomial distribution assumption:
Accuracy: 0.9819819819819819
[[61  0]
 [ 2 48]]

Results for Gaussian distribution assumption:
Accuracy: 1.0
[[61  0]
 [ 0 50]]


ValueError: X has 1653 features, but MultinomialNB is expecting 1391 features as input.