In [4]:
import pandas as pd
import numpy as np

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [10]:
data = pd.read_csv("dataset/Youtube01-Psy.csv")
print(data.sample(5))

                                COMMENT_ID        AUTHOR                 DATE  \
144    z13osluqrpefv1hd323idhejzxanc3ai004   Tyrek Sings  2014-11-05T22:50:02   
184  z131xnjjtqeyh5dy304cfhm50vagttfyemg0k   Chack Jason  2014-11-07T04:39:08   
155  z12yinh5ks2oinqzn04cctkgvvrohbrazvo0k  Rancy Gaming  2014-11-06T09:41:07   
79     z12ywjvgdtrhxdlz504cd1tquqvuhbs4abw         Angel  2014-11-02T17:27:09   
4      z13fwbwp1oujthgqj04chlngpvzmtt3r3dw        GsMega  2013-11-10T16:05:38   

                                               CONTENT  CLASS  
144  CHECK MY CHANNEL OUT PLEASE. I DO SINGING COVERS﻿      1  
184                                        OPPA &lt;3﻿      0  
155  What free gift cards? Go here  http://www.swag...      1  
79   Hi there~I'm group leader of Angel, a rookie K...      1  
4              watch?v=vtaRGgvGtWQ   Check this out .﻿      1  


In [12]:
# we need only content and class cells for classifying

In [14]:
data = data[["CONTENT", "CLASS"]]
print(data.sample(5))

                                               CONTENT  CLASS
75   if your like drones, plz subscribe to Kamal Ta...      1
307  Go to my channel if u want to see a fly gettin...      1
212                                Still the best. :D﻿      0
258                     C'mon 3 billion views!!!!!!!!﻿      0
110  EHI GUYS CAN YOU SUBSCRIBE IN MY CHANNEL? I AM...      1


In [16]:
# lets make the classes 0 as not spam and 1 as spam

In [18]:
data["CLASS"] = data["CLASS"].map({0: "Not Spam",
                                   1: "Spam Comment"})
print(data.sample(5))

                                               CONTENT         CLASS
163                         I found out this song now﻿      Not Spam
183  The funny thing is, 1,700,000,000 of the views...      Not Spam
115                                      #2012bitches﻿      Not Spam
178  Please give us a chance and check out the new ...  Spam Comment
90   https://www.indiegogo.com/projects/cleaning-th...  Spam Comment


In [20]:
# Training the model with BernoulliNB

In [22]:
x = np.array(data["CONTENT"])
y = np.array(data["CLASS"])

# CountVectozizer is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.
cv = CountVectorizer()
x = cv.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

model = BernoulliNB()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest))

0.9857142857142858


In [24]:
# now lets check out the model testing

In [26]:
sample = "Check this out: https://thecleverprogrammer.com/" 
data = cv.transform([sample]).toarray()
print(model.predict(data))

['Spam Comment']


In [28]:
sample = "Lack of information!" 
data = cv.transform([sample]).toarray()
print(model.predict(data))

['Not Spam']


In [30]:
import joblib

# Save the trained Naive Bayes model
joblib.dump(model, 'spam_classifier_model.pkl')

# Save the CountVectorizer object
joblib.dump(cv, 'count_vectorizer.pkl')


['count_vectorizer.pkl']