# YouTube Comments Spam Classifier

### Import modules

In [18]:
import numpy as np
import pandas as pd
import zipfile
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix, classification_report

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import dataset files from Google Drive

In [20]:
z = zipfile.ZipFile("/content/drive/MyDrive/YouTube-Spam-Collection-v1.zip")
Psy = pd.read_csv(z.open("Youtube01-Psy.csv"))
Katy = pd.read_csv(z.open("Youtube02-KatyPerry.csv"))
LMFAO = pd.read_csv(z.open("Youtube03-LMFAO.csv"))
Eminem = pd.read_csv(z.open("Youtube04-Eminem.csv"))
Shakira = pd.read_csv(z.open("Youtube05-Shakira.csv"))

In [21]:
data = pd.concat([Psy, Katy, LMFAO, Eminem, Shakira])
data.drop(["COMMENT_ID", "DATE", "AUTHOR"], axis=1, inplace=True)

data.shape
data.tail(5)

Unnamed: 0,CONTENT,CLASS
365,I love this song because we sing it at Camp al...,0
366,I love this song for two reasons: 1.it is abou...,0
367,wow,0
368,Shakira u are so wiredo,0
369,Shakira is the best dancer,0


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956 entries, 0 to 369
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   CONTENT  1956 non-null   object
 1   CLASS    1956 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 45.8+ KB


### Splitting dataset into train/test sets

In [23]:
X_train, X_test, y_train, y_test = train_test_split(data["CONTENT"], data["CLASS"]) # uses 75% train and 25% test split by default

### Tokenizing comments in training set and applying TF-IDF vectorizer on training set

#### [NOT CURRENTLY USED] Tokenizing comments in training set (splitting text by word boundaries)

In [24]:
vectorizer = CountVectorizer(lowercase=True)
X_train_counts = vectorizer.fit_transform(X_train) # produces a matrix of token counts
# print(X_train_counts)
# vectorizer.vocabulary_

In [25]:
print(vectorizer.get_stop_words())

None


#### [NOT CURRENTLY USED] Apply TF-IDF transformation on training set  
- more details [here](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#from-occurrences-to-frequencies)

In [26]:
tf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1467, 3694)

#### Using TfidfVectorizer (equivalent to CountVectorizer followed by TfidfTransformer (e.g. the two steps above))

In [27]:
tfidf_vect = TfidfVectorizer(use_idf=True, lowercase=True)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_train_tfidf.shape

(1467, 3694)

### Training the multinomial Naive Bayes model

In [28]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

MultinomialNB()

### Generate predictions on test set

In [29]:
# perform same feature extraction on test set
# X_test_counts = vectorizer.transform(X_test)
# X_test_tfidf = tf_transformer.transform(X_test_counts)
X_test_tfidf = tfidf_vect.transform(X_test)

# make predictions on the test set
predictions = model.predict(X_test_tfidf)

### Generate model performance metrics

In [30]:
confusion_matrix(y_test, predictions)

array([[205,  33],
       [  9, 242]])

In [31]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91       238
           1       0.88      0.96      0.92       251

    accuracy                           0.91       489
   macro avg       0.92      0.91      0.91       489
weighted avg       0.92      0.91      0.91       489



In [32]:
model.score(X_test_tfidf, y_test)

0.9141104294478528

### Performing cross-validation

In [33]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X_train_tfidf, y_train, cv=10)
print(cv_scores, "\n\nmean: ", cv_scores.mean())

# cv example:
# https://medium.com/@akshmahesh/detecting-spam-comments-on-youtube-using-machine-learning-948d54f47b3
# https://github.com/AkshayLaddha943/Machine-Learning/blob/master/Youtube-Spam-Check/youtube-spam.py

[0.91836735 0.94557823 0.91836735 0.91156463 0.91156463 0.93877551
 0.93197279 0.94520548 0.93150685 0.91780822] 

mean:  0.9270711024135683


### Exporting the model and TF-IDF vectorizer

In [34]:
# exporting model to pickle file
with open("model.pkl", "wb") as model_file:
  pickle.dump(model, model_file)

# exporting the TF-IDF vectorizer as well
# https://stackoverflow.com/questions/29788047/keep-tfidf-result-for-predicting-new-content-using-scikit-for-python
with open("tfidf-vect.pkl", "wb") as tfidf_vect_file:
  pickle.dump(tfidf_vect, tfidf_vect_file)

### Testing the model with custom comments

In [35]:
# loading the model
with open("model.pkl", "rb") as model_file:
  loaded_model = pickle.load(model_file)
loaded_model

# loading the tfidf vectorizer
with open("tfidf-vect.pkl", "rb") as tfidf_vect_file:
  loaded_vectorizer = pickle.load(tfidf_vect_file)

In [36]:
# testing the model with our own test comments
comments = np.array([['check out facebook.com'], ['this was a really helpful video!'], ["i am not spam"]])
test_df = pd.DataFrame(data=comments, columns=['CONTENT'])
test_df

test_comm_tfidf = loaded_vectorizer.transform(test_df['CONTENT'])
# print(loaded_vectorizer.get_feature_names())

loaded_model.predict(test_comm_tfidf)

array([1, 0, 1])