In [1]:
import pandas as pd

# Location of the Youtube05-Shakira.csv dataset.
# The original absolute path remains the default so existing setups keep
# working; on any other machine set the YOUTUBE_SPAM_CSV environment variable
# to the file's location instead of editing this cell.
import os

file_path = os.environ.get(
    "YOUTUBE_SPAM_CSV",
    r"C:\git\comp237-gp1-nlp-project\dataset\Youtube05-Shakira.csv",
)

# Load the raw comment data into a DataFrame
df = pd.read_csv(file_path)


In [2]:
# Preview the first five rows (head() defaults to n=5) to see what the data looks like
df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,z13lgffb5w3ddx1ul22qy1wxspy5cpkz504,dharma pal,2015-05-29T02:30:18.971000,Nice song﻿,0
1,z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj,Tiza Arellano,2015-05-29T00:14:48.748000,I love song ﻿,0
2,z12quxxp2vutflkxv04cihggzt2azl34pms0k,Prìñçeśś Âliś Łøvê Dømíñø Mâđiś™ ﻿,2015-05-28T21:00:08.607000,I love song ﻿,0
3,z12icv3ysqvlwth2c23eddlykyqut5z1h,Eric Gonzalez,2015-05-28T20:47:12.193000,"860,000,000 lets make it first female to reach...",0
4,z133stly3kete3tly22petvwdpmghrlli,Analena López,2015-05-28T17:08:29.827000,shakira is best for worldcup﻿,0


In [3]:
# Inspect each column's dtype and non-null count to confirm there are no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   COMMENT_ID  370 non-null    object
 1   AUTHOR      370 non-null    object
 2   DATE        370 non-null    object
 3   CONTENT     370 non-null    object
 4   CLASS       370 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 14.6+ KB


In [4]:
# Only the comment text ('CONTENT') and its label ('CLASS') are needed for the
# analysis, so the identifier, author and timestamp columns are removed.
# drop() returns a new frame; `df` itself is left untouched.
df_filtered = df.drop(columns=['COMMENT_ID', 'AUTHOR', 'DATE'])

In [5]:
# List the distinct label values that occur in the CLASS column
pd.unique(df_filtered['CLASS'])

array([0, 1], dtype=int64)

In [6]:
# Preview the first five comments labelled CLASS == 1
df_filtered.loc[df_filtered['CLASS'].eq(1)].head()

Unnamed: 0,CONTENT,CLASS
7,SEE SOME MORE SONG OPEN GOOGLE AND TYPE Shakir...,1
22,Check out this playlist on YouTube:﻿,1
31,Support the fight for your 4th amendment right...,1
36,Check out this video on YouTube:﻿,1
51,"coby this USL and past :<br /><a href=""http://...",1


In [7]:
# Preview the first five comments labelled CLASS == 0
df_filtered.loc[df_filtered['CLASS'].eq(0)].head()

Unnamed: 0,CONTENT,CLASS
0,Nice song﻿,0
1,I love song ﻿,0
2,I love song ﻿,0
3,"860,000,000 lets make it first female to reach...",0
4,shakira is best for worldcup﻿,0


From the data above, we can determine that CLASS 1 denotes spam and CLASS 0 denotes non-spam.

In [8]:
# Shuffle the rows so that the positional train/test split is not biased by
# the original file order. A fixed random_state makes the shuffle, and every
# result derived from it (vocabulary size, accuracy scores), reproducible
# after a kernel restart.
df_shuffled = df_filtered.sample(frac=1, random_state=42)

# Preview the first rows of the shuffled dataset
df_shuffled.head(4)


Unnamed: 0,CONTENT,CLASS
240,Shakira I love you,0
357,********OMG Facebook is OLD! Check out ------...,1
319,Hey guys and girls check out Comedy Recipe for...,1
38,Love this song! My soccer team made a cd for o...,0


In [9]:
# Split the shuffled dataset 75/25: the first 75% of rows are used for
# training and the remaining rows for testing.
X = df_shuffled.drop(columns='CLASS')
y = df_shuffled['CLASS']

train_size = int(0.75 * len(X))
test_size = len(X) - train_size

# Positional slices: rows [0, train_size) train, rows [train_size, end) test
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Report the size of each piece as a sanity check
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

X_train shape: (277, 1)
X_test shape: (93, 1)
y_train shape: (277,)
y_test shape: (93,)


In [10]:
# Build a count vectorizer and extract term counts: each training comment
# becomes a row of raw word counts. fit_transform() learns the vocabulary
# from the training comments and vectorizes them in one step.
from sklearn.feature_extraction.text import CountVectorizer

train_comments = X_train['CONTENT']
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(train_comments)

# (number of training comments, number of vocabulary terms)
train_tc.shape


(277, 1130)

In [11]:
# Downscale the raw term counts with tf-idf
# ("Term Frequency times Inverse Document Frequency").
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the training term counts and re-weight them
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_tc)

# Report the shape and container type of the downscaled data
for caption, detail in (
    ("Shape of the downscaled data:", train_tfidf.shape),
    ("Type of the data:", type(train_tfidf)),
):
    print(caption, detail)



Shape of the downscaled data: (277, 1130)
Type of the data: <class 'scipy.sparse._csr.csr_matrix'>


In [12]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the tf-idf features of the
# training comments. fit() returns the fitted estimator itself.
nb_model = MultinomialNB()
classifier = nb_model.fit(train_tfidf, y_train)


In [13]:
# Import NLTK tools (stop words, tokenizer, lemmatizer) for preparing the text data for model building.
# NOTE(review): none of the names imported here are referenced by the other cells
# shown in this notebook - confirm whether an NLTK preprocessing step is still
# planned; otherwise these imports are dead code.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [14]:
from sklearn.metrics import accuracy_score

# Accuracy on the training data: the model predicts the same rows it was
# fitted on, so this is not an estimate of performance on unseen comments.
ypred = classifier.predict(train_tfidf)
accuracy_score(y_true=y_train, y_pred=ypred)

0.9891696750902527

# Define test data here

In [23]:
# Use the held-out test comments as the text to classify
input_data = X_test['CONTENT']

In [20]:
# Custom input data: eight hand-written comments to try the classifier on.
# NOTE(review): this reassigns `input_data`, replacing the X_test comments
# assigned in the cell above. When the notebook is run top to bottom, the
# predictions made from `input_data` further down are for these eight comments,
# so any step that compares predictions against y_test needs predictions for
# X_test instead.

# A mix of ordinary comments and promotional, spam-like ones
input_data = [
    'This is a great video',
    'I love this video',
    'This video is the best',
    'This video is the worst',
    'Check out my video',
    'Yo must see Snoop Dogg new video!',
    'It is similar to the video I saw yesterday',
    'It is similar to the video with snoop dog'
]


In [24]:
# Convert the input comments to term counts with the vectorizer fitted on the
# training comments (transform only - the vocabulary is not re-learned)
input_tc = count_vectorizer.transform(input_data)

In [25]:
# Re-weight the input term counts with the fitted tf-idf transformer, then
# classify each comment
input_tfidf = tfidf_transformer.transform(input_tc)
predictions = classifier.predict(input_tfidf)

# Human-readable names for the numeric CLASS labels
category_map = {0: 'Not Spam', 1: 'Spam'}

# Print each input comment next to its predicted category
for comment, label in zip(input_data, predictions):
    print('\nInput:', comment, '\nPredicted category:', category_map[label])


Input: and how many subscribers compared to her over a million 
Predicted category: Spam

Input: CHECK OUT THE DUBSTEP VERSION 
Predicted category: Spam

Input: I felt old when I realized that this song was 5 years old...﻿ 
Predicted category: Not Spam

Input: Can this channel get 500+ subscribers? You can make that happen :D﻿ 
Predicted category: Spam

Input: Hello everyone :) I know most of you probably pass up these kind of comments, but for those who are still reading this, thanks! I don’t have any money for advertisements, no chance of getting heard, nothing... If this comes off as spam, sorry. I am a video animator, just trying to make it up into the video animation industry. Please give me the chance to prove myself to you. Please visit my channel, subscribe if you like and thumb this comment up, so everyone can see! Thank You!  
Predicted category: Spam

Input: Hi, nice song Shakira! (Sorry for bad Brazilian)﻿ 
Predicted category: Not Spam

Input: Hey guys whats up? I found th

In [26]:
# Check the accuracy on the held-out test set (25% of the data).
# The test-set predictions are computed here directly from X_test rather than
# reusing `predictions`: that variable reflects whatever `input_data` currently
# holds, which need not be the test set (e.g. the custom comments), and would
# then not line up with y_test.
test_tfidf = tfidf_transformer.transform(count_vectorizer.transform(X_test['CONTENT']))
test_predictions = classifier.predict(test_tfidf)
accuracy_score(y_test, test_predictions)

0.956989247311828