In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df=df[['v1', 'v2']]

In [6]:
df.columns=['label', 'message']

In [7]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


In [9]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [10]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
unique_label=df['label'].unique()

In [12]:
unique_label

array(['ham', 'spam'], dtype=object)

In [13]:
df['label_num']=df['label'].map({'ham':0, 'spam':1})

In [14]:
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [15]:
df = df.drop(columns = ['label'])

In [16]:
df.head()

Unnamed: 0,message,label_num
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [17]:
# Reset the index and drop the old one
df = df.reset_index(drop=True)

# Check the first few rows
df.head()


Unnamed: 0,message,label_num
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [18]:
df['message'] = df['message'].apply(lambda x:x.lower())

In [19]:
# removing the punctuation
import string
def remove_punctuation(txt):
  return txt.translate(str.maketrans('', '', string.punctuation))

In [20]:
df['message'] = df['message'].apply(remove_punctuation)

In [21]:
def remove_numbers(txt):
    new = ""
    for i in txt:
        if not i.isdigit():
            new = new + i
    return new

df['message'] = df['message'].apply(remove_numbers)



In [22]:
def remove_emojis(txt):
    new = ""
    for i in txt:
        if i.isascii():
            new += i
    return new

df['message'] = df['message'].apply(remove_emojis)



In [24]:
df.head()

Unnamed: 0,message,label_num
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif u oni,0
2,free entry in a wkly comp to win fa cup final...,1
3,u dun say so early hor u c already then say,0
4,nah i dont think he goes to usf he lives aroun...,0


In [30]:
# removing the stopwards
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [25]:
stop_words=set(stopwords.words('english'))

In [26]:
def remove_stopwords(txt):
  word_tokenized = word_tokenize(txt)
  filtered_words = [word for word in word_tokenized if word.casefold() not in stop_words]
  return ' '.join(filtered_words)

In [31]:
df['message'] = df['message'].apply(remove_stopwords)

In [32]:
df.loc[4]['message']

'nah dont think goes usf lives around though'

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label_num'],
                                                    test_size=0.20,
                                                    random_state=42)

In [35]:
X_train

Unnamed: 0,message
1978,im boat still moms check yo im half naked
3989,bank granite issues strongbuy explosive pick m...
3935,r giving second chance rahul dengra
4078,played smash bros ltgt religiously
4086,private account statement shows unredeemed poi...
...,...
3772,came hostel going sleep plz call class hrishi
5191,sorry ill call later
5226,prabhaim sorydarealyfrm heart im sory
5390,nt joking seriously told


In [36]:
X_test

Unnamed: 0,message
3245,funny fact nobody teaches volcanoes erupt tsun...
944,sent scores sophas secondary application schoo...
1044,know someone know fancies call find pobox lshb p
2484,promise getting soon youll text morning let kn...
812,congratulations ur awarded either cd gift vouc...
...,...
4264,ltdecimalgt common car better buy china asia f...
2439,rightio well arent bright early morning
5556,yes thats u texted pshewmissing much
4205,get door im


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   message    5572 non-null   object
 1   label_num  5572 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [42]:
bow_vectorizer = CountVectorizer()

In [43]:
X_train_vectorizer = bow_vectorizer.fit_transform(X_train)

In [44]:
X_train_vectorizer

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 36325 stored elements and shape (4457, 7397)>

In [45]:
X_test_vectorizer = bow_vectorizer.transform(X_test)

In [46]:
X_test_vectorizer

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8091 stored elements and shape (1115, 7397)>

In [47]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [48]:
nb_model = MultinomialNB()

In [49]:
nb_model.fit(X_train_vectorizer, y_train)

In [52]:
pred_nb = nb_model.predict(X_test_vectorizer)

In [53]:
pred_nb

array([0, 0, 1, ..., 0, 0, 1])

In [54]:
y_test

Unnamed: 0,label_num
3245,0
944,0
1044,1
2484,0
812,1
...,...
4264,0
2439,0
5556,0
4205,0


In [55]:
acc = accuracy_score(y_test, pred_nb)

In [56]:
acc

0.9802690582959641

In [57]:
tfidf = TfidfVectorizer()

In [59]:
X_train_tfidf =  tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [60]:
nb2_model = MultinomialNB()

In [61]:
nb2_model.fit(X_train_tfidf, y_train)

In [62]:
pred_nb2 = nb2_model.predict(X_test_tfidf)

In [65]:
acc2 = accuracy_score(y_test, pred_nb2)

In [66]:
acc2

0.968609865470852

In [67]:
import joblib

joblib.dump(nb_model, 'spam_model.pkl')
joblib.dump(bow_vectorizer, 'vectorizer.pkl')
print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!
