In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the message dataset from the working directory. latin1 maps every byte
# value to a character, so the read cannot fail on a decoding error.
df=pd.read_csv('messages.csv',encoding='latin1')

In [3]:
# Preview the first rows to check the column layout (subject, message, label).
df.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [4]:
# Summarize dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2893 entries, 0 to 2892
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  2831 non-null   object
 1   message  2893 non-null   object
 2   label    2893 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 67.9+ KB


In [5]:
# Count missing values per column.
df.isnull().sum()

subject    62
message     0
label       0
dtype: int64

In [6]:
# Drop every row that has a missing value in any column. The name `df` is
# rebound, so the unfiltered frame is no longer reachable after this cell, and
# the surviving rows keep their original index labels (no reset).
df=df.dropna()

In [7]:
# Confirm that no missing values remain after the drop.
df.isnull().sum()

subject    0
message    0
label      0
dtype: int64

In [8]:
# Merge subject and message into one 'text' feature, separated by a single space.
df['text']=df['subject']+" "+df['message']

In [9]:
# Keep only the combined text and the target; subject/message are now redundant.
df=df[['text','label']]

In [10]:
# Preview the reduced frame (text + label).
df.head()

Unnamed: 0,text,label
0,job posting - apple-iss research center conten...,0
2,query : letter frequencies for text identifica...,0
3,risk a colleague and i are researching the dif...,0
4,request book information earlier this morning ...,0
5,call for abstracts : optimality in syntactic t...,0


In [11]:
# Class distribution of the target column.
# NOTE(review): presumably 1 = spam and 0 = legitimate — confirm against the dataset description.
df['label'].value_counts()

0    2363
1     468
Name: label, dtype: int64

In [12]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [13]:
# Build both NLTK resources once at notebook level so clean_text can reuse them
# for every row instead of recreating them on each call.
stop_words = set(stopwords.words('english'))  # a set, for fast membership tests
lemmatizer = WordNetLemmatizer()

In [14]:
def clean_text(text):
    """Normalize one raw message into a space-separated string of lemmas.

    Steps, in order: lowercase; remove ``http(s)://`` and ``www.`` URLs; delete
    ASCII punctuation (characters are deleted, not replaced by spaces, so
    ``apple-iss`` becomes ``appleiss``); delete runs of digits; split on
    whitespace; discard tokens present in the notebook-level ``stop_words``;
    lemmatize the remaining tokens with the notebook-level ``lemmatizer``.

    Args:
        text: The message as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces (``""`` if none survive).
    """
    lowered = text.lower()
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = re.sub(r'\d+', '', without_urls.translate(punctuation_table))
    # str.split() with no argument already collapses whitespace runs, so a
    # single tokenization pass covers whitespace cleanup, stop-word filtering
    # and lemmatization.
    kept = (token for token in stripped.split() if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)


# Clean every message with clean_text. This overwrites the 'text' column, so
# re-running the cell feeds already-cleaned text through clean_text again.
df['text']=df['text'].apply(clean_text)
print(df.head())

                                                text  label
0  job posting appleiss research center content l...      0
2  query letter frequency text identification pos...      0
3  risk colleague researching differing degree ri...      0
4  request book information earlier morning phone...      0
5  call abstract optimality syntactic theory cont...      0


In [15]:
# Separate the input feature (cleaned text) from the target (label).
X=df['text']
y=df['label']

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (2363 vs 468 rows) — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features with the vocabulary capped at 5000 terms.
vectorizer=TfidfVectorizer(max_features=5000)

In [18]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vocabulary/IDF on the test split so no test information leaks into the features.
X_train_tfidf=vectorizer.fit_transform(X_train)
X_test_tfidf=vectorizer.transform(X_test)

In [19]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier, constructed with no arguments (library defaults).
model2=MultinomialNB()

In [20]:
# Train the classifier on the TF-IDF features of the training split.
model2.fit(X_train_tfidf,y_train)

MultinomialNB()

In [21]:
# Predict labels for the held-out test split.
y_pred=model2.predict(X_test_tfidf)

In [22]:
from sklearn.metrics import accuracy_score,classification_report
# Overall fraction of test rows classified correctly. With imbalanced classes
# this figure alone can look high, so the per-class report is also worth reading.
print("Accuracy:",accuracy_score(y_test,y_pred))

Accuracy: 0.9872881355932204


In [23]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       584
           1       1.00      0.93      0.96       124

    accuracy                           0.99       708
   macro avg       0.99      0.96      0.98       708
weighted avg       0.99      0.99      0.99       708



In [24]:
import pickle
# Persist the fitted classifier and the TF-IDF vectorizer it was trained with;
# both files are needed together to score new text later.
with open('spam_classification.pkl', 'wb') as model_out:
    pickle.dump(model2, model_out)

with open('vectorizer1.pkl', 'wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

In [25]:
# Reload the persisted artifacts to verify that the saved pair round-trips.
# pickle.load can execute arbitrary code embedded in the file, so only load
# pickles that this notebook (or another trusted source) produced.
with open('spam_classification.pkl', 'rb') as model_in:
    model2 = pickle.load(model_in)

with open('vectorizer1.pkl', 'rb') as vectorizer_in:
    vectorizer = pickle.load(vectorizer_in)

In [27]:
# Sample messages used as a smoke test of the reloaded model/vectorizer pair.
new_emails = [
    "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",
    "free",
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "URGENT! Your Mobile No. was awarded å£2000 Bonus Caller Prize on 5/9/03 This is our final try to contact U! Call from Landline 09064019788 BOX42WR29C, 150PPM",
    "Congratulations! You've won a lottery.",
    "Please find the meeting agenda attached.",
    "Hi Team,Just a reminder that we have our weekly team meeting tomorrow at 10:00 AM in Conference Room A. Please come prepared with updates on your current tasks.Thanks,[Your Manager]",
]
# The vectorizer and model were fitted on text that had already gone through
# clean_text (punctuation and digits deleted, stop words removed, tokens
# lemmatized). New inputs must receive the same preprocessing, otherwise their
# tokens do not line up with the trained vocabulary and predictions degrade.
# Predictions recorded before this fix were computed on raw text; re-run the cell.
new_emails_clean = [clean_text(email) for email in new_emails]
new_emails_vectorizer = vectorizer.transform(new_emails_clean)
predictions = model2.predict(new_emails_vectorizer)
print(predictions)

[0 1 0 0 0 0 0]
