In [None]:
import  pandas as  pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from  sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer
import pickle

In [None]:
# Load the SMS spam dataset, parsing only the two columns that are used:
# v1 (ham/spam label) and v2 (message text). `usecols` avoids reading and then
# discarding every other column in the CSV; the trailing selection pins the
# column order to v1, v2.
# NOTE(review): '/content/spam.csv' is an absolute path — adjust it when the
# notebook is run somewhere the file lives elsewhere.
data = pd.read_csv('/content/spam.csv', encoding='latin-1', usecols=['v1', 'v2'])[['v1', 'v2']]

In [None]:
# Work on an independent copy so the frame loaded from disk stays untouched.
df = data.copy(deep=True)

In [None]:
# Preview the first ten rows (label in v1, message text in v2).
df.head(n=10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
v1,0
v2,0


In [None]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Fetch the NLTK corpora used below: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

# Shared lemmatizer instance used by clean_text.
lem = WordNetLemmatizer()
# English stop words held in a set so membership checks in clean_text are cheap.
stop = {*stopwords.words('english')}

def clean_text(text, keep_number=False):
    """Normalise a raw message into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, blank out URLs and e-mail addresses, replace
    every character that is not a lowercase ASCII letter or whitespace with a
    space (digits are also kept when ``keep_number`` is true), drop stop words
    and tokens of two characters or fewer, then lemmatize what is left.

    Relies on the notebook-level ``stop`` (stop-word set) and ``lem``
    (lemmatizer) objects being defined before it is called.

    Args:
        text: The message to clean. Anything that is not a ``str`` yields "".
        keep_number: When true, the digits 0-9 survive the character filter.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str):
        return ""

    s = text.lower()
    s = re.sub(r"http\S+|www\S+|https\S+", " ", s)   # remove urls
    s = re.sub(r"\S+@\S+", " ", s)                    # remove e-mail addresses

    # Keep all ten digits when requested. The earlier character class
    # `[^a-z0\s]` preserved only the digit 0 and stripped 1-9.
    disallowed = r"[^a-z0-9\s]" if keep_number else r"[^a-z\s]"
    s = re.sub(disallowed, " ", s)

    # The length and stop-word filters look at the token *before* it is
    # lemmatized, matching the original two-step behaviour.
    toks = [lem.lemmatize(t) for t in s.split() if len(t) > 2 and t not in stop]
    return " ".join(toks)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Features are the message text (v2); targets are the ham/spam labels (v1).
x, y = df['v2'], df['v1']

In [None]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# The pipeline's TfidfVectorizer already runs clean_text as its `preprocessor`.
# Cleaning here as well would push the train/test text through clean_text
# twice, while raw messages handed straight to pipe.predict() are only cleaned
# once. clean_text is not idempotent (it filters stop words and short tokens
# *before* lemmatizing, so a second pass can drop tokens the first pass kept),
# which would make the model train on different text than it sees at inference.
# The *_cleaned names are kept because later cells use them, but they now hold
# the raw text and the vectorizer is the single place where cleaning happens.
x_train_cleaned = x_train
x_test_cleaned = x_test

In [None]:
# TF-IDF features over unigrams and bigrams. clean_text is applied to every
# document before tokenisation; terms appearing in more than 90% of documents
# or in fewer than 3 documents are ignored, and the vocabulary is capped at
# 15000 entries.
tfdf = TfidfVectorizer(
    preprocessor=clean_text,
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=3,
    max_features=15000,
)

# Multinomial Naive Bayes classifier with default settings.
mb = MultinomialNB()

In [None]:
# Chain vectorisation and classification so raw text can be passed to fit/predict.
pipe = Pipeline(steps=[
    ('tfdf', tfdf),
    ('mb', mb),
])

In [None]:
# Fit the TF-IDF vocabulary and the Naive Bayes classifier on the training split.
pipe.fit(X=x_train_cleaned, y=y_train)

In [None]:
# Predict a ham/spam label for every held-out message.
pred = pipe.predict(X=x_test_cleaned)

In [None]:
# Show the predicted labels (long arrays are abbreviated with "..." when printed).
print(pred)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'spam']


In [None]:
# Fraction of held-out messages whose predicted label matches the true label.
a = accuracy_score(y_true=y_test, y_pred=pred)
print(a)

0.9659192825112107


In [None]:
# Per-class precision / recall / F1 on the held-out split. target_names are
# matched by position to the sorted labels, so 'ham' must come before 'spam'.
cl = classification_report(y_true=y_test, y_pred=pred, target_names=['ham', 'spam'])
print(cl)

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.99      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [None]:
# Two unseen messages for a quick sanity check of the fitted pipeline.
new_emails = [
    "Congratulations! You've won $1000. Click here to claim your prize!",
    "Hey John, are we still meeting tomorrow at 2PM?",
]

# Raw text goes straight in; the vectorizer inside the pipeline applies clean_text.
pipe.predict(X=new_emails)


array(['spam', 'ham'], dtype='<U4')

In [None]:
# Another pair of hand-written messages, passed to the pipeline as raw text.
email = [
    "hello i have urgent work contanct me ",
    "congratulations you won a prize ",
]
pipe.predict(X=email)


array(['ham', 'spam'], dtype='<U4')

In [None]:
filename = 'emailspam_model.pkl'

# Serialize the fitted pipeline (vectorizer + classifier) to disk.
# NOTE: the vectorizer holds a reference to clean_text as its preprocessor and
# pickle stores functions by name rather than by value, so whatever loads this
# file must be able to resolve clean_text (and the `stop` / `lem` objects it
# uses) before calling pickle.load.
with open(filename, mode='wb') as file:
    pickle.dump(pipe, file)

In [None]:
# Spot check: a prize-style message.
pipe.predict(["Congratulations! You have won a free iPhone. Click here to claim."])


array(['spam'], dtype='<U4')

In [None]:
# Spot check: an ordinary work request.
pipe.predict(["Hi Rahul, please send me the project report by evening."])


array(['ham'], dtype='<U4')