In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')  
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gaurav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/gaurav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gaurav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/gaurav/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/gaurav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/gaurav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd
import joblib
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [3]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join back to string
    return " ".join(tokens)

In [4]:
def vectorize_text(corpus, method='count'):
    if method == 'tfidf':
        vectorizer = TfidfVectorizer()
    if method=='count':
        vectorizer = CountVectorizer()
    return vectorizer.fit_transform(corpus),vectorizer

In [5]:
df=pd.read_csv("SMS_spam_collection.csv")
df

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
df['cleaned_message'] = df['Message'].apply(preprocess_text)    # preprocessing
df


Unnamed: 0,Label,Message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u £750 pound prize ...
5568,ham,Will ü b going to esplanade fr home?,ü b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestion
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...


In [7]:
X,vectorizer= vectorize_text(df['cleaned_message'],method='count')

# Encode labels
y = df['Label'].map({'ham': 0, 'spam': 1})

In [8]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Label, Length: 5572, dtype: int64

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1448
           1       0.88      0.92      0.90       224

    accuracy                           0.97      1672
   macro avg       0.93      0.95      0.94      1672
weighted avg       0.97      0.97      0.97      1672



In [11]:
joblib.dump(model, 'spam_classifier_model_updated.pkl')
joblib.dump(vectorizer,"vectorizer_updated.pkl")

['vectorizer_updated.pkl']

In [12]:
model = joblib.load('spam_classifier_model_updated.pkl')
vectorizer=joblib.load('vectorizer_updated.pkl')

In [13]:
def predict_message(msg):
    cleaned = preprocess_text(msg)
    vect_msg = vectorizer.transform([cleaned])
    prediction = model.predict(vect_msg)
    return 'spam' if prediction[0] == 1 else 'ham'

In [17]:
predict_message("congratulation you won the 100000 dollars in lottery please click link below to get your money")

'spam'

In [16]:
predict_message("Congratulatio you won lottery please click here to get your prize")

'spam'

In [29]:
predict_message(" congratulations you won million dollar please click the link below to get your money")

'spam'