In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
msg = pd.read_csv("spam.csv", encoding="latin-1")
msg

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
msg_1 = msg.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

In [4]:
msg_1

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
msg_1.isnull().sum()

v1    0
v2    0
dtype: int64

In [6]:
msg_1.columns = ["labels","text"]

In [7]:
msg_1

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
def text_preprocess(text):
    t = text.lower()  #Lowercase the text
    t = re.sub(r'http://S+',"",t) #Removes the urls
    t = re.sub(r'\d+', "", t) #Removes the digits or numbers
    t = re.sub(r'[^\w\s]', "", t) #Remove everything other than text(including  blank spaces, special characters, etc...)
    return t

In [9]:
for i in range(len(msg_1)):
    text = msg_1.iloc[i,1]
    processed_text = text_preprocess(text)
    msg_1.iloc[i,1] = processed_text
    

In [10]:
msg_1

Unnamed: 0,labels,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the nd time we have tried contact u u...
5568,ham,will ì_ b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggest...
5570,ham,the guy did some bitching but i acted like id ...


In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(msg_1["text"], msg_1["labels"], test_size=0.2, random_state=42)

In [12]:
y_train

1978     ham
3989    spam
3935     ham
4078     ham
4086    spam
        ... 
3772     ham
5191     ham
5226     ham
5390     ham
860      ham
Name: labels, Length: 4457, dtype: object

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english", max_features=3218)

x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.fit_transform(x_test)

In [47]:
from sklearn.svm import LinearSVC

model = LinearSVC()

model.fit(x_train_tfidf, y_train)



In [48]:
from sklearn.metrics import classification_report

pred = model.predict(x_test_tfidf)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         ham       0.87      0.98      0.92       965
        spam       0.25      0.04      0.07       150

    accuracy                           0.85      1115
   macro avg       0.56      0.51      0.50      1115
weighted avg       0.78      0.85      0.81      1115



In [49]:
def predict_message(text):
    # transform text
    text_tfidf = tfidf.transform([text])
    
    # predict
    prediction = model.predict(text_tfidf)[0]
    return prediction

In [50]:
user_input = "Congratulations! You won a prize"
print(predict_message(user_input))

ham


In [27]:
import joblib
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

# pipeline
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LinearSVC(max_iter=5000))
])

# param grid (small but effective)
param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__min_df': [1, 2, 5],
    'tfidf__max_df': [0.9, 0.95, 1.0],
    'tfidf__sublinear_tf': [False, True],
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__class_weight': [None, 'balanced']
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(f1_score, pos_label='spam')  # choose the label you want to optimize

grid = GridSearchCV(pipe, param_grid, cv=cv, scoring=scorer, n_jobs=-1, verbose=2)
grid.fit(x_train, y_train)

print("Best score:", grid.best_score_)
print("Best params:", grid.best_params_)

# save
joblib.dump(grid.best_estimator_, "best_spam_pipeline.pkl")


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best score: 0.9301201514729485
Best params: {'clf__C': 10, 'clf__class_weight': None, 'tfidf__max_df': 0.9, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1), 'tfidf__sublinear_tf': False}




['best_spam_pipeline.pkl']

In [28]:
model = joblib.load("best_spam_pipeline.pkl")

In [73]:
def predict_message(text):
        # clean BEFORE tfidf
    text_vec = tfidf.transform([text])
    return model.predict(text_vec)[0]


In [74]:
user_input = "Congratulations! You won a prize"
text_clean = text_preprocess(user_input) 
print(predict_message(text_clean))

ham


In [67]:
y_train.value_counts()

labels
ham     3860
spam     597
Name: count, dtype: int64

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=2)),
    ('clf', LinearSVC(class_weight='balanced', C=1.0, max_iter=5000))
])

pipe.fit(x_train, y_train)
pred = pipe.predict(x_test)
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       965
        spam       0.94      0.87      0.90       150

    accuracy                           0.97      1115
   macro avg       0.96      0.93      0.94      1115
weighted avg       0.97      0.97      0.97      1115





In [84]:
def predict_message1(text):
    return pipe.predict([text])[0]

In [85]:
user_input = "Congratulations! You won a prize"
text_clean = text_preprocess(user_input) 
print(predict_message1(text_clean))

spam


In [86]:
joblib.dump(pipe, "sms_spam_pipeline.pkl")

['sms_spam_pipeline.pkl']