In [1]:
from google.colab import files
uploaded = files.upload()

Saving SMSSpamCollection to SMSSpamCollection


In [2]:
import pandas as pd
df = pd.read_csv("SMSSpamCollection",sep = "\t",header = None)
df.columns = ["label","message"]
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df["message_length"] = df["message"].apply(len)
df.head()

Unnamed: 0,label,message,message_length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [4]:
print(df.groupby("label")["message_length"].mean())

label
ham      71.482487
spam    138.670683
Name: message_length, dtype: float64


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from sklearn.model_selection import train_test_split
x = df["message"]
y = df["label"]
x_train,x_test,y_trian,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42,stratify = y)

In [7]:
import re
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")
stop_words = set(stopwords.words('english'))

def clean_text(text):
  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)
  text = re.sub(r'\d+','',text)
  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text
x_train_clean = x_train.apply(clean_text)
x_test_clean = x_test.apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 3000)
x_train_tfidf = tfidf.fit_transform(x_train_clean)
x_test_tfidf = tfidf.fit_transform(x_test_clean)

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_trian)
y_test_enc = le.fit_transform(y_test)

In [10]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 42)
x_train_res,y_train_res = smote.fit_resample(x_train_tfidf,y_train_enc)

In [11]:
print("Original training shape:", x_train_tfidf.shape)
print("Resampled training shape:", x_train_res.shape)

Original training shape: (4457, 3000)
Resampled training shape: (7718, 3000)


In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
model = MultinomialNB()
model.fit(x_train_tfidf,y_train_enc)
y_pred = model.predict(x_test_tfidf)
acc = accuracy_score(y_test_enc,y_pred)
print(acc)

0.8367713004484305


In [13]:
print("\nClassification Report:")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_enc, y_pred))



Classification Report:
              precision    recall  f1-score   support

         ham       0.89      0.93      0.91       966
        spam       0.33      0.21      0.26       149

    accuracy                           0.84      1115
   macro avg       0.61      0.57      0.58      1115
weighted avg       0.81      0.84      0.82      1115


Confusion Matrix:
[[901  65]
 [117  32]]


In [14]:
model_resampled = MultinomialNB()
model_resampled.fit(x_train_res,y_train_res)
y_pred_res = model_resampled.predict(x_test_tfidf)
acc_res = accuracy_score(y_test_enc,y_pred_res)
acc_res

0.7408071748878924


Classification Report after SMOTE:
              precision    recall  f1-score   support

         ham       0.87      0.82      0.85       966
        spam       0.16      0.23      0.19       149

    accuracy                           0.74      1115
   macro avg       0.52      0.52      0.52      1115
weighted avg       0.78      0.74      0.76      1115


Confusion Matrix after SMOTE:
[[792 174]
 [115  34]]


In [16]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(class_weight = 'balanced',max_iter = 1000)
lr.fit(x_train_tfidf,y_train_enc)

In [17]:
y_pred_lr = lr.predict(x_test_tfidf)

In [18]:
accuracy_score(y_test_enc,y_pred_lr)

0.8591928251121076

In [19]:
print(classification_report(y_test_enc, y_pred_lr, target_names=le.classes_))
print(confusion_matrix(y_test_enc, y_pred_lr))

              precision    recall  f1-score   support

         ham       0.87      0.99      0.92       966
        spam       0.28      0.03      0.06       149

    accuracy                           0.86      1115
   macro avg       0.57      0.51      0.49      1115
weighted avg       0.79      0.86      0.81      1115

[[953  13]
 [144   5]]


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import RandomOverSampler

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 1️⃣ CountVectorizer
cv = CountVectorizer()
X_train_count = cv.fit_transform(x_train_clean)
X_test_count = cv.transform(x_test_clean)

# 2️⃣ Oversampling
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_count, y_train_enc)

# 3️⃣ Logistic Regression
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000)
lr_model.fit(X_train_res, y_train_res)

# 4️⃣ Predictions
y_pred = lr_model.predict(X_test_count)

# 5️⃣ Evaluation
print("Classification Report:")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test_enc, y_pred))


Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.88      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
[[963   3]
 [ 18 131]]


In [None]:
import joblib

# Save model
joblib.dump(lr_model, "sms_spam_classifier.pkl")

# Save vectorizer
joblib.dump(cv, "count_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
from google.colab import files

# Download model
files.download("sms_spam_classifier.pkl")

# Download vectorizer
files.download("count_vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>