In [97]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [98]:
df = pd.read_csv(r"C:\Users\Jasin\Downloads\spam_detection_dataset.csv")
df.head()

Unnamed: 0,num_links,num_words,has_offer,sender_score,all_caps,is_spam
0,3,98,1,0.718607,0,0
1,0,170,0,0.698901,1,0
2,0,38,0,0.620466,0,0
3,0,116,0,0.701755,0,0
4,3,89,1,0.583621,1,1


In [101]:
df.shape

(20000, 6)

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   num_links     20000 non-null  int64  
 1   num_words     20000 non-null  int64  
 2   has_offer     20000 non-null  int64  
 3   sender_score  20000 non-null  float64
 4   all_caps      20000 non-null  int64  
 5   is_spam       20000 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 937.6 KB


In [103]:
df.describe()

Unnamed: 0,num_links,num_words,has_offer,sender_score,all_caps,is_spam
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,1.4973,109.50615,0.30275,0.694248,0.0978,0.09175
std,1.220478,51.969579,0.459459,0.188312,0.297051,0.28868
min,0.0,20.0,0.0,0.0,0.0,0.0
25%,1.0,64.0,0.0,0.567073,0.0,0.0
50%,1.0,110.0,0.0,0.69974,0.0,0.0
75%,2.0,154.0,1.0,0.834083,0.0,0.0
max,9.0,199.0,1.0,1.0,1.0,1.0


In [104]:
#missing values
df.isnull().sum()

num_links       0
num_words       0
has_offer       0
sender_score    0
all_caps        0
is_spam         0
dtype: int64

In [105]:
df.dropna(inplace=True)

In [106]:
X = df.drop('is_spam', axis=1)
y = df['is_spam']

In [107]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [109]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1:", f1_score(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.943
Precision: 0.7925311203319502
Recall: 0.5176151761517616
F1: 0.6262295081967213


In [110]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1:", f1_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.953
Precision: 0.7578347578347578
Recall: 0.7208672086720868
F1: 0.7388888888888889


In [112]:
import joblib
joblib.dump(scaler, "scaler.pkl")
joblib.dump(rf_model, "spam_model.pkl")
print("Model & Scaler saved!")

Model & Scaler saved!


In [114]:
loaded_scaler = joblib.load("scaler.pkl")
loaded_model = joblib.load("spam_model.pkl")
print("Model & Scaler loaded!")

Model & Scaler loaded!
