In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the raw SMS dataset.
# The location can be overridden with the SPAM_CSV_PATH environment variable
# so the notebook is not tied to one machine's drive layout; when the
# variable is unset the original path is used unchanged.
import os

DATA_PATH = os.environ.get('SPAM_CSV_PATH', 'E://m//SPAM SMS Dataset//spam.csv')

# The file is decoded as latin-1 (presumably it is not valid UTF-8 -- confirm
# against the actual file before changing the encoding).
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Keep only the two columns used downstream and give them descriptive names:
# v1 -> label, v2 -> message.
df_cleaned = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})


In [3]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
#
# The mapping reads from the untouched df['v1'] column instead of from
# df_cleaned['label'] itself, so re-running this cell always gives the same
# result. Mapping the already-encoded column a second time would turn every
# label into NaN, because 1 and 0 are not keys of the mapping.
LABEL_CODES = {'spam': 1, 'ham': 0}
encoded_labels = df['v1'].map(LABEL_CODES)

# Fail here, with a readable message, rather than later inside model fitting
# if the file contains a label the mapping does not know about.
if encoded_labels.isna().any():
    unexpected = df.loc[encoded_labels.isna(), 'v1'].unique().tolist()
    raise ValueError(f'Unexpected label values in column v1: {unexpected}')

df_cleaned['label'] = encoded_labels


In [4]:
# Separate the model input (raw message text) from the target (encoded label).
X, y = df_cleaned['message'], df_cleaned['label']


In [5]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# TF-IDF text features: English stop words are dropped and the vocabulary is
# capped at 3000 terms.
vectorizer = TfidfVectorizer(
    max_features=3000,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)


In [7]:
# Baseline classifier: multinomial naive Bayes with default settings, fitted
# on the TF-IDF training matrix. The fit call is left as the cell's final
# expression so the fitted estimator is displayed.
model = MultinomialNB()
model.fit(X=X_train_tfidf, y=y_train)


In [8]:
# Baseline predictions for the held-out test messages.
y_pred = model.predict(X=X_test_tfidf)


In [9]:
# Score the baseline predictions against the held-out labels. The scorers are
# applied in the same order as the names on the left-hand side.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)


In [10]:
# Report the baseline metrics, one per line, to four decimal places.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1-Score: {f1:.4f}',
    sep='\n',
)


Accuracy: 0.9776
Precision: 1.0000
Recall: 0.8333
F1-Score: 0.9091


In [11]:
from sklearn.model_selection import GridSearchCV

# Tune the naive Bayes smoothing parameter with a 5-fold grid search that
# selects the candidate with the best F1 score.
params = {'alpha': [0.1, 0.5, 1.0]}
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=params,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)
print(f'Best alpha: {grid_search.best_params_}')

# Evaluate the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_tfidf)
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_best)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f'Tuned Model Accuracy: {accuracy:.4f}',
    f'Tuned Model Precision: {precision:.4f}',
    f'Tuned Model Recall: {recall:.4f}',
    f'Tuned Model F1-Score: {f1:.4f}',
    sep='\n',
)


Best alpha: {'alpha': 0.1}
Tuned Model Accuracy: 0.9830
Tuned Model Precision: 0.9645
Tuned Model Recall: 0.9067
Tuned Model F1-Score: 0.9347


In [12]:
from imblearn.over_sampling import SMOTE

# Resample the training features and labels with SMOTE (seeded for
# repeatability). Only the training data is resampled; the test matrix is
# used as-is for evaluation below.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# Fit a fresh default naive Bayes model on the resampled training data.
model_smote = MultinomialNB().fit(X_train_smote, y_train_smote)
y_pred_smote = model_smote.predict(X_test_tfidf)

accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_smote)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f'SMOTE Model Accuracy: {accuracy:.4f}',
    f'SMOTE Model Precision: {precision:.4f}',
    f'SMOTE Model Recall: {recall:.4f}',
    f'SMOTE Model F1-Score: {f1:.4f}',
    sep='\n',
)


SMOTE Model Accuracy: 0.9677
SMOTE Model Precision: 0.8519
SMOTE Model Recall: 0.9200
SMOTE Model F1-Score: 0.8846


In [13]:
from sklearn.linear_model import LogisticRegression

# Alternative classifier for comparison: logistic regression on the same
# TF-IDF features, with the iteration limit set to 1000.
log_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_log = log_model.predict(X_test_tfidf)

accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_log)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f'Logistic Regression Accuracy: {accuracy:.4f}',
    f'Logistic Regression Precision: {precision:.4f}',
    f'Logistic Regression Recall: {recall:.4f}',
    f'Logistic Regression F1-Score: {f1:.4f}',
    sep='\n',
)


Logistic Regression Accuracy: 0.9623
Logistic Regression Precision: 0.9737
Logistic Regression Recall: 0.7400
Logistic Regression F1-Score: 0.8409


In [14]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of a default naive Bayes model on the
# training matrix; the mean over the folds is reported.
scores = cross_val_score(
    estimator=MultinomialNB(),
    X=X_train_tfidf,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f'Cross-Validation Accuracy: {scores.mean():.4f}')


Cross-Validation Accuracy: 0.9789


In [15]:
from sklearn.pipeline import Pipeline

# Bundle vectorisation and classification into one estimator that takes raw
# message text directly. Both steps use the same settings as the standalone
# vectorizer and baseline model built earlier in the notebook.
pipeline = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(max_features=3000, stop_words='english')),
        ('classifier', MultinomialNB()),
    ]
).fit(X_train, y_train)

# Predict straight from the raw test messages and report accuracy.
y_pred_pipeline = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_pipeline)
print(f'Pipeline Model Accuracy: {accuracy:.4f}')


Pipeline Model Accuracy: 0.9776


In [36]:
import pickle

# Persist the fitted baseline classifier and the fitted TF-IDF vectorizer as
# two separate pickle files in the current working directory.
with open('spam_model.pkl', 'wb') as model_out:
    pickle.dump(model, model_out)

with open('vectorizer.pkl', 'wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)


In [None]:
from flask import Flask, request, jsonify
import pickle

# Create the Flask application and load the pickled classifier and
# vectorizer once, at start-up, so every request reuses the same objects.
app = Flask(__name__)

# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# two files must only ever come from a trusted training run.
with open('spam_model.pkl', 'rb') as model_in:
    model = pickle.load(model_in)

with open('vectorizer.pkl', 'rb') as vectorizer_in:
    vectorizer = pickle.load(vectorizer_in)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the message in a JSON POST body as spam or not spam.

    The request body must be a JSON object with a string ``message`` field,
    for example ``{"message": "Win a free prize now"}``.

    Returns:
        ``{"spam": <bool>}`` on success, where the boolean is the truthiness
        of the model's prediction for the message. If the body is not a JSON
        object, or ``message`` is missing or not a string, the response is
        ``{"error": <str>}`` with HTTP status 400.
    """
    # silent=True returns None instead of raising when the body is missing,
    # malformed, or not sent as JSON, so every kind of bad input is handled
    # by the same check below instead of surfacing as an unhandled error.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    message = payload.get('message')
    if not isinstance(message, str):
        return jsonify({'error': "Field 'message' must be a string."}), 400

    # The vectorizer works on a collection of documents, hence the
    # one-element list; [0] then picks out the single prediction.
    vectorized_data = vectorizer.transform([message])
    prediction = model.predict(vectorized_data)[0]
    return jsonify({'spam': bool(prediction)})

if __name__ == '__main__':
    import os

    # Debug mode turns on the interactive in-browser debugger, which lets
    # anyone who can reach the server execute Python code. It is therefore
    # opt-in: set FLASK_DEBUG to 1, true or yes to enable it for local
    # development; otherwise the development server starts with it off.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)
