# Email Spam Classifier

### Import and Clean Data

In [2]:
import pandas as pd

df = pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=['label', 'text'])
print(df)

     label                                               text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


### Data Clean & Split

In [3]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)      # remove numbers
    return text


df['cleaned'] = df['text'].apply(clean_text)

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(df['cleaned'])  # this becomes your features
y = df['label'].map({'ham': 0, 'spam': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Train the classifier

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

model = MultinomialNB()
model.fit(X_train, y_train)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


### Evaluate Model

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9524663677130045
Confusion Matrix:
 [[966   0]
 [ 53  96]]
Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       966
           1       1.00      0.64      0.78       149

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.88      1115
weighted avg       0.95      0.95      0.95      1115



### Testing Custom Emails

In [6]:
def predict_email(email_text):
    cleaned = clean_text(email_text)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    return 'Spam' if prediction == 1 else 'Not Spam'

# Try it
print(predict_email("You are a winner U have been specially selected 2 receive £1000 be lying. play jokes..."))


Spam


In [7]:
def debug_email(email_text):
    cleaned = clean_text(email_text)
    vector = vectorizer.transform([cleaned])
    df_vector = pd.DataFrame(vector.toarray(), columns=vectorizer.get_feature_names_out())
    top_words = df_vector.T[df_vector.T[0] > 0].sort_values(0, ascending=False)
    print("TF-IDF values of this message:\n")
    print(top_words.head(15))  # top 15 important words

debug_email("You are a winner U have been specially selected 2 receive £1000 or a 4* holiday (flights inc) speak to a live operator 2 claim 0871277810910p/min (18+) ")


TF-IDF values of this message:

                           0
receive holiday     0.274974
claim pmin          0.262492
flights speak       0.246767
holiday flights     0.246767
winner specially    0.241155
flights             0.236410
operator claim      0.228673
specially selected  0.228673
specially           0.225430
speak live          0.222496
live operator       0.222496
operator            0.210962
winner              0.209095
selected receive    0.204093
pmin                0.201159


## Exposing to API

In [8]:
import joblib

# Save model and vectorizer
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']