In [3]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split


In [4]:
df = pd.read_csv("spam.csv", encoding='latin1')[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])  # ham = 0, spam = 1
df.head()


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r'\@w+|\#','', text)  # remove @ and #
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuations
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.strip()
    return text


In [7]:
df['cleaned'] = df['message'].apply(clean_text)
df[['message', 'cleaned']].head()


Unnamed: 0,message,cleaned
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [8]:
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['cleaned']).toarray()
y = df['label'].values


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
strong_spam_words = ['free', 'win', 'winner', 'congratulations', 'claim', 'urgent', 'prize',
                     'selected', 'click', 'offer', 'buy', 'money', 'cash', 'won', 'award',
                     'guarantee', 'gift', 'limited', 'act now', 'http', 'www']


In [13]:
def has_strong_spam_words(text):
    text = text.lower()
    return int(any(word in text for word in strong_spam_words))


In [14]:
df['has_strong_spam'] = df['cleaned'].apply(has_strong_spam_words)
df[['cleaned', 'has_strong_spam']].head(10)


Unnamed: 0,cleaned,has_strong_spam
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif u oni,0
2,free entry in a wkly comp to win fa cup final...,1
3,u dun say so early hor u c already then say,0
4,nah i dont think he goes to usf he lives aroun...,0
5,freemsg hey there darling its been weeks now ...,1
6,even my brother is not like to speak with me t...,0
7,as per your request melle melle oru minnaminun...,0
8,winner as a valued network customer you have b...,1
9,had your mobile months or more u r entitled t...,1


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [23]:
X = df['cleaned']
y = df['has_strong_spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
tf = TfidfVectorizer(max_features=3000)
X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)


In [25]:
# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train_tf, y_train)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train_tf, y_train)

# Extra Trees
et = ExtraTreesClassifier()
et.fit(X_train_tf, y_train)


In [26]:
import pickle
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tf, f)
import pickle

# Save RandomForestClassifier
with open('rf.pkl', 'wb') as f:
    pickle.dump(rf, f)

# Save ExtraTreesClassifier
with open('et.pkl', 'wb') as f:
    pickle.dump(et, f)

# Save MultinomialNB
with open('mnb.pkl', 'wb') as f:
    pickle.dump(mnb, f)



In [27]:
import pickle
import numpy as np

# Load vectorizer
with open('vectorizer.pkl', 'rb') as f:
    tf = pickle.load(f)

# Load models
with open('mnb.pkl', 'rb') as f:
    mnb = pickle.load(f)

with open('rf.pkl', 'rb') as f:
    rf = pickle.load(f)

with open('et.pkl', 'rb') as f:
    et = pickle.load(f)

# Set thresholds
mnb_thresh = 0.5
mnb_override_thresh = 0.15
strong_spam_keywords = ['free', 'win', 'cash', 'prize', 'urgent', 'lottery', 'claim', 'http', 'https']

def predict_spam(msg):
    # Keyword check
    msg_lower = msg.lower()
    strong_keyword = any(word in msg_lower for word in strong_spam_keywords)

    # Transform message
    vect_msg = tf.transform([msg])

    # Individual model predictions
    mnb_proba = mnb.predict_proba(vect_msg)[0][1]  # spam probability
    rf_pred = rf.predict(vect_msg)[0]
    et_pred = et.predict(vect_msg)[0]

    # Dynamic threshold
    dynamic_thresh = mnb_override_thresh if strong_keyword else mnb_thresh

    # Final decision
    if mnb_proba >= dynamic_thresh or rf_pred == 1 or et_pred == 1:
        return "spam"
    return "ham"


In [28]:
print(predict_spam("Congratulations! You've won a free iPhone. Click http://claim.com now."))
# Output: "spam"

print(predict_spam("Hey, are we still meeting for lunch?"))
# Output: "ham"


spam
ham


In [30]:
import pickle

# Save vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tf, f)  # tf is your fitted TfidfVectorizer

# Save MultinomialNB
with open('model.pkl', 'wb') as f:
    pickle.dump(mnb, f)

# Save RandomForestClassifier
with open('rf.pkl', 'wb') as f:
    pickle.dump(rf, f)

# Save ExtraTreesClassifier
with open('et.pkl', 'wb') as f:
    pickle.dump(et, f)
