In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
import re
from nltk.corpus import stopwords

In [3]:
data = pd.read_csv('spamdata.csv')

In [4]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalpa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
df = pd.DataFrame(data)

In [7]:
def preprocess_text(v2):
    v2 = v2.lower()  # Lowercase
    v2 = re.sub(r'\d+', '', v2)  # Remove numbers
    v2 = re.sub(r'\W+', ' ', v2)  # Remove special characters
    v2 = re.sub(r'\s+', ' ', v2)  # Remove extra spaces
    v2 = v2.strip()  # Remove leading and trailing spaces
    return v2

In [8]:
df['v2'] = df['v2'].apply(preprocess_text)

In [9]:
stop_words = set(stopwords.words('english'))
df['v2'] = df['v2'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [10]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,go jurong point crazy available bugis n great ...,,,
1,ham,ok lar joking wif u oni,,,
2,spam,free entry wkly comp win fa cup final tkts st ...,,,
3,ham,u dun say early hor u c already say,,,
4,ham,nah think goes usf lives around though,,,


In [11]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['v2'])
y = df['v1'].apply(lambda x:1 if x=='spam' else 0)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
X_train.shape

(3900, 7656)

####1.naive bayes Algorithm

In [15]:

model = MultinomialNB()
model.fit(X_train, y_train)


In [16]:

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9611244019138756
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1453
           1       1.00      0.70      0.83       219

    accuracy                           0.96      1672
   macro avg       0.98      0.85      0.90      1672
weighted avg       0.96      0.96      0.96      1672



2.logistic regrression

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [18]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9491626794258373
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1453
           1       0.96      0.64      0.77       219

    accuracy                           0.95      1672
   macro avg       0.95      0.82      0.87      1672
weighted avg       0.95      0.95      0.94      1672



3.support vector machine

In [19]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
model.fit(X_train, y_train)


In [20]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9778708133971292
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1453
           1       0.98      0.85      0.91       219

    accuracy                           0.98      1672
   macro avg       0.98      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



4.Random forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

In [22]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9760765550239234
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      1453
           1       1.00      0.82      0.90       219

    accuracy                           0.98      1672
   macro avg       0.99      0.91      0.94      1672
weighted avg       0.98      0.98      0.98      1672



In [35]:
new_data = ["Congratulations! you got selectd for an interview at infosys", 
            "Please find the attached report for your review."]

# Preprocess the new data
new_data_transformed = vectorizer.transform(new_data)

In [36]:
predictions = model.predict(new_data_transformed)

# Print predictions
for email, prediction in zip(new_data, predictions):
    label = 'Spam' if prediction == 1 else 'Not Spam'
    print(f"Email: {email}\nPrediction: {label}\n")

Email: Congratulations! you got selectd for an interview at infosys
Prediction: Not Spam

Email: Please find the attached report for your review.
Prediction: Not Spam



In [39]:
import pickle
pickle.dump(model,open('C:/Users/kalpa/Desktop/spam/model.pkl','wb'))

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

# Train and evaluate each model
i=1
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{i}. Model: {name}")
    i+=1
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("\n")