In [74]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

In [75]:
data = pd.read_csv('Spam_classification.csv')

In [76]:
data.head()

Unnamed: 0,label,origin
0,0,Received: from rodan.UU.NET by aramis.rutgers....
1,1,Received: from unknown (HELO groucho.cs.psu.ed...
2,1,Received: \n\tfrom 24-151-178-89.dhcp.kgpt.tn....
3,0,Received: from psuvax1.cs.psu.edu ([130.203.2....
4,1,Received: from 201-1-198-159.dsl.telesp.net.br...


In [77]:
data.shape

(37822, 2)

In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37822 entries, 0 to 37821
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   37822 non-null  int64 
 1   origin  37822 non-null  object
dtypes: int64(1), object(1)
memory usage: 591.1+ KB


In [79]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [80]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [81]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(tokens)

In [82]:
data['processed_text'] = data['origin'].apply(preprocess_text)

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=15000, stop_words=['this','is','that','and','are','I'])
X = vectorizer.fit_transform(data['processed_text'])
y = data['label']

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayers Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Naive Bayers Accuracy: 0.9866490416391276

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2534
           1       0.99      0.99      0.99      5031

    accuracy                           0.99      7565
   macro avg       0.98      0.99      0.99      7565
weighted avg       0.99      0.99      0.99      7565



In [86]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9949768671513549

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2534
           1       1.00      0.99      1.00      5031

    accuracy                           0.99      7565
   macro avg       0.99      1.00      0.99      7565
weighted avg       0.99      0.99      0.99      7565



In [87]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

y_pred_svm = svm_classifier.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.9931262392597489

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2534
           1       1.00      0.99      0.99      5031

    accuracy                           0.99      7565
   macro avg       0.99      0.99      0.99      7565
weighted avg       0.99      0.99      0.99      7565

