# Get the data

In [2]:
import os

# Ensure your kaggle.json is in the correct location
os.makedirs(os.path.expanduser("~/.kaggle/"), exist_ok=True)

# Replace 'dataset-owner/dataset-name' with the actual dataset path from Kaggle
dataset = 'purusinghvi/email-spam-classification-dataset'

# Use Kaggle API to download the dataset
os.system(f'kaggle datasets download -d {dataset}')

# Unzip the dataset (optional, if it's a compressed file)
os.system(f'unzip {dataset.split("/")[-1]}.zip')


0

# creating pipelines

# steps:
1) Preprocessing the emails: 
> Strip headers, lowercase the email, remove punctuation,replace URLs, numbers,etc.

2) Vectorization:
> Use CountVectorizer or TfidfVectorizer to convert the processed text into a sparse matrix of feature vectors.

3) Building a classifier:
> Use classifiers like Logistic Regression, Random Forest, etc, and evaluate precision & recall

In [12]:
#import all modules
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score

In [15]:
#Download stopwords from nltk if not already downloaded
import nltk
nltk.download('stopwords')

#initialize stemming and stopwords
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

#preprocessing function
def preprocess_email(email,strip_headers=True,lower_case=True,
                     remove_punctuation=True,replace_urls=True,
                     replace_numbers=True,stem=True):
    # optinally strip headers(this assume that the email header is seperated by a "\n\n")
    if strip_headers:
        email_parts = email.split("\n\n",1) #Remove header if exists
        if len(email_parts) > 1:
            email = email_parts[1]  #Only take the body is header exists
            
    #convert to lowercase
    if lower_case:
        email = email.lower()
    
    #Replace URLs with "URL"
    if replace_urls:
        email = re.sub(r'http\S+|www\S+|https\S+','URL',email,flags=re.MULTILINE)

    #Replace numbers with "NUMBER"
    if replace_numbers:
        email = re.sub(r'\b\d+\b','NUMBER',email)
    
    #Replace Punctuation
    if remove_punctuation:
        email = email.translate(str.maketrans('','',string.punctuation))

    #Tokenize, remove stopwords and optionally stem
    words = email.split()
    words = [word for word in words if word not in stop_words] #remove stopwords
    if stem:
        words = [ps.stem(word) for word in words] #Apply stemming
    
    return ' '.join(words)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haide\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Organizing data

In [17]:
#dataframe
df = pd.read_csv('combined_data.csv')

#preprocess the emails
df['email_cleaned'] = df['text'].apply(lambda x: preprocess_email(x))

# Vectorize the cleaned email using TfidfVectorizer
vectorizer = TfidfVectorizer(binary=True) #inary Vectorization (presence or absence of a word)
X = vectorizer.fit_transform(df['email_cleaned'])


In [19]:
#train-test split
X_train,X_test,y_train,y_test = train_test_split(X,df['label'],test_size=0.2,random_state=42)
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

In [24]:
print('X_train: ',X_train.shape)
print('y_train: ',y_train.shape)
print('X_val: ',X_val.shape) 
print('y_val: ',y_val.shape)
print('X_test: ',X_test.shape)
print('y_test: ',y_test.shape)

X_train:  (53406, 254820)
y_train:  (53406,)
X_val:  (13352, 254820)
y_val:  (13352,)
X_test:  (16690, 254820)
y_test:  (16690,)


# Training & Evaluation

In [28]:
#Logistic Regression
lr = LogisticRegression()
lr.fit(X_train,y_train)

#predict on test set
y_pred = lr.predict(X_val)

#Evaluate the model
print("Logistic Regression Results: ")
print(classification_report(y_val,y_pred))
print("Accuracy: ",accuracy_score(y_val,y_pred))


Logistic Regression Results: 
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      6252
           1       0.97      0.99      0.98      7100

    accuracy                           0.98     13352
   macro avg       0.98      0.98      0.98     13352
weighted avg       0.98      0.98      0.98     13352

Accuracy:  0.981725584182145


In [26]:
#Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

#predict on test set
y_pred = rf.predict(X_val)

#Evaluate the model
print("Logistic Regression Results: ")
print(classification_report(y_val,y_pred))
print("Accuracy: ",accuracy_score(y_val,y_pred))


Logistic Regression Results: 
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      6252
           1       0.99      0.98      0.98      7100

    accuracy                           0.98     13352
   macro avg       0.98      0.98      0.98     13352
weighted avg       0.98      0.98      0.98     13352

Accuracy:  0.9837477531455961


In [27]:
knc = KNeighborsClassifier()
knc.fit(X_train,y_train)

y_pred = knc.predict(X_val)

print("KNeighborsClassifier Results: ")
print(classification_report(y_val,y_pred))
print('Accuracy: ',accuracy_score(y_val,y_pred))

KNeighborsClassifier Results: 
              precision    recall  f1-score   support

           0       0.97      0.35      0.52      6252
           1       0.63      0.99      0.77      7100

    accuracy                           0.69     13352
   macro avg       0.80      0.67      0.65     13352
weighted avg       0.79      0.69      0.65     13352

Accuracy:  0.6923307369682444


# Conclusion:

1) Best Model : RF with 98.3% acc score
2) KNN model score issue causes: 
> curse of dimensionality, computational complexity, Sensitivity to Noise..