In [198]:
import numpy as np
import pandas as pd

In [199]:
dataset = 'Email_Spam_Detection/spam.csv'

# Read the dataset

df = pd.read_csv(dataset, encoding='ISO-8859-1')

# Display the first few rows of the dataset

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [200]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [201]:
# remove columns
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [202]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [203]:
from bs4 import BeautifulSoup
import itertools

# Function to clean the text

def clean_text(text):
    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    # Remove punctuation
    import string
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert text to lowercase
    #text = text.lower()
    # Tokenize the text
    words = text.split()
    # remove numbers and underscores
    words = [word for word in words if word.isalpha()]

    return words

# Clean the 'text' column in the dataset

df['v2'] = df['v2'].apply(clean_text)

# Display the first few rows of the cleaned dataset

df.head()


Unnamed: 0,v1,v2
0,ham,"[Go, until, jurong, point, crazy, Available, o..."
1,ham,"[Ok, lar, Joking, wif, u, oni]"
2,spam,"[Free, entry, in, a, wkly, comp, to, win, FA, ..."
3,ham,"[U, dun, say, so, early, hor, U, c, already, t..."
4,ham,"[Nah, I, dont, think, he, goes, to, usf, he, l..."


In [204]:
from nltk.corpus import stopwords

# Function to remove stopwords

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

# Remove stopwords from the 'text' column in the dataset

df['v2'] = df['v2'].apply(remove_stopwords)

# Display the first few rows of the cleaned dataset with stopwords removed

df.head()

Unnamed: 0,v1,v2
0,ham,"[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,"[Ok, lar, Joking, wif, u, oni]"
2,spam,"[Free, entry, wkly, comp, win, FA, Cup, final,..."
3,ham,"[U, dun, say, early, hor, U, c, already, say]"
4,ham,"[Nah, I, dont, think, goes, usf, lives, around..."


In [205]:
def remove_words(words):
    filtered_tokens = [word for word in words if len(word) > 2]
    return filtered_tokens

df['v2'] = df['v2'].apply(remove_words)
df.head()

Unnamed: 0,v1,v2
0,ham,"[jurong, point, crazy, Available, bugis, great..."
1,ham,"[lar, Joking, wif, oni]"
2,spam,"[Free, entry, wkly, comp, win, Cup, final, tkt..."
3,ham,"[dun, say, early, hor, already, say]"
4,ham,"[Nah, dont, think, goes, usf, lives, around, t..."


In [206]:
from nltk.stem import WordNetLemmatizer

# Function to lemmatize words

def lemmatize_words(words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

# Lemmatize the 'text' column in the dataset

df['v2'] = df['v2'].apply(lemmatize_words)

# Display the first few rows of the cleaned dataset with stopwords removed and lemmatized

df.head()

Unnamed: 0,v1,v2
0,ham,"[jurong, point, crazy, Available, bugis, great..."
1,ham,"[lar, Joking, wif, oni]"
2,spam,"[Free, entry, wkly, comp, win, Cup, final, tkt..."
3,ham,"[dun, say, early, hor, already, say]"
4,ham,"[Nah, dont, think, go, usf, life, around, though]"


In [207]:
def remove_duplicates(words):
    filtered_tokens = [word for word, _ in itertools.groupby(words)]
    return filtered_tokens

df['v2'] = df['v2'].apply(remove_duplicates)
df.head()

Unnamed: 0,v1,v2
0,ham,"[jurong, point, crazy, Available, bugis, great..."
1,ham,"[lar, Joking, wif, oni]"
2,spam,"[Free, entry, wkly, comp, win, Cup, final, tkt..."
3,ham,"[dun, say, early, hor, already, say]"
4,ham,"[Nah, dont, think, go, usf, life, around, though]"


In [208]:
import pandas as pd



# Attempting to apply lower() directly
# This raises an error if the column has lists
df['v2'] = df['v2'].apply(lambda x: x.lower() if isinstance(x, str) else x)

# Handle lists properly
df['v2'] = df['v2'].apply(lambda x: " ".join(x).lower() if isinstance(x, list) else x.lower())
print(df)


        v1                                                 v2
0      ham  jurong point crazy available bugis great world...
1      ham                                 lar joking wif oni
2     spam  free entry wkly comp win cup final tkts may te...
3      ham                      dun say early hor already say
4      ham           nah dont think go usf life around though
...    ...                                                ...
5567  spam  this time tried contact pound prize claim easy...
5568   ham                          will going esplanade home
5569   ham                         pity mood soany suggestion
5570   ham  the guy bitching acted like interested buying ...
5571   ham                                 rofl its true name

[5572 rows x 2 columns]


In [209]:
# Save the cleaned dataset

df.to_csv('Email_spam_Detection/cleaned_spam.csv', index=False)

In [210]:
dataset = pd.read_csv('Email_Spam_Detection/cleaned_spam.csv')
dataset.head()

Unnamed: 0,v1,v2
0,ham,jurong point crazy available bugis great world...
1,ham,lar joking wif oni
2,spam,free entry wkly comp win cup final tkts may te...
3,ham,dun say early hor already say
4,ham,nah dont think go usf life around though


In [217]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Step 1: Load and prepare data

X = features
y = dataset['v1']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Choose kernel and set hyperparameters
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')

# Step 3: Train the model
svm_model.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = svm_model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

ValueError: Found input variables with inconsistent numbers of samples: [7610, 5572]