**Dataset**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
file_path = '/content/drive/MyDrive/email.csv'

In [None]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
print(df.columns)
print(df.info())
print(df.describe())
print(df.isnull().sum())
print(df.nunique())

Index(['Category', 'Message'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
       Category                 Message
count      5573                    5573
unique        3                    5158
top         ham  Sorry, I'll call later
freq       4825                      30
Category    0
Message     0
dtype: int64
Category       3
Message     5158
dtype: int64


**Preprocessing**

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/email.csv")

# Function to clean text data
def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\W', ' ', text)      # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)     # Remove extra whitespace
    text = text.lower()                  # Convert to lowercase
    return text

# Apply cleaning function to 'Message' column
df['cleaned_text'] = df['Message'].apply(clean_text)
print("Cleaned Text:")
print(df['cleaned_text'].head())

# Tokenization, removal of stopwords, and stemming
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
df['processed_text'] = df['cleaned_text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split() if word not in stop_words]))
print("\nProcessed Text:")
print(df['processed_text'].head())

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['processed_text'])
y = df['Category']
print("\nTF-IDF Vectorized Features:")
print(X.toarray())
print("\nLabels:")
print(y.head())


Cleaned Text:
0    go until jurong point crazy available only in ...
1                             ok lar joking wif u oni 
2    free entry in 2 a wkly comp to win fa cup fina...
3         u dun say so early hor u c already then say 
4    nah i don t think he goes to usf he lives arou...
Name: cleaned_text, dtype: object

Processed Text:
0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri 2 wkli comp win fa cup final tkt 21...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: processed_text, dtype: object

TF-IDF Vectorized Features:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Labels:
0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object


**Feature selection**

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
feature_selector = SelectKBest(score_func=chi2, k=500)
X_selected = feature_selector.fit_transform(X, y)
print("Selected Features:")
print(X_selected)

Selected Features:
  (0, 470)	0.3154994788205559
  (0, 204)	0.2677555522815362
  (0, 205)	0.3154994788205559
  (0, 343)	0.38998552404702763
  (0, 198)	0.22895039480876872
  (1, 479)	0.5404246413019002
  (1, 249)	0.5112478916942955
  (1, 319)	0.3404328442312527
  (2, 63)	0.2349290546928991
  (2, 357)	0.22696258273773448
  (2, 450)	0.17264382736031447
  (2, 412)	0.2763551241044243
  (2, 353)	0.2317464413068739
  (2, 360)	0.22696258273773448
  (2, 432)	0.16577271119306444
  (2, 180)	0.23605087201186084
  (2, 129)	0.28341096610044614
  (2, 480)	0.2034234028372385
  (2, 118)	0.2732574723060882
  (2, 485)	0.26771947142467245
  (2, 169)	0.5057953519856099
  (2, 190)	0.16206737867134877
  (3, 54)	0.37573397529496755
  (3, 160)	0.4515303487784322
  (3, 384)	0.6942157979102204
  :	:
  (5566, 190)	0.19348581543889293
  (5566, 205)	0.2279865848158193
  (5567, 35)	0.2939968388518677
  (5567, 10)	0.28591224422490796
  (5567, 161)	0.27315570035749925
  (5567, 87)	0.26793290545623183
  (5567, 123)	0.2

**Model**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Make predictions using Naive Bayes
nb_pred = nb_model.predict(X_test)

# Evaluate accuracy of Naive Bayes
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)

# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Make predictions using SVM
svm_pred = svm_model.predict(X_test)

# Evaluate accuracy of SVM
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy:", svm_accuracy)

Naive Bayes Accuracy: 0.9784688995215312
SVM Accuracy: 0.9802631578947368


**Model evaluation**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and classification report for Naive Bayes
print("Confusion Matrix (Naive Bayes):")
print(confusion_matrix(y_test, nb_pred))


# Confusion matrix and classification report for SVM
print("\nConfusion Matrix (SVM):")
print(confusion_matrix(y_test, svm_pred))



Confusion Matrix (Naive Bayes):
[[1444    8]
 [  28  192]]

Confusion Matrix (SVM):
[[1443    9]
 [  24  196]]


**Error rate**

In [None]:
# Error rate for Naive Bayes
nb_error_rate = 1 - nb_accuracy
print("Naive Bayes Error Rate:", nb_error_rate)

# Error rate for SVM
svm_error_rate = 1 - svm_accuracy
print("SVM Error Rate:", svm_error_rate)


Naive Bayes Error Rate: 0.021531100478468845
SVM Error Rate: 0.019736842105263164
