## SPAM MAIL PREDICTION


In [1]:

# importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

### data collection and preprocessing

In [2]:
# Read the mail dataset from its CSV file into a DataFrame and echo it.
MAIL_DATA_CSV = '/content/mail_data.csv'  # location of the raw data file
mail_data_df = pd.read_csv(MAIL_DATA_CSV)
print(mail_data_df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [3]:
# Count the null entries in every column to confirm nothing is missing.
missing_per_column = mail_data_df.isnull().sum()
print(missing_per_column)

Category    0
Message     0
dtype: int64


In [4]:
# Preview the first five rows of the dataset.
mail_data_df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### label encoding (converting text data to numerical)


*   ham --> 1
*   spam --> 0



In [5]:
# Trim leading/trailing whitespace from every column name so that later
# lookups such as 'Category' and 'Message' match exactly.
stripped_column_names = mail_data_df.columns.str.strip()
mail_data_df.columns = stripped_column_names

In [6]:
# Replace the text labels in the Category column with numeric codes, in place:
# 'ham' becomes 1 and 'spam' becomes 0 (applied in that order).
for text_label, numeric_code in (('ham', 1), ('spam', 0)):
    mail_data_df.loc[mail_data_df['Category'] == text_label, 'Category'] = numeric_code

In [7]:
# Separate the data into texts (x) and labels (y).
x, y = mail_data_df['Message'], mail_data_df['Category']


In [8]:
# Show the message texts (the Message column) that serve as model input.
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [9]:
# Show the encoded labels (the Category column: 1 = ham, 0 = spam).
# The column is still object dtype at this point; it is cast to int
# in the feature-extraction cell before training.
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [10]:
# Hold out 20% of the messages for testing; the fixed random_state makes
# the split repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=23,
)
# Report the sizes of the full, training and test sets.
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


## feature extraction

In [12]:
# Turn the raw message text into TF-IDF feature vectors that the
# classifiers below can consume. English stop words are dropped and the
# text is lower-cased before weighting.
feather_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vectorizer is fitted on the training text only and then reused,
# unchanged, to transform the test text.
x_train_feature = feather_extraction.fit_transform(x_train)
x_test_feature = feather_extraction.transform(x_test)

# Cast both label series to int before they are passed to the models.
y_train, y_test = y_train.astype(int), y_test.astype(int)

## training the model

### logistic regression

In [13]:
# Create a logistic regression classifier with default settings and fit it
# on the TF-IDF training features; fit() returns the estimator itself.
model = LogisticRegression().fit(x_train_feature, y_train)
model  # display the fitted estimator as the cell output

In [14]:
# The classifier was already fitted on this exact data in the previous cell;
# fitting it a second time only repeats the work. Just display the fitted
# estimator instead.
model

## evaluating the model

In [15]:
# Score the fitted model on the same features it was trained on.
train_data_predict = model.predict(x_train_feature)
accuracy_train_data = accuracy_score(y_true=y_train, y_pred=train_data_predict)
# Report the accuracy as a percentage.
print('accurcay of train data is : ', accuracy_train_data*100)

accurcay of train data is :  96.74669059905766


In [16]:
# Score the fitted model on the held-out test features.
test_data_predict = model.predict(x_test_feature)
accuracy_test_data = accuracy_score(y_true=y_test, y_pred=test_data_predict)
# Report the accuracy as a percentage.
print('accurcay of test data is : ', accuracy_test_data*100)

accurcay of test data is :  96.41255605381166


## MODEL 2

## support vector machine

In [17]:
# Train a support vector classifier on the TF-IDF training features.
# SVC is imported in the import cell at the top of the notebook.
# NOTE: this rebinds `model`, so from here on `model` is the SVC and no longer
# the logistic regression fitted earlier.
model = SVC(kernel='linear')  # Linear kernel is often effective for text classification
model.fit(x_train_feature,y_train)



In [18]:
 #Predict on the train dataset
 x_train_predict= model.predict(x_train_feature)

# Evaluate model performance
print("train data Accuracy:", accuracy_score(y_train, x_train_predict)*100)

train data Accuracy: 99.68588736818488


In [19]:
# Predict labels for the held-out test features with the current `model`.
x_test_predict = model.predict(x_test_feature)

# Compare the predictions with the true test labels and report the
# accuracy as a percentage.
svc_test_accuracy = accuracy_score(y_true=y_test, y_pred=x_test_predict)
print("test data Accuracy:", svc_test_accuracy*100)

test data Accuracy: 97.9372197309417
