<a href="https://colab.research.google.com/github/jerinSabrina/NLP-projects/blob/main/kaggle_dataset_spam_message_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **# Data Preprocessing**

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
df = pd.read_csv('/content/spam_ham_dataset.csv') #file are tap seperated, files name tsv and need to told sep type.


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [7]:
#check missing data
df.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [8]:
# check backside of data
df.tail()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0
5170,4807,spam,Subject: important online banking alert\r\ndea...,1


In [9]:
#check statistical analysis
df.describe()


Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


In [10]:
#Check how many of data is ham or spam
df['label'].value_counts()

label
ham     3672
spam    1499
Name: count, dtype: int64

In [11]:
df.label.value_counts(normalize=True) #The dataset is imbalanced. so we can check also the percentage of data ratio.

label
ham     0.710114
spam    0.289886
Name: proportion, dtype: float64

In [12]:
#So we created two bucket to seperate the ham and spam labeled data. One bucket is for ham data. and another for spam data
df_ham = df[df['label'] == 'ham']
df_spam = df[df['label'] == 'spam']

df_ham.shape,df_spam.shape

((3672, 4), (1499, 4))

In [13]:
#By using sample function, we will take how many data we need to take.
df_ham = df_ham.sample(df_spam.shape[0])
df_ham.shape,df_spam.shape

((1499, 4), (1499, 4))

In [14]:
# Use the concat function instead of append
# Now the seperate two bucket append to one bucket named data as before
# Some issues can happen as same index.
# So we need to ignore the index.

data = pd.concat([df_ham, df_spam], ignore_index=True)
data.shape

(2998, 4)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.3, random_state=0, shuffle= True )

In [17]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((2098,), (900,), (2098,), (900,))

# **2. Building the Model (Random Forest)**

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer # Tfidf Vector to extract the feature
from sklearn.ensemble import RandomForestClassifier #Randomforest classifier

from sklearn.pipeline import Pipeline  #pipeline object needed to run sklearn . part of sklearn pipeline

In [19]:
classifier = Pipeline([('tfidf', TfidfVectorizer()),('rfclassifer', RandomForestClassifier(n_estimators= 10))]) #creating object


In [20]:
classifier.fit(X_train, y_train)

# **3. Predicting the Result (Random Forest)**

In [21]:
y_prad = classifier.predict(X_test)

In [22]:
y_test , y_prad

(311      ham
 1025     ham
 1586    spam
 2939    spam
 2978    spam
         ... 
 1678    spam
 1611    spam
 757      ham
 39       ham
 1042     ham
 Name: label, Length: 900, dtype: object,
 array(['spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham',
        'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham',
        'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham',
        'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham',
        'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham',
        'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam',
        'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam',
        'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam',
        'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham',
        'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [24]:
accuracy_score(y_test, y_prad) # We got 91% accurate result. Our model is quite good.

0.9522222222222222

In [25]:
print(confusion_matrix(y_test, y_prad))

[[468  22]
 [ 21 389]]


In [26]:
print(classification_report(y_test, y_prad))

              precision    recall  f1-score   support

         ham       0.96      0.96      0.96       490
        spam       0.95      0.95      0.95       410

    accuracy                           0.95       900
   macro avg       0.95      0.95      0.95       900
weighted avg       0.95      0.95      0.95       900



# **4. Building the Model (SVM)**

In [27]:
from sklearn.svm import SVC #svm classifier

In [28]:
classifier_svm = Pipeline([('tfidf', TfidfVectorizer()),('svc', SVC( C = 100, gamma = 'auto'))])

In [29]:
classifier_svm.fit(X_train, y_train)

# **5. Predicting the Result (SVM)**

In [30]:
y_pred = classifier_svm.predict(X_test)

In [31]:
accuracy_score(y_test, y_pred)

0.45555555555555555

# Testing the two model SVM and RandomForest to check
**bold text**

In [32]:
test1 = ' Hi ! I hope this mail finds you well. I just need the book you borrowed. regards Sab'
test2 = 'Congratulations! you won a lottery ticket worth $1 million ! To calim call on 22222'

In [33]:
print (classifier.predict([test1]))
print (classifier.predict([test2]))

['spam']
['spam']


In [34]:
print(classifier_svm.predict([test1]))
print(classifier_svm.predict([test2]))

['spam']
['spam']
