# **Spam Detector Using Python**

**by :** **Harrish Ragavendar S**

## **Aim:**
### The task is to analyse the given text dataset, and classify the text as ham or spam text.

In [24]:
import pandas as pd
nltk.download('stopwords')
df = pd.read_csv("spam.csv",encoding="latin")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
tags = df["v1"]
text = df["v2"]

#**Cleaning the data**

* Converting all charecters to lowercase
* Removing numbers and special charecters
* Removing stopwords
* Stem(Combining rootwords)

In [19]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

#defining the stopwords
stop = stopwords.words("english")

stemmer = SnowballStemmer("english")

In [26]:
import re
def cleaner(messages):
    clean_text=[]
    for msg in messages:
        #converting to lowercase
        cln = msg.lower()

        #removing numbers and special charecters
        cln = re.findall(r'[a-zA-Z]+',cln)

        #removing stopwords
        cln = [ w for w in cln if not w in stop]

        cln = " ".join(cln)
        clean = stemmer.stem(cln)
        clean_text.append(clean)
    return clean_text

In [29]:
clean_text = cleaner(text)

# **Vectorizing the data**

In [44]:
from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(ngram_range = (0,2),encoding="latin",max_features=20000)

x = cv.fit_transform(clean_text)
features = cv.get_feature_names()
dtm = pd.DataFrame(x.toarray(), columns=features)
dtm = dtm.drop([""], axis = 1)

dtm.head()



Unnamed: 0,aah,aathi,aathi dear,aathi love,abi,ability,abiola,abj,able,able deliver,able get,able pay,absolutly,absolutly fin,abt,abt already,abt tht,abt ur,abta,abta complimentary,aburo,aburo enjoy,ac,ac nat,academic,acc,accept,accept brother,accept day,access,access number,accident,accident claim,accidentally,accidentally deleted,accordingly,account,account balance,account bank,account details,...,yup send,yup shd,yup song,yup still,yup studying,yup thk,yup trying,yup wun,yupz,yupz oredi,zac,zac stand,zaher,zaher got,zealand,zebra,zebra animation,zed,zed per,zed pobox,zed profit,zeros,zeros savings,zf,zf msg,zhong,zhong se,zindgi,zindgi wo,zoe,zoe hit,zoe join,zogtorius,zogtorius staring,zoom,zoom cine,zouk,zouk nichols,zs,zs subscription
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# **Testing and Training the model**

In [36]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dtm, tags, test_size = 0.1, random_state = 40)

In [38]:
from sklearn.ensemble import RandomForestClassifier
print("Running Model")
classifier = RandomForestClassifier(200, n_jobs = -1, bootstrap = "True")
classifier.fit(x_train, y_train)
prediction = classifier.predict(x_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(prediction, y_test))

Running Model
0.974910394265233


In [43]:
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(y_test, prediction)
print('Precision : {}'.format(precision))
print('Recall : {}'.format(recall))
print('FScore : {}'.format(fscore))
print('Support : {}'.format(support))

Precision : [0.972167 1.      ]
Recall : [1.         0.79710145]
FScore : [0.9858871  0.88709677]
Support : [489  69]


### **Thus the given dataset is analyzed, and it is found that the given dataset has an accuracy of 97.49 %.**
---