# Spam detection via text classification

In [None]:
# Spam detection from SMS text messages
#
# We use text classification to identify spam messages.
# The adopted approach is multinomial Naive Bayes.
#
# The data set can be downloaded from Kaggle:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset; the file is decoded as ISO-8859-1 instead of pandas' default UTF-8
df = pd.read_csv(
    "spam.csv",
    encoding='ISO-8859-1',
)

In [3]:
# Drop the three surplus 'Unnamed' columns. errors='ignore' makes this cell safe to
# re-run: once the columns are gone, a second drop would otherwise raise KeyError.
df=df.drop(columns=['Unnamed: 2','Unnamed: 3', 'Unnamed: 4'],errors='ignore')

In [4]:
# Preview the first five rows after dropping the surplus columns
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Text Preprocessing

In [5]:
from nltk.corpus import stopwords

In [6]:
# Cache for NLTK's English stop-word list, so the corpus is loaded at most once
# instead of once per token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def removestopwords(x, stop_words=None):
    """Lower-case a message and remove its stop words.

    Parameters
    ----------
    x : str
        Raw message text.
    stop_words : collection of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    str
        The lower-cased text without stop words, tokens joined by single spaces.

    Notes
    -----
    Tokens come from splitting on single spaces only. Punctuation therefore
    stays attached to its word (``"the,"`` is not treated as ``"the"``), and
    consecutive spaces produce empty tokens, which are kept.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    # Membership is tested against the prepared collection, not a list rebuilt per word.
    return " ".join(w for w in x.lower().split(" ") if w not in stop_words)

In [7]:
# Build the 'Clean' column from the raw messages in 'v2' by running each one
# through removestopwords
cleaned_messages = df['v2'].apply(removestopwords)
df['Clean'] = cleaned_messages

In [8]:
# Replace every digit and every non-word character with a space.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text unchanged.
# Note: '|' inside a character class is a literal pipe (already covered by \W), and
# \W does not match underscores, so those are kept.
df['Clean']=df['Clean'].str.replace(r'[\d|\W]',' ',regex=True)

In [9]:
# Preview the first five rows, now including the 'Clean' column
df.head(n=5)

Unnamed: 0,v1,v2,Clean
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n gre...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


# Split the data into training and testing sets 

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Select features and labels by column name rather than by position, so the
# wrong column is not picked silently if the frame's column order changes.
X=df['Clean'].values   # cleaned message text
y=df['v1'].values      # class label ('ham' / 'spam' in the preview above)

In [12]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=21,
)

# Feature extraction

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words vectorizer with its default settings
Count = CountVectorizer()

In [15]:
# Fit the vectorizer on the training messages only, then apply that same fitted
# vectorizer to the test messages
Count_train = Count.fit_transform(X_train)
Count_test = Count.transform(X_test)

# Text classification using Multinomial Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
# Multinomial Naive Bayes with its default settings written out explicitly
NB = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# Train the classifier on the training count matrix and its labels
NB.fit(X=Count_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# Score the fitted classifier on the training data and on the held-out test data
train_score = NB.score(Count_train, y_train)
test_score = NB.score(Count_test, y_test)
print("Score of training set is {}".format(train_score))
print("Score of testing set is {}".format(test_score))

Score of training set is 0.9939421135292797
Score of testing set is 0.9838565022421525
