In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
        #Reading the text file
    
df=pd.read_table('SMSSpamCollection.txt',header=None)
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [37]:
y=df[0]
y.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [38]:
    #Labelling ham as 0 and Spam as 1
    
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_enc=le.fit_transform(y)
y_enc

array([0, 0, 1, ..., 0, 0, 0])

In [39]:
text=df[1]

In [40]:
    #Replacing email addresses with emailaddr, urls with httpaddr, money symbol with moneysymb, phone numbers with 
    #phonenumbr, numbers with numbr

processed = text.str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b',
                                 'emailaddr')
processed = processed.str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)',
                                  'httpaddr')
processed = processed.str.replace(r'£|\$', 'moneysymb')    
processed = processed.str.replace(
    r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
    'phonenumbr')    
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')


    #Removing puctuations and whitespace

processed = processed.str.replace(r'[^\w\d\s]', ' ')
processed = processed.str.replace(r'\s+', ' ')
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [41]:
    #Converting into lower strings

processed=processed.str.lower()

In [42]:
    #Removing stopwords
    
import nltk 
from nltk.corpus import stopwords
stopwords=nltk.corpus.stopwords.words('english')

In [43]:
processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in set(stopwords)))

In [44]:
processed.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbr wkly comp win fa cup final tk...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: 1, dtype: object

In [50]:
    #Stemming (Stemming is the process of reducing the words to their root form like fishing into fish, available
    # into avail)
    
from nltk.stem import PorterStemmer
porter=nltk.PorterStemmer()
processed=processed.apply(lambda x: ' '.join(porter.stem(term) for term in x.split()))
    

In [59]:
    #Feature Engineering

from sklearn.feature_extraction.text import  TfidfVectorizer
vectorizer=TfidfVectorizer(ngram_range=(1,2))
X_ngrams=vectorizer.fit_transform(processed)
X_ngrams.shape

(5572, 36323)

In [61]:
    #Training and evaluating the model
    
from sklearn.model_selection import train_test_split
from sklearn import svm
X_train,X_test,y_train,y_test=train_test_split(X_ngrams,y_enc,test_size=0.2,random_state=42,stratify=y_enc)

In [64]:
clf=svm.LinearSVC(loss='hinge')
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)

In [65]:
    #Confusion Metric
    
from sklearn import metrics
pd.DataFrame(metrics.confusion_matrix(y_test,y_pred),index=[['actual','actual'],['spam','ham']],columns=[['predicted','predicted'],['spam','ham']])

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,spam,ham
actual,spam,965,1
actual,ham,19,130


In [68]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9820627802690582