# load data

In [1]:
import pandas as pd

In [5]:
data=pd.read_csv('sms.tsv',delimiter='\t',names=['label','messages'])

In [6]:
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data.shape

(5572, 2)

In [8]:
data['label']=data['label'].map({'ham':0,'spam':1})

In [9]:
data.head()

Unnamed: 0,label,messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
data.label.value_counts()

0    4825
1     747
Name: label, dtype: int64

# clean and prepare data

In [11]:
import re

def clean(x):
    #remove all html tags from data
    #remove all numbers from data
    #remove all special chars from data
    #etc..
    s = re.sub('[^A-Za-z]',' ',x)  #to replace everything except A-Z or a-z
    s = re.sub('\s+',' ',s)
    s = s.strip()
    
    return s.lower()

In [12]:
data['Messages']=data.messages.apply(clean)

In [13]:
X=data.Messages.values
Y=data.label.values

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=.25,random_state=12)

In [13]:
#if not already installed uncomment the following lines
#import nltk
#nltk.download()

In [16]:
from nltk.corpus import stopwords
stopwords=stopwords.words('english')

In [17]:
if 'not' in stopwords:
    stopwords.remove('not')

# transform text data into numeric

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv=CountVectorizer(stop_words=stopwords)

In [20]:
cv_train=cv.fit_transform(xtrain).toarray()
cv_test=cv.transform(xtest).toarray()

In [21]:
cv.get_feature_names()

['aah',
 'aaniye',
 'aaooooright',
 'aathi',
 'ab',
 'abbey',
 'abeg',
 'aberdeen',
 'abi',
 'ability',
 'abiola',
 'abj',
 'able',
 'aboutas',
 'absence',
 'absolutly',
 'abstract',
 'abt',
 'abta',
 'aburo',
 'abuse',
 'abusers',
 'ac',
 'academic',
 'acc',
 'accent',
 'accenture',
 'accept',
 'access',
 'accidant',
 'accident',
 'accidentally',
 'accommodation',
 'accommodationvouchers',
 'accomodate',
 'accomodations',
 'accordin',
 'accordingly',
 'account',
 'accounting',
 'accounts',
 'accumulation',
 'ache',
 'acid',
 'acl',
 'acnt',
 'aco',
 'across',
 'act',
 'acted',
 'actin',
 'acting',
 'action',
 'activ',
 'activate',
 'active',
 'activities',
 'actor',
 'actual',
 'actually',
 'ad',
 'adam',
 'add',
 'addamsfa',
 'added',
 'addicted',
 'addie',
 'adding',
 'address',
 'adds',
 'adi',
 'admin',
 'administrator',
 'admirer',
 'admission',
 'admit',
 'adore',
 'adress',
 'adrian',
 'adrink',
 'ads',
 'adsense',
 'adult',
 'advance',
 'adventure',
 'adventuring',
 'advice',


In [22]:
cv_train.shape

(4179, 6489)

In [23]:
cv_test.shape

(1393, 6489)

# Train Model

# naive bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix,recall_score,precision_score

import numpy as np

In [23]:
nb=MultinomialNB()
nb.fit(cv_train,ytrain)
test_score=nb.score(cv_test,ytest)
test_score

0.9741564967695621

In [24]:
pred=nb.predict(cv_test)

In [25]:
np.bincount(ytest)

array([1197,  196], dtype=int64)

In [26]:
confusion_matrix(ytest,pred)

array([[1182,   15],
       [  21,  175]], dtype=int64)

In [27]:
recall_score(ytest,pred)

0.8928571428571429

In [28]:
precision_score(ytest,pred)

0.9210526315789473

# logistic regression

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
#nb = MultinomialNB()
log = LogisticRegression(C=.01,class_weight={1:3})

log.fit(cv_train,ytrain)

test_score = log.score(cv_test,ytest)
test_score

0.9540559942569993

In [31]:
log_pred = log.predict(cv_test)

In [32]:
confusion_matrix(ytest,log_pred)

array([[1182,   15],
       [  49,  147]], dtype=int64)

In [33]:
recall_score(ytest,log_pred)

0.75

In [34]:
precision_score(ytest,log_pred)

0.9074074074074074

# evaluate model on test data

In [35]:
test = ['Get free tickets..!Win cash','hi john I will call you later']

In [36]:
cleaned_data=[]
for i in test:
    t=clean(i)
    cleaned_data.append(t)

In [37]:
cleaned_data

['get free tickets win cash', 'hi john i will call you later']

In [38]:
t1=cv.transform(cleaned_data)

In [39]:
t1.shape

(2, 6489)

In [40]:
nb.predict(t1)

array([1, 0], dtype=int64)

In [41]:
log.predict(t1)

array([0, 0], dtype=int64)