In [65]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB,GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from src.utils import evaluate_score

In [6]:
df=pd.read_csv('data/SMSSpamCollection',sep='\t',names=['label','message'])

In [7]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
X=df.drop('label',axis=1)
y=df['label']

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


In [10]:
X_train.head()

Unnamed: 0,message
1978,Reply to win £100 weekly! Where will the 2006 ...
3989,Hello. Sort of out in town already. That . So ...
3935,How come guoyang go n tell her? Then u told her?
4078,Hey sathya till now we dint meet not even a si...
4086,Orange brings you ringtones from all time Char...


In [11]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [12]:
y_train

1978    spam
3989     ham
3935     ham
4078     ham
4086    spam
        ... 
3772     ham
5191     ham
5226     ham
5390     ham
860      ham
Name: label, Length: 4457, dtype: object

In [96]:
models={
    "GaussianNB":GaussianNB(),
    "SVC":SVC(),
    "KNeighborsClassifier":KNeighborsClassifier(),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    # "XGBClassifier":XGBClassifier()
}


pandas.core.series.Series

In [36]:
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
ps=PorterStemmer()
lemetizer=WordNetLemmatizer()


In [33]:
msg=df['message'][0]
lemetizer.lemmatize(msg)


'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [34]:
corpus=[]

groups=[]
for i in df['message']:
    groups.append(i) 

In [41]:
corpus=[]
for i in range(0,len(df)):
    review=re.sub('^[a-zA-Z]',' ',df['message'][i])
    review=review.lower()
    review=review.split()
    review=[lemetizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)
    
    

In [42]:
pd.DataFrame(corpus)

Unnamed: 0,0
0,"jurong point, crazy.. available bugis n great ..."
1,k lar... joking wif u oni...
2,ree entry 2 wkly comp win fa cup final tkts 21...
3,dun say early hor... u c already say...
4,"ah think go usf, life around though"
...,...
5567,2nd time tried 2 contact u. u £750 pound prize...
5568,ill ü b going esplanade fr home?
5569,"ity, * mood that. so...any suggestions?"
5570,guy bitching acted like i'd interested buying ...


In [48]:
y=pd.get_dummies(df['label'])
y=y.iloc[:,0]
X_train,X_test,y_train,y_test=train_test_split(corpus,y,random_state=42,test_size=0.2)

In [49]:
len(X_train),len(y_train)

(4457, 4457)

In [51]:
X_train,y_train

(['eply win £100 weekly! 2006 fifa world cup held? send stop 87239 end service',
  'ello. sort town already. . dont rush home, eating nachos. let know eta.',
  'ow come guoyang go n tell her? u told her?',
  'ey sathya till dint meet even single time saw situation sathya.',
  'range brings ringtones time chart heroes, free hit week! go ringtones & pic wap. stop receiving tip reply stop.',
  'itting mu waiting everyone get suite take shower',
  'inish liao... u?',
  'rgent! mobile 07808726822 awarded £2,000 bonus caller prize 02/09/03! 2nd attempt contact you! call 0871-872-9758 box95qu',
  'robably not, still going stuff',
  'ah lucky man... save money... hee...',
  'ey u still gym?',
  'h... lk tt den take e one tt end cine lor... dun wan yogasana oso can...',
  'k lor...',
  'f still havent collected dough pls let know go place sent get control number',
  'tupid.its possible',
  'secret admirer. reveal think u r special. call 09065174042. opt reply reveal stop. 1.50 per msg recd. cus

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,ngram_range=(1,2))

In [54]:
X_transform_train=cv.fit_transform(X_train).toarray()
X_transform_test=cv.transform(X_test).toarray()

In [55]:
X_transform_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [98]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_transform_train,y_train)
    y_pred=model.predict(X_transform_test)
    accuracy_score=evaluate_score(y_test,y_pred)
    print(model,accuracy_score,'\n')
    

GaussianNB() 0.8484304932735426 

SVC() 0.9847533632286996 

KNeighborsClassifier() 0.9255605381165919 

DecisionTreeClassifier() 0.9704035874439462 

RandomForestClassifier() 0.9829596412556054 





AdaBoostClassifier() 0.979372197309417 

GradientBoostingClassifier() 0.9739910313901345 



In [71]:
X_transform_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [103]:
model=RandomForestClassifier()
model.fit(X_transform_train,y_train)
y_pred=model.predict(X_transform_test)

In [105]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred,y_test))

0.9829596412556054


In [106]:
modell=BernoulliNB()
modell.fit(X_transform_train,y_train)
print(accuracy_score(modell.predict(X_transform_test),y_test))

0.9829596412556054
