In [1]:
# Optional one-time setup: uncomment the next line to install the packages this
# notebook imports. `%pip` targets the running kernel's environment.
# %pip install pandas scikit-learn nltk

In [2]:
import pandas as pd

In [3]:
# Load the raw spam dataset from the sibling `dataset` directory.
# The file is decoded as ISO-8859-1 (presumably it is not valid UTF-8 -- confirm).
df = pd.read_csv(
    './../dataset/spam.csv',
    encoding='ISO-8859-1',
)

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5572, 5)

In [6]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Single Porter stemmer instance reused by the preprocessing cells below.
ps = PorterStemmer()

In [7]:
def tokenize(text):
    """Split a raw message into lower-cased alphanumeric tokens.

    Every character outside ``[a-zA-Z0-9]`` is replaced by a space, then the
    text is lower-cased and split on whitespace.

    Parameters
    ----------
    text : str or list
        Raw message text, or a token list produced by an earlier run.

    Returns
    -------
    list
        The tokens. A list input is returned unchanged, so re-running this
        cell on an already-tokenized column does not raise a TypeError.
    """
    if isinstance(text, list):
        return text
    return re.sub('[^a-zA-Z0-9]', ' ', text).lower().split()


# Overwrite the message column with token lists in a single pass; the corpus
# cell further down reads the tokens from df['v2'].
df['v2'] = df['v2'].map(tokenize)

In [8]:
# Preview the first five rows again; `v2` now holds token lists instead of raw text.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"[go, until, jurong, point, crazy, available, o...",,,
1,ham,"[ok, lar, joking, wif, u, oni]",,,
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",,,
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",,,
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...",,,


In [9]:
# Build the stop-word set once. Rebuilding it for every token (as a lambda
# inside filter() would) re-reads the word list for each token and makes this
# cell needlessly slow.
try:
    english_stopwords = set(stopwords.words('english'))
except LookupError:
    # The stopwords corpus is not installed in this environment; fetch it once.
    nltk.download('stopwords')
    english_stopwords = set(stopwords.words('english'))

# For each message: drop stop words, stem the remaining tokens, and join them
# back into one space-separated string (the input format TfidfVectorizer expects).
corpus = df['v2'].apply(
    lambda tokens: ' '.join(
        ps.stem(word) for word in tokens if word not in english_stopwords
    )
)

In [10]:
# Display the cleaned corpus: one stemmed, stop-word-free string per message.
corpus

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u u 750 pound prize 2 c...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: v2, Length: 5572, dtype: object

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with the library's default settings; it is fitted
# in the next cell and persisted to disk at the end of the notebook.
tfidf = TfidfVectorizer()

In [12]:
# Learn the vocabulary/IDF weights from the corpus and encode every message,
# then expand the result into a dense NumPy array.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split performed later in this notebook, so its IDF statistics
# include the held-out rows.
X_data = (
    tfidf
    .fit_transform(corpus.values)
    .toarray()
)

In [13]:
# Display the dense TF-IDF feature matrix (NumPy abbreviates the repr).
X_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# (number of messages, number of TF-IDF features).
X_data.shape

(5572, 7163)

In [15]:
# Target labels: the `v1` column, which holds the ham/spam class of each message.
y_data = df['v1']

In [16]:
# Number of labels; should match the row count of X_data.
y_data.shape

(5572,)

In [17]:
# Preview the first five labels.
y_data.head(n=5)

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split (and the accuracy reported below)
# is reproducible after Restart & Run All; stratify keeps the ham/spam ratio
# the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

In [19]:
from sklearn.naive_bayes import MultinomialNB

In [20]:
# Multinomial Naive Bayes classifier with smoothing parameter alpha=1.0 and
# fit_prior enabled.
model = MultinomialNB(
    alpha=1.0,
    fit_prior=True,
)

In [21]:
# Train the classifier on the training partition only.
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
# Predicted labels for the held-out messages.
y_pred = model.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score

# Share of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9704035874439462

In [24]:
from joblib import dump

In [25]:
# Persist the fitted vectorizer so new text can be encoded with the same
# vocabulary the model was trained on.
dump(value=tfidf, filename='./../savedModels/tfidf.joblib')

['./../savedModels/tfidf.joblib']

In [26]:
# Persist the trained classifier next to the vectorizer.
dump(value=model, filename='./../savedModels/model.joblib')

['./../savedModels/model.joblib']