In [0]:
import pandas as pd
import numpy as np
import string

from nltk.tokenize import sent_tokenize, word_tokenize # nltk is the toolkit for natual language processing developed by Princeton Univ.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix

In [0]:
import nltk
nltk.download('stopwords')

In [0]:
df = pd.read_csv('spam.csv',encoding='latin-1')

In [0]:
df.drop(['Unnamed: 2',	'Unnamed: 3',	'Unnamed: 4'],axis=1,inplace=True)

In [0]:
df.columns = ['label','text']

In [0]:
def process_text(text):

    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    clean_words = [word for word in nopunc.split(' ') if word.lower() not in stopwords.words('english')]
    
    return clean_words

In [0]:
df['text'] = df['text'].apply(lambda x:process_text(x))

In [12]:
df.head(2)

Unnamed: 0,label,text
0,ham,"[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,"[Ok, lar, Joking, wif, u, oni]"


In [13]:
df.head()

Unnamed: 0,label,text
0,ham,"[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,"[Ok, lar, Joking, wif, u, oni]"
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,ham,"[U, dun, say, early, hor, U, c, already, say]"
4,ham,"[Nah, dont, think, goes, usf, lives, around, t..."


In [0]:
train_X,test_X,train_y,test_y = train_test_split(df['text'],df['label'],test_size=0.2)

In [15]:
print(train_X.shape,test_X.shape)

(4457,) (1115,)


In [0]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=process_text)), # converts strings to integer counts
#     ('tfidf',TfidfTransformer()), # converts integer counts to weighted TF-IDF scores
    ('classifier',MultinomialNB()) # train on TF-IDF vectors with Naive Bayes classifier
])

In [20]:
pipeline.fit(train_X,train_y)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function process_text at 0x7f3d6e09cae8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=No...None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [0]:
pred_y = pipeline.predict(test_X)

In [24]:
print(classification_report(test_y,pred_y))

             precision    recall  f1-score   support

        ham       0.87      1.00      0.93       970
       spam       1.00      0.01      0.03       145

avg / total       0.89      0.87      0.81      1115

