# Ham/Spam Classification

In [14]:
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import numpy as np

In [2]:
# Load the tab-separated SMS corpus. The file carries no header row, so the
# two column names are supplied explicitly.
df = pd.read_table(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [3]:
# Raw NumPy array of the frame's contents (columns: label, message).
# `DataFrame.get_values()` was deprecated in pandas 0.25 and removed in 1.0;
# the `.values` attribute yields the same array for a dense frame like this
# one and is available on both old and new pandas versions.
values = df.values

#### Replacing the text labels with numbers

In [4]:
# Encode the class labels numerically: ham -> 0, spam -> 1.
# Any label that is not a key of the mapping becomes NaN.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

#### Converting all characters in the messages to lower case

In [5]:
# Lower-case every message with the vectorised string accessor rather than a
# per-row Python lambda. Unlike calling `x.lower()` inside `map`,
# `.str.lower()` passes missing values through as NaN instead of raising
# AttributeError.
df['message'] = df['message'].str.lower()

#### Removing any punctuation

In [6]:
# Strip every character that is neither a word character nor whitespace.
# - The pattern is a raw string so that `\w` / `\s` reach the regex engine
#   untouched (in a plain literal they are invalid escape sequences).
# - `regex=True` is spelled out because pandas 2.0 changed the default to
#   literal matching, which would silently leave all punctuation in place.
df['message'] = df['message'].str.replace(r'[^\w\s]', '', regex=True)

#### Tokenizing the messages

First, we have to import NLTK and download its tokenizer from the console.
Running the download command opens an installation window: go to the "Models" tab, select "punkt" in the "Identifier" column, then click "Download" to install the necessary files.

In [None]:
# NLTK is imported here because the tokenisation cell below calls
# `nltk.word_tokenize`.
import nltk
# Called without arguments, this opens NLTK's interactive downloader; pick the
# "punkt" package there as described above.
# NOTE(review): the interactive prompt blocks a "Restart & Run All" run —
# presumably `nltk.download('punkt')` would fetch the same package directly;
# confirm against the installed NLTK version before switching.
nltk.download()

Now we can apply the tokenization:

In [7]:
# Split each message string into a list of word tokens.
df['message'] = df['message'].map(nltk.word_tokenize)

#### Stemming the messages using the Porter Stemmer algorithm

In [8]:
# Reduce every token to its stem with NLTK's Porter stemmer, keeping one list
# of stems per message.
stemmer = PorterStemmer()
df['message'] = df['message'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)

#### Transforming the messages into word-occurrence counts

In [9]:
# CountVectorizer works on plain strings, so glue each token list back into a
# single space-separated string first.
df['message'] = df['message'].map(' '.join)

# Learn the vocabulary and build the document-term count matrix.
# NOTE(review): the vocabulary is fitted on every message, before the
# train/test split further down — confirm that is intended.
count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['message'])

#### Applying Term Frequency–Inverse Document Frequency (TF-IDF) weighting

In [10]:
# Fit the IDF weights on the count matrix and re-weight it in one step.
# After this cell `counts` holds TF-IDF scores rather than raw occurrences.
transformer = TfidfTransformer()
counts = transformer.fit_transform(counts)

## Training the model

#### Splitting data into training and test sets 

In [11]:
# Hold out 10% of the messages for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    counts,
    df['label'],
    test_size=0.1,
    random_state=60,
)

#### Initializing and fitting the Multinomial Naive Bayes classifier

In [12]:
# Create the multinomial Naive Bayes classifier with its default settings and
# train it on the training split. The bare `fit` call stays on the last line
# so the fitted estimator is echoed as the cell output.
model = MultinomialNB()
model.fit(X=x_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Evaluating the model

In [15]:
# Predict the held-out labels and report accuracy: the fraction of
# predictions that match the true labels.
predicted = model.predict(x_test)
print((predicted == y_test).mean())

0.960573476703


#### Looking at the confusion matrix

In [16]:
# Rows are the true classes, columns the predicted ones (0 = ham, 1 = spam).
print(confusion_matrix(y_true=y_test, y_pred=predicted))

[[487   1]
 [ 21  49]]
