### Honey Berk - HW10
### Document Classification - Multinomial Naive Bayes Classifier

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from textblob import TextBlob
import pandas
import sklearn
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split

In [2]:
# Load the SMS Spam Collection corpus and preview its first ten rows.
# Source: SMS Spam Collection Dataset (https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection)
# The file is tab-separated; column names are supplied here, and QUOTE_NONE
# makes the parser treat quote characters inside messages as ordinary text.
SMS_DATA_URL = 'https://raw.githubusercontent.com/honeyberk/DATA620/master/hw10/SMSSpamCollection.tsv'
smsspam = pandas.read_csv(
    SMS_DATA_URL,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=["category", "text"],
)
smsspam.head(10)

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# Summary statistics of the data, computed separately for each category
per_category = smsspam.groupby('category')
per_category.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
category,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4827
ham,unique,4518
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


In [4]:
# Show the first rows (if any) that have a missing value in at least one column
has_missing = smsspam.isnull().any(axis=1)
smsspam[has_missing].head()

Unnamed: 0,category,text


### After the corpus is loaded and examined, the words are lemmatized to return the base form of the word, for matching purposes.

In [5]:
# Lemmatize (maps the various forms of a word to the canonical or citation form of the word, the lemma)
def split_into_lemmas(text):
    """Return the lemma (base form) of every word in ``text``.

    The text is lower-cased, tokenised into words with TextBlob, and each
    word is replaced by its ``lemma``.

    Parameters
    ----------
    text : str or bytes
        One message. Byte strings are decoded as UTF-8; text strings are
        used as they are.

    Returns
    -------
    list
        The lemmas, in the order the words occur in ``text``.
    """
    # Decode only byte strings. The former ``unicode(text, 'utf8')`` call does
    # not exist on Python 3 and, on Python 2, raised TypeError when ``text``
    # was already unicode. On Python 2 ``bytes`` is ``str``, so plain strings
    # are still decoded exactly as before.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    words = TextBlob(text.lower()).words
    # for each word, take its "base form" = lemma
    return [word.lemma for word in words]

### The words are then encoded using the bag-of-words model, which counts the number of times each word appears in a message and creates a sparse vector of occurrence counts.

In [6]:
# Bag of words: learn the vocabulary from the corpus, then represent each
# message by how many times each vocabulary term occurs in it.
bag_transform = CountVectorizer(analyzer=split_into_lemmas).fit(smsspam['text'])
smsspam_bag = bag_transform.transform(smsspam['text'])

# Single-argument print() calls produce the same text on Python 2 and
# Python 3; the former print statements were a syntax error on Python 3.
n_rows, n_cols = smsspam_bag.shape
print('sparse matrix shape: %s' % (smsspam_bag.shape,))
print('number of non-zeros: %s' % smsspam_bag.nnz)
# NOTE: this value is the percentage of cells that are non-zero, i.e. the
# matrix density. The "sparsity" label is kept so the printed text matches
# the previously recorded output.
print('sparsity: %.2f%%' % (100.0 * smsspam_bag.nnz / (n_rows * n_cols)))

sparse matrix shape: (5574, 8874)
number of non-zeros: 80272
sparsity: 0.16%


### The data is then run through the Multinomial Naive Bayes classifier from the scikit-learn package. With a multinomial event model, samples (feature vectors) represent the frequencies with which certain events have been generated by a multinomial.

In [7]:
# Multinomial Naive Bayes: a classifier intended for discrete features such
# as the word counts produced by the bag-of-words step.
nb_model = MultinomialNB()
# fit() returns the fitted estimator; assigning it (rather than ending the
# cell on a bare expression) keeps the cell free of displayed output.
spam_id = nb_model.fit(smsspam_bag, smsspam['category'])

In [8]:
# Predict a label for every message. These are in-sample predictions: the
# classifier is applied to the same matrix it was fitted on.
predictions = spam_id.predict(smsspam_bag)
# print() with one argument works on both Python 2 and Python 3.
print(predictions)

['ham' 'ham' 'spam' ..., 'ham' 'ham' 'ham']


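### For this corpus of 5,574 records, this process yields an average precision of 0.99. Note that the classifier is evaluated on the same messages it was trained on, so this figure is an optimistic estimate of performance on unseen messages.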

In [9]:
# Per-class precision, recall, F1 and support for the predictions above.
# print() with one argument works on both Python 2 and Python 3.
print(classification_report(smsspam['category'], predictions))

             precision    recall  f1-score   support

        ham       0.99      1.00      0.99      4827
       spam       0.97      0.96      0.97       747

avg / total       0.99      0.99      0.99      5574

