<img height="60" width="120" src="https://shwetkm.github.io/upxlogo.png" alt="UpX Academy logo">
# UpX Academy - Machine Learning Track
# Naive Bayes Classifier

## Goal:  Train a Naive Bayes model to classify future SMS messages as either spam or ham.

Steps:

1.  Convert the labels ham and spam to a binary indicator variable (0/1)

2.  Convert the text to a sparse matrix of TF-IDF vectors

3.  Fit a Naive Bayes Classifier

4.  Measure your success using roc_auc_score



In [3]:
# Install the NLTK package if it is not already installed: uncomment the lines below.
# The Anaconda prompt should be opened with admin privileges (Run as administrator).
# Download the NLTK data if it has not already been downloaded.

#!conda install nltk      #Uncomment if required
#import nltk              #Uncomment if required
#nltk.download()          #Uncomment if required

In [36]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score, confusion_matrix

In [5]:
# Load the SMS messages from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="sms_spam.csv")

In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Train the classifier to predict whether a message is spam or ham based on its text

In [11]:
# Display NLTK's English stop-word list (requires the NLTK stopwords corpus to be downloaded).
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [12]:
# TF-IDF vectorizer: lowercases the text, strips accents to ASCII and drops
# NLTK's English stop words before weighting terms by inverse document frequency.
stopset = set(stopwords.words('english'))
# Pass the stop words as a sorted list rather than a set: recent scikit-learn
# releases validate estimator parameters when fitting and accept only
# 'english', a list or None for stop_words. Sorting also keeps the printed
# parameters in a stable order.
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset),
)

In [14]:
# Display the configured vectorizer to review its parameters.
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'yours', 'because', 'yourselves', 'about', 'wouldn', 'didn', 'had', 'our', 'whom', 'won', 'who', 'against', 'most', 'ma', 'them', 'which', 'from', 'me', 'an', 'to', 'or', 'yourself', 'am', 're', 'any', 'myself', 'haven', 'they', 'into', 'down', 'you', 'ourselves', 'some', 'why', 'then', ...tn', 'himself', 'and', 'again', 'should', 'having', 'o', 'she', 'after', 'between', 'only', 'weren'},
        strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

#### Convert the spam and ham labels to 1 and 0, respectively, so that predicted probabilities can be scored against them

In [15]:
# Map the 'spam' label to 1. Assign the result back to the column instead of
# calling replace(inplace=True) on the attribute-accessed Series: the in-place
# form acts on an intermediate object and is not guaranteed to update df
# (pandas warns about it and, with copy-on-write enabled, df stays unchanged).
df['type'] = df['type'].replace('spam', 1)

In [16]:
# Map the 'ham' label to 0, assigning the result back to the column rather
# than relying on replace(inplace=True) through attribute access, which is not
# guaranteed to update df. The cast to int gives a numeric 0/1 target and
# fails loudly if any label is still a string at this point.
df['type'] = df['type'].replace('ham', 0).astype(int)

In [17]:
# Preview the first five rows to confirm the labels are now 0/1.
df.head(n=5)

Unnamed: 0,type,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# (number of rows, number of columns) of the frame.
df.shape

(5574, 2)

In [19]:
## Target vector: the label column (1 for spam, 0 for ham after the conversion above)
y = df['type']

In [20]:
# Convert the message text in df['text'] into TF-IDF features; fit_transform
# learns the vocabulary and transforms the messages in one call.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the IDF weights also see the test messages -- consider
# fitting on the training split only.
X = vectorizer.fit_transform(df['text'])

In [21]:
# Sanity check: the target vector and the feature matrix must have the same number of rows.
print(y.shape)
print(X.shape)

(5574,)
(5574, 8586)


In [30]:
## Split into train and test sets: a quarter of the rows are held out
## (scikit-learn's default share, spelled out here), with a fixed seed
## so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [31]:
## Train the Naive Bayes classifier
## Fast (one pass over the training data)
## Copes well with sparse input: most of the 8586 vocabulary terms do not occur in any single message
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# Inspect the held-out labels (1 = spam, 0 = ham).
y_test

3690    0
3527    0
724     0
3370    0
468     0
5412    0
4362    0
4241    0
5442    0
5309    0
2232    0
3573    0
4379    0
3316    1
4895    0
296     1
453     0
4880    0
2034    0
4287    0
605     0
1615    0
5169    0
4655    0
2754    0
2727    0
4295    1
3893    1
2559    0
730     0
       ..
3768    0
3809    0
3034    0
5082    0
257     0
507     0
1438    0
99      0
1957    0
5216    1
3412    0
4058    0
3650    0
2707    0
1954    0
4028    0
2164    0
4564    0
366     0
2561    0
3680    0
4320    0
3133    0
949     0
4842    0
19      1
4758    0
668     0
218     0
4660    0
Name: type, dtype: int64

#### Check for null values in the type (label) column

In [33]:
# Select the rows whose label is missing; an empty result means every message has a label.
df.loc[df['type'].isnull()]

Unnamed: 0,type,text


#### There are no null values

In [34]:
# Predicted probability of the second class (label 1, i.e. spam) for each test message.
clf.predict_proba(X_test)[:,1]

array([ 0.00270358,  0.01501181,  0.0666378 , ...,  0.00803285,
        0.0139652 ,  0.00349621])

In [39]:
## Evaluate the model: ROC AUC computed from the predicted spam probabilities,
## then the confusion matrix of the hard predictions
## (rows: true class, columns: predicted class)
print("ROC =",roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))
confusion_matrix(y_test,clf.predict(X_test))

ROC = 0.986071035326


array([[1203,    0],
       [  39,  152]])

### With this model, the area under the ROC curve (AUC) is ~0.986 (about 98.6%)