# Multinomial Naive Bayes based SMS Classifier

In [28]:
import pandas as pd

# Load the tab-separated SMS corpus. The file carries no header row, so the
# column names are supplied here: the ham/spam class tag comes first and the
# message text second.
docs = pd.read_csv(
    'sms.tsv',
    sep='\t',
    header=None,
    names=['Class', 'sms'],
)

# Preview the first few rows.
docs.head()

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
# Tally how many messages belong to each class.
# Series.value_counts() returns the frequency of every distinct value in the
# column (most common first), not the number of distinct values.
ham_spam = docs['Class'].value_counts()
ham_spam

ham     4825
spam     747
Name: Class, dtype: int64

In [30]:
# Encode the class tag as an integer target: ham -> 0, spam -> 1.
# Any value not listed in the mapping would come out as NaN.
docs['label'] = docs['Class'].map({
    'ham': 0,
    'spam': 1,
})

In [31]:
# Preview the frame again, now including the numeric `label` column.
docs.head(n=5)

Unnamed: 0,Class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [53]:
# Features are the raw message texts; the target is the numeric label
# (0 = ham, 1 = spam).
X = docs['sms']
y = docs['label']

# Both Series should report the same number of rows.
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [54]:
# Split the messages and labels into train and test partitions.
# train_test_split shuffles before splitting, so without a fixed seed every
# "Restart & Run All" produces a different partition (and therefore a different
# vocabulary size and accuracy). Pinning random_state makes the run repeatable.
# The split proportions are left at the library default.
# NOTE: outputs recorded from an earlier, unseeded run will differ until the
# notebook is re-executed.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,  # arbitrary fixed seed; any constant integer works
)

In [55]:
# Peek at a few training messages; the index shows the original row positions
# that ended up in the training split.
X_train.head(n=5)

169     Great escape. I fancy the bridge but needs her...
3676                             Whos this am in class:-)
2192    Thankyou so much for the call. I appreciate yo...
2112    Yar he quite clever but aft many guesses lor. ...
3713                                   Wat u doing there?
Name: sms, dtype: object

In [56]:
# Bag-of-words vectorizer: converts each message into token counts and drops
# scikit-learn's built-in English stop-word list.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(
    stop_words='english',
)

In [57]:
# Learn the vocabulary from the training messages only, so nothing from the
# test split leaks into the feature space. fit() returns the vectorizer
# itself, which is what this cell displays.
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [58]:
# Encode both splits as document-term count matrices using the vocabulary that
# was learned from the training messages.
X_train_transformed, X_test_transformed = (
    vect.transform(X_train),
    vect.transform(X_test),
)

In [59]:
# The vectorizer returns a SciPy sparse matrix rather than a dense array;
# its shape is (number of training messages, vocabulary size).
print(type(X_train_transformed), X_train_transformed.shape, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
(4179, 7156)


In [61]:
# Train a multinomial Naive Bayes classifier on the token counts and score it
# on the held-out split.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator, so construction and training are chained.
mnb = MultinomialNB().fit(X_train_transformed, y_train)

# Hard class predictions for the test messages.
y_pred_class = mnb.predict(X_test_transformed)

# Per-class probabilities: one row per test message, one column per class.
y_pred_proba = mnb.predict_proba(X_test_transformed)

# To score an ad-hoc message, run it through the same fitted vectorizer first,
# e.g. mnb.predict(vect.transform(['Hope you enjoyed your new content'])).

# Overall accuracy: fraction of test messages whose predicted class matches
# the true label.
metrics.accuracy_score(y_test, y_pred_class)

0.9870782483847811