In [5]:
import pandas as pd

# SMS Spam Collection, available from the UCI ML archives.
# The file is read as tab-separated with no header row, so the two columns
# are named explicitly here.
df = pd.read_table(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Data preprocessing: encode the text labels numerically (spam -> 1, ham -> 0).
#
# A plain `.map(dict)` turns every value that is not a key of the dict into NaN,
# so re-running this cell (when the column already holds 0/1) would wipe out the
# labels. Falling back to the current value keeps the cell safe to re-run.
label_codes = {"ham": 0, "spam": 1}
df["label"] = df["label"].map(lambda value: label_codes.get(value, value))

# Fail loudly here if the column contains anything other than the two classes,
# instead of letting a bad label reach the classifier.
assert df["label"].isin([0, 1]).all(), "unexpected values in the label column"

print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Split the messages and their labels into train and test sets.
# `train_test_split` is imported from `sklearn.model_selection`; the old
# `sklearn.cross_validation` module no longer exists in scikit-learn.
from sklearn.model_selection import train_test_split as tts

# No test_size is passed, so the library's default split is used;
# random_state is fixed so the split is reproducible between runs.
xtrain, xtest, ytrain, ytest = tts(df["message"], df["label"], random_state=1)
print("rows in total: ", df.shape[0])
print("rows in train data: ", xtrain.shape[0])
print("rows in test data: ", xtest.shape[0])

rows in total:  5572
rows in train data:  4179
rows in test data:  1393


In [15]:
# Apply the bag-of-words model to the dataset: each message is represented by
# how many times each word occurs in it. The vectorizer handles the cleaning
# steps (lowercasing, dropping punctuation) and, with stop_words="english",
# also discards common English stop words.
from sklearn.feature_extraction.text import CountVectorizer as CV

cntvec = CV(stop_words="english")

# Printing the vectorizer shows the parameters it is configured with.
print(cntvec)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [24]:
# Fit the vectorizer on the training messages and turn both splits into
# document-term count matrices.
#
# The vocabulary is learned from the training data only (fit_transform) and
# then re-used unchanged for the test data (transform). The vectorizer has to
# be fitted before anything is transformed, and `traindata` must be the
# resulting matrix (not the vectorizer object) because it is what the
# classifier is trained on.
traindata = cntvec.fit_transform(xtrain)
testdata = cntvec.transform(xtest)

# Dense copy of the training counts, used only to display them as a table.
wordarr = traindata.toarray()

# Newer scikit-learn releases expose the vocabulary via get_feature_names_out();
# older ones only have get_feature_names(). Support both.
if hasattr(cntvec, "get_feature_names_out"):
    feature_names = cntvec.get_feature_names_out()
else:
    feature_names = cntvec.get_feature_names()

# freqmat: one row per training message, one column per vocabulary word,
# each entry the number of times that word occurs in that message.
freqmat = pd.DataFrame(wordarr, columns=feature_names)
print(freqmat.shape)
freqmat.head()

(4179, 7204)


Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Naive Bayes classifier (multinomial variant).
from sklearn.naive_bayes import MultinomialNB as mnb

# Create the model with its default settings, then train it on the
# vectorized training messages and their labels.
naibay = mnb()
naibay.fit(X=traindata, y=ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# Run the trained classifier on the vectorized test messages to get the
# predicted label for each one.
predictions = naibay.predict(X=testdata)

In [19]:
# Evaluate the model with a few standard performance metrics.
#   true positives  -> messages classified as spam that really are spam
#   false positives -> messages classified as spam that are actually ham
#   false negatives -> spam messages that were classified as ham
#   accuracy  = (number of correct predictions) / (total number of predictions)
#   precision = true_positives / (true_positives + false_positives)
#   recall (sensitivity) = true_positives / (true_positives + false_negatives)
#   f1        = harmonic mean of precision and recall
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# (caption, scoring function) pairs, reported in this order.
score_reports = (
    ("Accuracy Score: ", accuracy_score),
    ("Precision Score: ", precision_score),
    ("Recall Score: ", recall_score),
    ("F1 Score: ", f1_score),
)
for caption, scorer in score_reports:
    print(caption, scorer(ytest, predictions))

Accuracy Score:  0.9877961234745154
Precision Score:  0.9615384615384616
Recall Score:  0.9459459459459459
F1 Score:  0.9536784741144414
