In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

try:
    # Standalone package; the vendored sklearn.externals.joblib was removed in scikit-learn 0.23.
    import joblib
except ImportError:
    # Fallback for old scikit-learn installs that only ship the vendored copy.
    from sklearn.externals import joblib

In [2]:
# Load spam.csv and discard the three 'Unnamed: N' columns in one chained step.
df = pd.read_csv('spam.csv', encoding="latin-1").drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
)
# Give the two remaining columns descriptive names.
df.columns = ['Class', 'message']

In [3]:
# Encode the textual class as a numeric target: ham -> 0, spam -> 1.
label_by_class = {'ham': 0, 'spam': 1}
df['label'] = df['Class'].map(label_by_class)

In [4]:
# Sanity check: (rows, columns) of the frame after adding the 'label' column.
df.shape

(5572, 3)

In [5]:
# Preview the first rows (Class, message, label) before the text is vectorized.
df.head()

Unnamed: 0,Class,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Turn each message into a vector of word counts, then hold out 33% of the rows for testing.
messages = df['message']
y = df['label']
cv = CountVectorizer()
# NOTE(review): the vocabulary is learned from every message before the split,
# so words that occur only in the held-out rows still shape the feature space.
X = cv.fit_transform(messages)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

- `X` holds the transformed features, i.e. the document-term matrix.
- After fitting and transforming the data we get a sparse matrix with 8k+ columns; each column is one of the distinct words found in the 5k+ records.
- The rows of this matrix (one per message, 5,572 in total) are split into training and test sets; every row keeps all 8k+ word columns.
- Each stored entry of `X` or `X_train` holds the row index of the message/text, the column index of a word extracted by the vectorizer, and the count of that word in that message. Ex: if the word "python" appears twice in a message/text, its entry has a count of 2.

In [7]:
# Print the stored entries of the sparse matrix: each line is
# (message row, vocabulary column) followed by that word's count in the message.
print(X) 

  (0, 8267)	1
  (0, 1069)	1
  (0, 3594)	1
  (0, 7645)	1
  (0, 2048)	1
  (0, 1749)	1
  (0, 4476)	1
  (0, 8489)	1
  (0, 3634)	1
  (0, 1751)	1
  (0, 4087)	1
  (0, 5537)	1
  (0, 1303)	1
  (0, 2327)	1
  (0, 5920)	1
  (0, 4350)	1
  (0, 8030)	1
  (0, 3550)	1
  (1, 5533)	1
  (1, 8392)	1
  (1, 4318)	1
  (1, 4512)	1
  (1, 5504)	1
  (2, 77)	1
  (2, 1156)	1
  :	:
  (5570, 1786)	1
  (5570, 3470)	1
  (5570, 2892)	1
  (5570, 7049)	1
  (5570, 1778)	1
  (5570, 8065)	1
  (5570, 2592)	1
  (5570, 5334)	1
  (5570, 1438)	1
  (5570, 7627)	1
  (5570, 3308)	1
  (5570, 7039)	1
  (5570, 4615)	1
  (5570, 1084)	1
  (5570, 8313)	1
  (5570, 4218)	1
  (5570, 3781)	1
  (5570, 7756)	1
  (5570, 3358)	1
  (5570, 4087)	1
  (5571, 6505)	1
  (5571, 7885)	1
  (5571, 4225)	2
  (5571, 5244)	1
  (5571, 7756)	1


In [8]:
# Check the container type of the training features returned by train_test_split.
type(X_train)

scipy.sparse.csr.csr_matrix

In [9]:
# Expand the sparse document-term matrix into a labelled DataFrame:
# one row per message, one column per vocabulary term.
# NOTE: this materialises every zero, so memory grows as rows x vocabulary size.
#
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
# Either call only returns the vocabulary; passing it as `columns=` is what labels the frame.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
df_result = pd.DataFrame(X.toarray(), columns=feature_names)

In [10]:
# Preview the dense table: one row per message, one column per vocabulary term.
df_result.head() 

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,ó_,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_result.shape  # (messages, distinct vocabulary terms)

(5572, 8672)

In [12]:
# Export the vocabulary extracted by the vectorizer, one word per line, to words.csv.
# The words go into their own frame rather than rebinding `df`, so the original
# dataset stays available and earlier cells can be re-run without a KeyError.
words = list(df_result.columns)
words_df = pd.DataFrame(words)
words_df.to_csv('words.csv', index=False, header=False)

In [13]:
# Naive Bayes classifier: train on the training split, evaluate on the held-out split.
clf = MultinomialNB().fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)  # computed but not displayed by this cell
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1587
           1       0.93      0.92      0.92       252

   micro avg       0.98      0.98      0.98      1839
   macro avg       0.96      0.95      0.96      1839
weighted avg       0.98      0.98      0.98      1839



The cell above builds a Naive Bayes model to predict whether a message is spam or not.

__Note:__ This model can still be improved considerably.

In [14]:
# Persist the trained classifier to disk; the call returns the list of files written.
# NOTE: only `clf` is saved here; the fitted CountVectorizer `cv` is not part of this file.
joblib.dump(value=clf, filename='NB_spam_model.pkl')

['NB_spam_model.pkl']

In [15]:
# Reload the persisted classifier from disk.
# The `with` block closes the file handle once loading finishes; the original
# open() call left it open for the lifetime of the kernel.
# SECURITY: joblib files are pickle-based, so only load files you created or trust.
with open('NB_spam_model.pkl', 'rb') as NB_spam_model:
    clf = joblib.load(NB_spam_model)

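This is a pickle file that stores the trained model, so whenever we want to use it we can simply load it instead of training from scratch. Loading the file does not by itself append new data; to learn from additional data the model must be fitted again (`MultinomialNB` also supports incremental updates via `partial_fit`).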