#### Data Description

The dataset is a collection of SMS messages and have been classified as Spam or Ham.
Have used NLP to represent the text in the SMS into numbers and perform a classification using Naive Bayes classifier.

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from sklearn import metrics

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saurabh.sinha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
sms_data = pd.read_csv("../Data/sms_spam.csv")

In [5]:
sms_data.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
sms_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
type    5574 non-null object
text    5574 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
sms_data.shape

(5574, 2)

#### Convert the spam and ham to 1 and 0 values respectively for probability testing

In [8]:
sms_data['type'].replace(['ham', 'spam'], [0, 1], inplace = True)

In [9]:
sms_data.head()

Unnamed: 0,type,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Stopwords: Words which are filtered before applying NLP as they have no meaning.

In [10]:
stopset = set(stopwords.words('english'))

In [11]:
stopset

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
#TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)
tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'it', 'this', 'but', 'which', 'because', 'out', 'once', 'how', 'so', 'myself', 'yourselves', 'all', 'have', 'was', 'then', 'there', 'weren', 'against', "hasn't", "should've", 'needn', 'shan', 'y', 'up', 'these', 'theirs', 'has', 'with', 'only', "haven't", 'she', "it's", "couldn't", "need...n', 'didn', 'between', 'being', 'its', 'while', 'further', 'by', "mightn't", 'yours', 'where', 'at'},
        strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
tfidf_vectorizer.fit(sms_data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'it', 'this', 'but', 'which', 'because', 'out', 'once', 'how', 'so', 'myself', 'yourselves', 'all', 'have', 'was', 'then', 'there', 'weren', 'against', "hasn't", "should've", 'needn', 'shan', 'y', 'up', 'these', 'theirs', 'has', 'with', 'only', "haven't", 'she', "it's", "couldn't", "need...n', 'didn', 'between', 'being', 'its', 'while', 'further', 'by', "mightn't", 'yours', 'where', 'at'},
        strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
# Convert sms_data.txt from text to features
# X is a Document-Term matrix with [n_samples, n_features]
X = tfidf_vectorizer.fit_transform(sms_data.text) 

In [14]:
X

<5574x8586 sparse matrix of type '<class 'numpy.float64'>'
	with 47400 stored elements in Compressed Sparse Row format>

In [15]:
# Dimension of the Document-Term matrix
X.shape

(5574, 8586)

In [16]:
# Dependent variable will be 'spam' or 'ham' 
y = sms_data.type
y.shape

(5574,)

#### First SMS 

In [17]:
sms_data["text"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [18]:
# Spliting the SMS to separate text into individual words
splt_txt0 = sms_data["text"][0].split()
print(splt_txt0)

['Go', 'until', 'jurong', 'point,', 'crazy..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'Cine', 'there', 'got', 'amore', 'wat...']


In [19]:
# Number of words in first sms
len(splt_txt0)

20

In [20]:
# Most frequent word apears in first sms
max(splt_txt0)

'world'

In [21]:
X[0]

<1x8586 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

There are 20 words in the first sms out of which only 14 elements have been taken

#### Second SMS 

In [22]:
sms_data["text"][1]

'Ok lar... Joking wif u oni...'

In [23]:
# Spliting the SMS to separate text into individual words
splt_txt1 = sms_data["text"][1].split()
print(splt_txt1)

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']


In [24]:
# Number of words in second sms
len(splt_txt1)

6

In [25]:
# Most frequent word apears in second sms
max(splt_txt1)

'wif'

In [26]:
X[1]

<1x8586 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

There are 6 words in the second sms out of which 5 elements have been taken

In [27]:
print(X)

  (0, 3536)	0.1570070817542793
  (0, 4316)	0.3466185073652293
  (0, 5877)	0.2711124074492608
  (0, 2316)	0.26843531434169243
  (0, 1301)	0.25926284833436075
  (0, 1746)	0.2928268764441005
  (0, 3620)	0.19147848622350877
  (0, 8428)	0.23446497404204308
  (0, 4442)	0.2928268764441005
  (0, 1744)	0.3308854638944828
  (0, 2038)	0.2928268764441005
  (0, 3580)	0.1625034702178997
  (0, 1074)	0.3466185073652293
  (0, 8218)	0.19367543856970723
  (1, 5466)	0.27190435673704183
  (1, 4478)	0.4083285209202484
  (1, 4284)	0.5236769406481622
  (1, 8333)	0.4316309977097208
  (1, 5493)	0.5466195966483365
  (2, 3340)	0.11532016948053561
  (2, 2931)	0.3598966605883333
  (2, 8387)	0.19049443007546943
  (2, 2155)	0.19443486429295845
  (2, 8345)	0.14768604533962174
  (2, 3068)	0.46962403601340863
  :	:
  (5569, 165)	0.3330442123216397
  (5569, 5384)	0.3330442123216397
  (5570, 3876)	0.3652144637345925
  (5570, 3549)	0.3642455181785356
  (5570, 3327)	0.5597074067013798
  (5570, 2963)	0.6485917181474956
  (55

##### Observation 

There are 14 tf-idf values for the first sms, 5 if-idf values for the second sms and so on.....

In [28]:
# Most freaquent word across all the SMSes
max(tfidf_vectorizer.get_feature_names())

'zyada'

In [29]:
# Split the test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [30]:
# Train Naive Bayes Classifier

nb_clf = naive_bayes.MultinomialNB()
model = nb_clf.fit(X_train, y_train)

In [31]:
predicted_class=model.predict(X_test)
print(predicted_class)

[0 0 0 ... 0 0 0]


#### Probability of assigning a sms to a specific class 

In [32]:
prob = model.predict_proba(X_test)

In [33]:
prob

array([[0.99571389, 0.00428611],
       [0.99776692, 0.00223308],
       [0.99463735, 0.00536265],
       ...,
       [0.95699131, 0.04300869],
       [0.99345174, 0.00654826],
       [0.87579345, 0.12420655]])

In [34]:
# Checking Model's accuracy
metrics.accuracy_score(y_test, predicted_class)

0.966284074605452

** Observation: ** With this model the success rate is 96.6%