### SMS SPAM CLASSIFICATION 

#### Input Dataset: SMS Spam Collection Data Set
https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [1]:
# import statements

import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

### I. Data Load

In [2]:
# Tab-separated file without a header row; the two columns are named here:
# 'label' (ham / spam) and 'SMS' (the raw message text).
# NOTE(review): with pandas' default quote handling, a message containing an
# unbalanced double quote can swallow the following line(s). If the row count
# looks short, try quoting=csv.QUOTE_NONE -- TODO confirm against the raw file.
path = r'smsspamcollection/SMSSpamCollection'
df = pd.read_csv(
    path,
    sep='\t',
    header=None,
    names=['label', 'SMS'],
)
df.head()

Unnamed: 0,label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# (rows, columns) of the loaded frame
dataset_shape = df.shape
print(f'Structure: {dataset_shape}')

Structure: (5572, 2)


### II. Data Preprocessing

In [4]:
# Loop-invariant resources, built once instead of once per token / per message.
# A frozenset gives O(1) membership tests; the NLTK stop-word list would otherwise
# be re-read and scanned linearly for every single token.
# Requires the NLTK 'stopwords', 'punkt' and 'wordnet' resources to be installed.
stop_words = frozenset(stopwords.words('english'))
wordnet = WordNetLemmatizer()


def clean_sms(sms):
    """Normalise one SMS into a space-separated string of lemmatised tokens.

    Steps: lower-case the text, replace every non-letter character with a
    space, tokenise, drop English stop words and lemmatise what is left.

    Parameters
    ----------
    sms : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; an empty string if no token survives.
    """
    # Digits and punctuation become spaces, so only alphabetic tokens remain.
    letters_only = re.sub('[^a-zA-Z]', ' ', sms.lower())
    tokens = nltk.word_tokenize(letters_only)
    return ' '.join(
        wordnet.lemmatize(token) for token in tokens if token not in stop_words
    )


# One cleaned string per message, in the same order as df.SMS.
corpus = [clean_sms(sms) for sms in df.SMS]

### III. Bag Of Words Creation
-> Builds the matrix of independent features: one row per SMS and one column per vocabulary word, holding that word's count in the message.

In [None]:
# Bag-of-words features, restricted to the 2500 most frequent terms of the corpus.
cv = CountVectorizer(max_features=2500)

# The vectorizer yields a sparse count matrix; convert it to a dense array.
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()
X.shape

### III (Alternative). TF-IDF Creation

In [5]:
# TF-IDF features, restricted to the 2500 most frequent terms of the corpus.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split, so vocabulary and IDF weights also see the test messages.
# Consider fitting on the training portion only -- TODO confirm intent.
tf_idf = TfidfVectorizer(max_features=2500)

# The vectorizer yields a sparse matrix; convert it to a dense array.
tfidf_weights = tf_idf.fit_transform(corpus)
X = tfidf_weights.toarray()
X.shape

(5572, 2500)

In [6]:
# The label column is categorical, so expand it into indicator (dummy) columns:
# one column per class, named with the 'class-' prefix.
y = pd.get_dummies(
    df.label,
    prefix='class',
    prefix_sep='-',
)
y

Unnamed: 0,class-ham,class-spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [7]:
# A single binary column is enough to encode both classes: 1 -> spam, 0 -> ham.
# The vector is derived straight from the label column instead of slicing the
# last column off the indicator frame and rebinding `y` onto itself. That way
# the cell can be re-run safely (an ndarray has no .iloc) and the result does
# not depend on the column order produced by get_dummies.
y = (df.label == 'spam').astype('uint8').values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

### IV. Train Test Split

In [8]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Report (feature-matrix shape, label-vector shape) for the train and test parts.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print((features.shape, target.shape))

((4457, 2500), (4457,))
((1115, 2500), (1115,))


### V. Model Training

Naive Bayes models usually work well for text classification. If the accuracy turns out to be unsatisfactory, the model can be fine-tuned further.

In [9]:
# Multinomial Naive Bayes with default hyper-parameters, trained on the
# training split; fit() returns the estimator itself, which is then displayed.
model = MultinomialNB().fit(X_train, y_train)
model

MultinomialNB()

### VI. Testing the Model

In [10]:
# Predict a class label (1 -> spam, 0 -> ham) for every held-out message.
y_pred = model.predict(X_test)

### VII. Evaluation of the Model

In [11]:
# Rows are the true classes and columns the predicted classes, both ordered
# 0 (ham) then 1 (spam).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix

array([[967,   1],
       [ 19, 128]], dtype=int64)

In [12]:
# Fraction of test messages whose predicted label matches the true label.
# NOTE(review): the classes are imbalanced, so accuracy alone can hide missed
# spam; read it together with the confusion matrix.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_score

0.9820627802690582