## Spam SMS Classification

In [122]:
import pandas as pd
import numpy as np
#importing the dataframe, this data contains values as tab seperated
data = pd.read_csv('SMSSpamCollection',sep='\t',names=['Label','Message'])
print(data.info(),data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Label    5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None   Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


#### Data Preprocessing: Words Lemmatization

In [129]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Lemmatizing each row and saving it in words_lem and finally saving the output in corpus
lemmitizer = WordNetLemmatizer()
corpus = []
for i in range(0,len(data['Message'])):
    words_lem = re.sub('[^a-zA-Z]',' ',data['Message'][i])
    words_lem = words_lem.lower()
    words_lem = words_lem.split()
    words_lem = [lemmitizer.lemmatize(word) for word in words_lem 
                 if word not in stopwords.words('english')]
    corpus.append(' '.join(words_lem))
len(corpus)

5572

#### Bag of Words

In [130]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
CVect = CountVectorizer()
le = LabelEncoder()
X = CVect.fit_transform(corpus).toarray() # applying bag of words
Y = le.fit_transform(data['Label']) # Label encoding the ham and spam values
#-----------------------------
# Model fitting: Bag of Words
from sklearn.model_selection import train_test_split
X_train,X_vald,Y_train,Y_vald = train_test_split(X,Y,random_state=1,train_size=0.8)
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
lg = LogisticRegression()
model = lg.fit(X_train,Y_train)
Y_predict = model.predict(X_vald)

print('accuracy',model.score(X_vald,Y_vald))
print('RMSE',np.sqrt(mean_squared_error(Y_vald,Y_predict)))

accuracy 0.9838565022421525
RMSE 0.12705706496628802


#### TF_IDF

In [131]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
Vectorizer = TfidfVectorizer()
tf_idf = TfidfTransformer()
le = LabelEncoder()
X = Vectorizer.fit_transform(corpus) # converting 'corpus' into vector matrix
X = tf_idf.fit_transform(X) # TF_IDF Transformer
Y = le.fit_transform(data['Label']) # Encoding 'Y'

#--------------
# Model Fitting
from sklearn.model_selection import train_test_split
X_train,X_vald,Y_train,Y_vald = train_test_split(X,Y,random_state=1,train_size=0.8)
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
lg = LogisticRegression()
model = lg.fit(X_train,Y_train)
Y_predict = model.predict(X_vald)

print('accuracy',model.score(X_vald,Y_vald))
print('RMSE',np.sqrt(mean_squared_error(Y_vald,Y_predict)))

accuracy 0.9551569506726457
RMSE 0.21176177494381335
