In [2]:
import pandas as pd 
data =  pd.read_csv('SMSSpamCollection.tsv', sep = '\t', names = ['label', 'message'])
data.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [3]:
text = data['message']
label = data['label']

#Number of Words
#x = lambda a : a + 10
#print(x(5))
data['word_count'] = data['message'].apply(lambda x: len(str(x).split(" ")))
data[['message','word_count']].head()

Unnamed: 0,message,word_count
0,I've been searching for the right words to tha...,37
1,Free entry in 2 a wkly comp to win FA Cup fina...,28
2,"Nah I don't think he goes to usf, he lives aro...",13
3,Even my brother is not like to speak with me. ...,16
4,I HAVE A DATE ON SUNDAY WITH WILL!!,8


In [4]:
#Number of characters
data['char_count'] = data['message'].str.len() ## this also includes spaces
data[['message','char_count']].head()

Unnamed: 0,message,char_count
0,I've been searching for the right words to tha...,196
1,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,"Nah I don't think he goes to usf, he lives aro...",61
3,Even my brother is not like to speak with me. ...,77
4,I HAVE A DATE ON SUNDAY WITH WILL!!,35


In [5]:
#Average Word Length
def avg_word(sentence):
  words = sentence.split()
  #print(words)
  return (sum(len(word) for word in words)/len(words))

data['avg_word'] = data['message'].apply(lambda x: avg_word(x))
data[['message','avg_word']].head()

Unnamed: 0,message,avg_word
0,I've been searching for the right words to tha...,4.324324
1,Free entry in 2 a wkly comp to win FA Cup fina...,4.571429
2,"Nah I don't think he goes to usf, he lives aro...",3.769231
3,Even my brother is not like to speak with me. ...,3.875
4,I HAVE A DATE ON SUNDAY WITH WILL!!,3.5


In [6]:
#Number of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

data['stopwords'] = data['message'].apply(lambda x: len([x for x in x.split() if x in stop]))
data[['message','stopwords']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,message,stopwords
0,I've been searching for the right words to tha...,19
1,Free entry in 2 a wkly comp to win FA Cup fina...,5
2,"Nah I don't think he goes to usf, he lives aro...",5
3,Even my brother is not like to speak with me. ...,6
4,I HAVE A DATE ON SUNDAY WITH WILL!!,0


In [7]:
#Number of special characters
data['hastags'] = data['message'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
data[['message','hastags']].head()

Unnamed: 0,message,hastags
0,I've been searching for the right words to tha...,0
1,Free entry in 2 a wkly comp to win FA Cup fina...,0
2,"Nah I don't think he goes to usf, he lives aro...",0
3,Even my brother is not like to speak with me. ...,0
4,I HAVE A DATE ON SUNDAY WITH WILL!!,0


In [8]:
#Number of numerics
data['numerics'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data[['message','numerics']].head()

Unnamed: 0,message,numerics
0,I've been searching for the right words to tha...,0
1,Free entry in 2 a wkly comp to win FA Cup fina...,2
2,"Nah I don't think he goes to usf, he lives aro...",0
3,Even my brother is not like to speak with me. ...,0
4,I HAVE A DATE ON SUNDAY WITH WILL!!,0


In [9]:
#Number of Uppercase words
data['upper'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data[['message','upper']].head()

Unnamed: 0,message,upper
0,I've been searching for the right words to tha...,1
1,Free entry in 2 a wkly comp to win FA Cup fina...,2
2,"Nah I don't think he goes to usf, he lives aro...",1
3,Even my brother is not like to speak with me. ...,0
4,I HAVE A DATE ON SUNDAY WITH WILL!!,8


In [10]:

pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
from textblob import TextBlob, Word, Blobber
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

data['noun_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'noun'))
data['verb_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'verb'))
data['adj_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adj'))
data['adv_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adv'))
data['pron_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'pron'))
data[['message','noun_count','verb_count','adj_count', 'adv_count', 'pron_count' ]].head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,message,noun_count,verb_count,adj_count,adv_count,pron_count
0,I've been searching for the right words to tha...,6,11,3,0,6
1,Free entry in 2 a wkly comp to win FA Cup fina...,14,3,4,0,0
2,"Nah I don't think he goes to usf, he lives aro...",1,5,0,3,3
3,Even my brother is not like to speak with me. ...,3,3,0,2,4
4,I HAVE A DATE ON SUNDAY WITH WILL!!,4,1,0,0,1


In [11]:
data[['message','word_count','char_count','avg_word','stopwords','hastags','numerics','upper','noun_count','verb_count','adj_count', 'adv_count', 'pron_count','label' ]].head()

Unnamed: 0,message,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,noun_count,verb_count,adj_count,adv_count,pron_count,label
0,I've been searching for the right words to tha...,37,196,4.324324,19,0,0,1,6,11,3,0,6,ham
1,Free entry in 2 a wkly comp to win FA Cup fina...,28,155,4.571429,5,0,2,2,14,3,4,0,0,spam
2,"Nah I don't think he goes to usf, he lives aro...",13,61,3.769231,5,0,0,1,1,5,0,3,3,ham
3,Even my brother is not like to speak with me. ...,16,77,3.875,6,0,0,0,3,3,0,2,4,ham
4,I HAVE A DATE ON SUNDAY WITH WILL!!,8,35,3.5,0,0,0,8,4,1,0,0,1,ham


In [13]:
features = data[['word_count','char_count','avg_word','stopwords','hastags','numerics','upper','noun_count','verb_count','adj_count', 'adv_count', 'pron_count']]
#label = data['label']

import numpy as np
classes_list = ["ham","spam"]
label_index = data['label'].apply(classes_list.index)
label = np.asarray(label_index)

In [14]:
import numpy as np
features_array = np.asarray(features)
features_array.shape

(5568, 12)

In [15]:
# data split into train and text
import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_array, label, test_size=0.33, random_state=42)

In [16]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.svm import SVC
model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))



from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train,y_train)
y_pred_naive = naive.predict(x_test)
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))

SVM
Accuracy score = 0.9314472252448314
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1608
           1       0.77      0.65      0.70       230

    accuracy                           0.93      1838
   macro avg       0.86      0.81      0.83      1838
weighted avg       0.93      0.93      0.93      1838

Naive Bayes
Accuracy score = 0.9221980413492927
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1608
           1       0.67      0.74      0.71       230

    accuracy                           0.92      1838
   macro avg       0.82      0.85      0.83      1838
weighted avg       0.93      0.92      0.92      1838



In [18]:
# data split into train and text
import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_array, label, test_size=0.33, random_state=42)
x_train.shape

(3730, 12)

In [21]:
data = pd.read_csv('SMSSpamCollection.tsv', sep = '\t', names = ['label','message'])
text = data['message']
class_label = data['label']

In [22]:
import numpy as np
classes_list = ["ham","spam"]
label_index = class_label.apply(classes_list.index)
label = np.asarray(label_index)

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.33, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (1,1))
x_train = vectorizer.fit_transform(X_train)
x_test = vectorizer.transform(X_test)

x_train.shape

(3730, 7118)

In [24]:
vectorizer.get_feature_names()

['00',
 '000',
 '008704050406',
 '0089',
 '0121',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07046744435',
 '07090201529',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808247860',
 '07808726822',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08006344447',
 '0808',
 '08081560665',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700469649',
 '08700621170150p',
 '08701213186',
 '08701237397',
 '08701417012',
 '08701417012150p',
 '087016248',
 '087

In [26]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.svm import SVC
model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))


from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train.toarray(),y_train)
y_pred_naive = naive.predict(x_test.toarray())
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))

SVM
Accuracy score = 0.9793253536452666
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1608
           1       0.99      0.84      0.91       230

    accuracy                           0.98      1838
   macro avg       0.98      0.92      0.95      1838
weighted avg       0.98      0.98      0.98      1838

Naive Bayes
Accuracy score = 0.8906420021762785
              precision    recall  f1-score   support

           0       0.98      0.89      0.93      1608
           1       0.54      0.87      0.67       230

    accuracy                           0.89      1838
   macro avg       0.76      0.88      0.80      1838
weighted avg       0.92      0.89      0.90      1838

