In [85]:
import csv
from collections import Counter
import string
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk import WordNetLemmatizer

### Data importation and grouping

In [3]:
with open('SMSSpamCollection.txt', 'r') as file:
    csv_reader = csv.reader(file, delimiter = '\t')
    smsdata_data = [line[1] for line in csv_reader]
    
with open('SMSSpamCollection.txt', 'r') as file:
    csv_reader = csv.reader(file, delimiter = '\t')
    smsdata_label = [line[0] for line in csv_reader]


### The number of spam and ham messages present in the data

In [4]:
label_count = Counter(smsdata_label)
print(label_count)


Counter({'ham': 4825, 'spam': 747})


## Text preprocesssing

### A brief step by step method of how to remove punctuations from a sentence

In [5]:
a = 'This.one, is..for u?'
b = ' '.join(a)
b

'T h i s . o n e ,   i s . . f o r   u ?'

In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
z = [' ' if i in string.punctuation else i for i in a ]

In [13]:
z_ = ''.join(z)
z_

'This one  is  for u '

In [17]:
' '.join(z_)

'T h i s   o n e     i s     f o r   u  '

In [22]:
ex = [1,'.', 'a', '4']
sent = 'This is a boy 4 u.'
k = [' ' if i in ex else i for i in sent]

In [26]:
a = ''.join(k)
a

'This is   boy   u '

In [30]:
' '.join(a.split())

'This is boy u'

### punctuation from our sentence

do note that we are doing this for each line in the data and not on the whole data at once. so to effect this, we will have to iterate through our data and apply the function in each iteration

In [43]:
t = smsdata_data[2]
t

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [48]:
text = ' '.join((''.join([' ' if i in string.punctuation else i for i in t])).split())
text

'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry question std txt rate T C s apply 08452810075over18 s'

### tokenizing the sentenses based on white space

In [51]:
tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

### changing the words to lower case

In [54]:
tokens = [word.lower() for word in tokens]
tokens

['free',
 'entry',
 'in',
 '2',
 'a',
 'wkly',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005',
 'text',
 'fa',
 'to',
 '87121',
 'to',
 'receive',
 'entry',
 'question',
 'std',
 'txt',
 'rate',
 't',
 'c',
 's',
 'apply',
 '08452810075over18',
 's']

### removal of stop words

In [69]:
new_tokens = [i for i in tokens if i not in stopwords.words('english')]
print('The length of tokens before removal of stopwords is {} while the length after removal of stopwords is {}'.format(len(tokens), len(new_tokens)))

The length of tokens before removal of stopwords is 33 while the length after removal of stopwords is 25


### Keep words with length greater than 3

In [70]:
tokens = [word for word in new_tokens if len(word) >= 3]

### stemming

In [76]:
stemmer = PorterStemmer()

tokens = [stemmer.stem(word) for word in tokens]

### POS tagging

This is a prerequisite for lemmatization based on whether the word us noun or verb, this will reduce the word to the root word

In [82]:
tagged_corpus = pos_tag(tokens)

Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
lemmatizer = WordNetLemmatizer()

In [87]:
def prac_lemmatize(token, tag):
    if tag in Noun_tags:
        return lemmatizer.lemmatize(token, 'n')
    elif tag in verb_tags:
        return lemmatizer.lemmatize(token, 'v')
    else:
        return lemmatizer.lemmatize(token, 'n')
    

In [90]:
lemmatized_text = ' '.join([prac_lemmatize(token, tag) for token, tag in tagged_corpus])
lemmatized_text

'free entri wkli comp win cup final tkt 21st may 2005 text 87121 receiv entri question std txt rate appli 08452810075over18'

### Combinating the above text analysis steps into a function

In [98]:
def text_preprocessing(text):
    text = ' '.join((''.join([' ' if i in string.punctuation else i for i in text])).split())
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]
    new_tokens = [i for i in tokens if i not in stopwords.words('english')]
    tokens = [word for word in new_tokens if len(word) >= 3]
    stemmer = PorterStemmer()

    tokens = [stemmer.stem(word) for word in tokens]
    
    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()
    
    def prac_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')
    lemmatized_text = ' '.join([prac_lemmatize(token, tag) for token, tag in tagged_corpus])
    
    return lemmatized_text 
    
    
    
    

In [99]:
preprocessed_sms_data = [text_preprocessing(i) for i in smsdata_data]

In [100]:
preprocessed_sms_data[0:5]

['jurong point crazi avail bugi great world buffet cine get amor wat',
 'lar joke wif oni',
 'free entri wkli comp win cup final tkt 21st may 2005 text 87121 receiv entri question std txt rate appli 08452810075over18',
 'dun say earli hor alreadi say',
 'nah think goe usf live around though']

### Getting data ready to feed into the algorithm

In [101]:
import numpy as np

In [104]:
train_set_size = int(round(len(preprocessed_sms_data) * 0.70))
print('Train set size is', train_set_size)

Train set size is 3900


### splitting the training set and testing set

In [115]:
x_train = np.array([i for i in preprocessed_sms_data[:train_set_size]])
x_test = np.array([i for i in preprocessed_sms_data[train_set_size:]])

In [118]:
y_train =  np.array([i for i in smsdata_label[:train_set_size]])
y_test =  np.array([i for i in smsdata_label[train_set_size:]])

### converting the words into vectorizer format

In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [131]:
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words= 'english', max_features = 4000, strip_accents = 'unicode', norm = 'l2')

In [132]:
X_train_2 = vectorizer.fit_transform(x_train).todense()
X_test_2 = vectorizer.transform(x_test).todense()

In [157]:
from lightgbm import LGBMClassifier

### Training

In [153]:
clf = LGBMClassifier()
model = clf.fit(X_train_2, y_train)


### Predicting

In [154]:
predicted_train = clf.predict(X_train_2)
predicted_test = clf.predict(X_test_2)

In [155]:
from sklearn.metrics import classification_report, accuracy_score

In [156]:
accuracy_score(predicted_test, y_test)

0.9766746411483254

### I used lightgbm but feel free to use any classification model  of your choice