In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:

from nltk.stem import WordNetLemmatizer
import pandas as pd
import string
import re

In [None]:
#Inspecting data
full_corpus = pd.read_csv('/content/gdrive/My Drive/nlp_proj/data/SMSSpamCollection', sep='\t', header=None, names=['label', 'msg_body'])
# print("Input data has {} rows and {} columns".format(len(full_corpus), len(full_corpus.columns)))
# print(full_corpus.info())

# Separating messages into ham and spam
ham_text = []
spam_text = []

In [None]:
def separate_msgs():
    for index, column in full_corpus.iterrows():
        label = column[0]
        message_text = column[1]
        if label == 'ham':
            ham_text.append(message_text)
        elif label == 'spam':
            spam_text.append(message_text)

separate_msgs()
ham_text = ham_text[:1500]

In [None]:
# Preprocessing of text

#removing punctuation marks from the email messages
def remove_msg_punctuations(msg):
    puntuation_removed_msg = "".join([word for word in msg if word not in string.punctuation])
    return puntuation_removed_msg


In [None]:
#converting text into lowercase and word tokenizing
def tokenize_into_words(text):
    tokens = re.split('\W+', text)
    return tokens


In [None]:
#lemmatizing
word_lemmatizer = WordNetLemmatizer()
def lemmatization(tokenized_words):
    lemmatized_text = [word_lemmatizer.lemmatize(word)for word in tokenized_words]
    return ' '.join(lemmatized_text)


In [None]:
def preprocessing_msgs(corpus):
    categorized_text = pd.DataFrame(corpus)
    categorized_text['non_punc_message_body'] = categorized_text[0].apply(lambda msg: remove_msg_punctuations(msg))
    categorized_text['tokenized_msg_body'] = categorized_text['non_punc_message_body'].apply(lambda msg: tokenize_into_words(msg.lower()))
    categorized_text['lemmatized_msg_words'] = categorized_text['tokenized_msg_body'].apply(lambda word_list: lemmatization(word_list))
    return categorized_text['lemmatized_msg_words']


In [None]:
# Extracting features i.e. n-grams
def feature_extraction(preprocessed_text):
    bigrams = []
    unigrams_lists = []
    for msg in preprocessed_text:
        # adding end of and start of a message
        msg = '<s> ' +msg +' </s>'
        unigrams_lists.append(msg.split())
    unigrams = [uni_list for sub_list in unigrams_lists for uni_list in sub_list]
    bigrams.extend(nltk.bigrams(unigrams))
    return bigrams


In [None]:
# removing bigrams only with stop words
stopwords = nltk.corpus.stopwords.words('english')
def filter_stopwords_bigrams(bigram_list):
    filtered_bigrams = []
    for bigram in bigram_list:
        if bigram[0] in stopwords and bigram[1] in stopwords:
            continue
        filtered_bigrams.append(bigram)
    return filtered_bigrams


In [None]:
# Acquiring frequencies of features
def ham_bigram_feature_frequency():
    # features frequency for ham messages
    ham_bigrams = feature_extraction(preprocessing_msgs(ham_text))
    ham_bigram_frequency = nltk.FreqDist(filter_stopwords_bigrams(ham_bigrams))
    return ham_bigram_frequency

def spam_bigram_feature_frequency():
    # features frequency for spam messages
    spam_bigrams = feature_extraction(preprocessing_msgs(spam_text))
    spam_bigram_frequency = nltk.FreqDist(filter_stopwords_bigrams(spam_bigrams))
    return spam_bigram_frequency


In [None]:
# calculating bigram probabilities
def bigram_probability(message):
    probability_h = 1
    probability_s = 1
    # preprocessing input messages
    punc_removed_message = "".join(word for word in message if word not in string.punctuation)
    punc_removed_message = '<s> ' +punc_removed_message +' </s>'
    tokenized_msg = re.split('\s+', punc_removed_message)
    lemmatized_msg = [word_lemmatizer.lemmatize(word)for word in tokenized_msg]
    # bigrams for message
    bigrams_for_msg = list(nltk.bigrams(lemmatized_msg))
    # stop words removed unigrams for vocabulary
    ham_unigrams = [word for word in feature_extraction(preprocessing_msgs(ham_text)) if word not in stopwords]
    spam_unigrams = [word for word in feature_extraction(preprocessing_msgs(spam_text)) if word not in stopwords]
    # frequecies of bigrams extracted
    ham_frequency = ham_bigram_feature_frequency()
    spam_frequency  = spam_bigram_feature_frequency()
    print('========================== Calculating Probabilities ==========================')
    print('----------- Ham Freuquencies ------------')
    for bigram in bigrams_for_msg:
        # probability of first word in bigram
        ham_probability_denominator = 0
        # probability of bigram (smoothed)
        ham_probability_of_bigram = ham_frequency[bigram] + 1
        print(bigram, ' occurs ', ham_probability_of_bigram)
        for (first_unigram, second_unigram) in filter_stopwords_bigrams(ham_unigrams):
            ham_probability_denominator += 1
            if(first_unigram == bigram[0]):
                ham_probability_denominator += ham_frequency[first_unigram, second_unigram]
        probability = ham_probability_of_bigram / ham_probability_denominator
        probability_h *= probability
    print('\n')
    print('----------- Spam Freuquencies ------------')
    for bigram in bigrams_for_msg:
        # probability of first word in bigram
        spam_probability_denominator = 0
        # probability of bigram (smoothed)
        spam_probability_of_bigram = spam_frequency[bigram] + 1
        print(bigram, ' occurs ', spam_probability_of_bigram)
        for (first_unigram, second_unigram) in filter_stopwords_bigrams(spam_unigrams):
            spam_probability_denominator += 1
            if(first_unigram == bigram[0]):
                spam_probability_denominator += spam_frequency[first_unigram, second_unigram]
        probability = spam_probability_of_bigram / spam_probability_denominator
        probability_s *= probability
    print('\n')
    print('Ham Probability: ' +str(probability_h))
    print('Spam Probability: ' +str(probability_s))
    print('\n')
    if(probability_h >= probability_s):
        print('\"' +message +'\" is a Ham message')
    else:
        print('\"' +message +'\" is a Spam message')
    print('\n')


In [None]:
bigram_probability('Sorry,  ..use your brain dear')
bigram_probability('SIX chances to win CASH.')

----------- Ham Freuquencies ------------
('<s>', 'Sorry')  occurs  1
('Sorry', 'use')  occurs  1
('use', 'your')  occurs  2
('your', 'brain')  occurs  2
('brain', 'dear')  occurs  2
('dear', '</s>')  occurs  10


----------- Spam Freuquencies ------------
('<s>', 'Sorry')  occurs  1
('Sorry', 'use')  occurs  1
('use', 'your')  occurs  3
('your', 'brain')  occurs  1
('brain', 'dear')  occurs  1
('dear', '</s>')  occurs  1


Ham Probability: 3.487628063507234e-25
Spam Probability: 4.9965257136356e-26


"Sorry,  ..use your brain dear" is a Ham message


----------- Ham Freuquencies ------------
('<s>', 'SIX')  occurs  1
('SIX', 'chance')  occurs  1
('chance', 'to')  occurs  1
('to', 'win')  occurs  1
('win', 'CASH')  occurs  1
('CASH', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'SIX')  occurs  1
('SIX', 'chance')  occurs  1
('chance', 'to')  occurs  18
('to', 'win')  occurs  19
('win', 'CASH')  occurs  1
('CASH', '</s>')  occurs  1


Ham Probability: 4.049455

In [None]:
bigram_probability(';Congratulations Dear Get Rs. 2000 Welcome Bonus Play Rummy and Register for Free Click Here')

----------- Ham Freuquencies ------------
('<s>', 'Congratulations')  occurs  1
('Congratulations', 'Dear')  occurs  1
('Dear', 'Get')  occurs  1
('Get', 'Rs')  occurs  1
('Rs', '2000')  occurs  1
('2000', 'Welcome')  occurs  1
('Welcome', 'Bonus')  occurs  1
('Bonus', 'Play')  occurs  1
('Play', 'Rummy')  occurs  1
('Rummy', 'and')  occurs  1
('and', 'Register')  occurs  1
('Register', 'for')  occurs  1
('for', 'Free')  occurs  1
('Free', 'Click')  occurs  1
('Click', 'Here')  occurs  1
('Here', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'Congratulations')  occurs  1
('Congratulations', 'Dear')  occurs  1
('Dear', 'Get')  occurs  1
('Get', 'Rs')  occurs  1
('Rs', '2000')  occurs  1
('2000', 'Welcome')  occurs  1
('Welcome', 'Bonus')  occurs  1
('Bonus', 'Play')  occurs  1
('Play', 'Rummy')  occurs  1
('Rummy', 'and')  occurs  1
('and', 'Register')  occurs  1
('Register', 'for')  occurs  1
('for', 'Free')  occurs  1
('Free', 'Click')  occurs  1
('Click', 'H

In [None]:
bigram_probability('Hi Shyam, book your batch now for Internshala Online Summer Trainings. Get up to 55% + 10% off on trainings till 15th March bit.ly/summer-launch20')

----------- Ham Freuquencies ------------
('<s>', 'Hi')  occurs  1
('Hi', 'Shyam')  occurs  1
('Shyam', 'book')  occurs  1
('book', 'your')  occurs  1
('your', 'batch')  occurs  1
('batch', 'now')  occurs  1
('now', 'for')  occurs  1
('for', 'Internshala')  occurs  1
('Internshala', 'Online')  occurs  1
('Online', 'Summer')  occurs  1
('Summer', 'Trainings')  occurs  1
('Trainings', 'Get')  occurs  1
('Get', 'up')  occurs  1
('up', 'to')  occurs  1
('to', '55')  occurs  1
('55', '10')  occurs  1
('10', 'off')  occurs  1
('off', 'on')  occurs  1
('on', 'training')  occurs  1
('training', 'till')  occurs  1
('till', '15th')  occurs  1
('15th', 'March')  occurs  1
('March', 'bitlysummerlaunch20')  occurs  1
('bitlysummerlaunch20', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'Hi')  occurs  1
('Hi', 'Shyam')  occurs  1
('Shyam', 'book')  occurs  1
('book', 'your')  occurs  2
('your', 'batch')  occurs  1
('batch', 'now')  occurs  1
('now', 'for')  occurs  1
('for'

In [None]:
bigram_probability('You are selected for Rs70000 Monthly Pension after Retirement. Just Rs120/day & Upto 1.5 Lacs Tax Discount. Know more http://ap6m.com/lhhamzkilzl')

----------- Ham Freuquencies ------------
('<s>', 'You')  occurs  1
('You', 'are')  occurs  1
('are', 'selected')  occurs  1
('selected', 'for')  occurs  1
('for', 'Rs70000')  occurs  1
('Rs70000', 'Monthly')  occurs  1
('Monthly', 'Pension')  occurs  1
('Pension', 'after')  occurs  1
('after', 'Retirement')  occurs  1
('Retirement', 'Just')  occurs  1
('Just', 'Rs120day')  occurs  1
('Rs120day', 'Upto')  occurs  1
('Upto', '15')  occurs  1
('15', 'Lacs')  occurs  1
('Lacs', 'Tax')  occurs  1
('Tax', 'Discount')  occurs  1
('Discount', 'Know')  occurs  1
('Know', 'more')  occurs  1
('more', 'httpap6mcomlhhamzkilzl')  occurs  1
('httpap6mcomlhhamzkilzl', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'You')  occurs  1
('You', 'are')  occurs  1
('are', 'selected')  occurs  12
('selected', 'for')  occurs  1
('for', 'Rs70000')  occurs  1
('Rs70000', 'Monthly')  occurs  1
('Monthly', 'Pension')  occurs  1
('Pension', 'after')  occurs  1
('after', 'Retirement')  occu

In [None]:
bigram_probability('Your loan form is missing a signature. Click here and get approved for Rs.25lakh Business Loan')

----------- Ham Freuquencies ------------
('<s>', 'Your')  occurs  1
('Your', 'loan')  occurs  1
('loan', 'form')  occurs  1
('form', 'is')  occurs  1
('is', 'missing')  occurs  3
('missing', 'a')  occurs  1
('a', 'signature')  occurs  1
('signature', 'Click')  occurs  1
('Click', 'here')  occurs  1
('here', 'and')  occurs  1
('and', 'get')  occurs  3
('get', 'approved')  occurs  1
('approved', 'for')  occurs  1
('for', 'Rs25lakh')  occurs  1
('Rs25lakh', 'Business')  occurs  1
('Business', 'Loan')  occurs  1
('Loan', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'Your')  occurs  1
('Your', 'loan')  occurs  1
('loan', 'form')  occurs  1
('form', 'is')  occurs  1
('is', 'missing')  occurs  1
('missing', 'a')  occurs  1
('a', 'signature')  occurs  1
('signature', 'Click')  occurs  1
('Click', 'here')  occurs  1
('here', 'and')  occurs  1
('and', 'get')  occurs  5
('get', 'approved')  occurs  1
('approved', 'for')  occurs  1
('for', 'Rs25lakh')  occurs  1
('Rs25l

In [None]:
bigram_probability('LENSKART Spring 2020 is Live! New Rounders, Hexagons, Transparents, Pilots, Cateyes! Buy One Get One with BLU lenses. Get App: lskt.me/z4. Store: lskt.me/u4')

----------- Ham Freuquencies ------------
('<s>', 'LENSKART')  occurs  1
('LENSKART', 'Spring')  occurs  1
('Spring', '2020')  occurs  1
('2020', 'is')  occurs  1
('is', 'Live')  occurs  1
('Live', 'New')  occurs  1
('New', 'Rounders')  occurs  1
('Rounders', 'Hexagons')  occurs  1
('Hexagons', 'Transparents')  occurs  1
('Transparents', 'Pilots')  occurs  1
('Pilots', 'Cateyes')  occurs  1
('Cateyes', 'Buy')  occurs  1
('Buy', 'One')  occurs  1
('One', 'Get')  occurs  1
('Get', 'One')  occurs  1
('One', 'with')  occurs  1
('with', 'BLU')  occurs  1
('BLU', 'lens')  occurs  1
('lens', 'Get')  occurs  1
('Get', 'App')  occurs  1
('App', 'lsktmez4')  occurs  1
('lsktmez4', 'Store')  occurs  1
('Store', 'lsktmeu4')  occurs  1
('lsktmeu4', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'LENSKART')  occurs  1
('LENSKART', 'Spring')  occurs  1
('Spring', '2020')  occurs  1
('2020', 'is')  occurs  1
('is', 'Live')  occurs  1
('Live', 'New')  occurs  1
('New', 'Rounder

In [None]:
bigram_probability('Going Abroad? Get unlimited Incoming calls & stay connected to your loved ones with Airtel International Packs starting at Rs150/day. Click www.bit.ly/2HkllBk')

----------- Ham Freuquencies ------------
('<s>', 'Going')  occurs  1
('Going', 'Abroad')  occurs  1
('Abroad', 'Get')  occurs  1
('Get', 'unlimited')  occurs  1
('unlimited', 'Incoming')  occurs  1
('Incoming', 'call')  occurs  1
('call', 'stay')  occurs  1
('stay', 'connected')  occurs  1
('connected', 'to')  occurs  1
('to', 'your')  occurs  1
('your', 'loved')  occurs  1
('loved', 'one')  occurs  1
('one', 'with')  occurs  2
('with', 'Airtel')  occurs  1
('Airtel', 'International')  occurs  1
('International', 'Packs')  occurs  1
('Packs', 'starting')  occurs  1
('starting', 'at')  occurs  1
('at', 'Rs150day')  occurs  1
('Rs150day', 'Click')  occurs  1
('Click', 'wwwbitly2HkllBk')  occurs  1
('wwwbitly2HkllBk', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'Going')  occurs  1
('Going', 'Abroad')  occurs  1
('Abroad', 'Get')  occurs  1
('Get', 'unlimited')  occurs  1
('unlimited', 'Incoming')  occurs  1
('Incoming', 'call')  occurs  1
('call', 'stay')  occ

In [None]:
bigram_probability('Good morning, what is the plan')

----------- Ham Freuquencies ------------
('<s>', 'Good')  occurs  1
('Good', 'morning')  occurs  1
('morning', 'what')  occurs  1
('what', 'is')  occurs  1
('is', 'the')  occurs  1
('the', 'plan')  occurs  4
('plan', '</s>')  occurs  8


----------- Spam Freuquencies ------------
('<s>', 'Good')  occurs  1
('Good', 'morning')  occurs  1
('morning', 'what')  occurs  1
('what', 'is')  occurs  1
('is', 'the')  occurs  1
('the', 'plan')  occurs  1
('plan', '</s>')  occurs  1


Ham Probability: 6.295411400957923e-30
Spam Probability: 1.0183244041407944e-30


"Good morning, what is the plan" is a Ham message




In [None]:
bigram_probability('Text me if forensics class is there')

----------- Ham Freuquencies ------------
('<s>', 'Text')  occurs  1
('Text', 'me')  occurs  1
('me', 'if')  occurs  1
('if', 'forensics')  occurs  1
('forensics', 'class')  occurs  1
('class', 'is')  occurs  3
('is', 'there')  occurs  1
('there', '</s>')  occurs  9


----------- Spam Freuquencies ------------
('<s>', 'Text')  occurs  1
('Text', 'me')  occurs  1
('me', 'if')  occurs  1
('if', 'forensics')  occurs  1
('forensics', 'class')  occurs  1
('class', 'is')  occurs  1
('is', 'there')  occurs  1
('there', '</s>')  occurs  1


Ham Probability: 2.429628479770275e-34
Spam Probability: 6.126585421480635e-35


"Text me if forensics class is there" is a Ham message




In [None]:
bigram_probability("We missed you! We are doing it again for those who missed our webinar on 5 steps to get a job in cloud computing")

----------- Ham Freuquencies ------------
('<s>', 'We')  occurs  1
('We', 'missed')  occurs  1
('missed', 'you')  occurs  1
('you', 'We')  occurs  1
('We', 'are')  occurs  1
('are', 'doing')  occurs  1
('doing', 'it')  occurs  1
('it', 'again')  occurs  1
('again', 'for')  occurs  1
('for', 'those')  occurs  1
('those', 'who')  occurs  1
('who', 'missed')  occurs  1
('missed', 'our')  occurs  1
('our', 'webinar')  occurs  1
('webinar', 'on')  occurs  1
('on', '5')  occurs  1
('5', 'step')  occurs  1
('step', 'to')  occurs  1
('to', 'get')  occurs  25
('get', 'a')  occurs  9
('a', 'job')  occurs  3
('job', 'in')  occurs  4
('in', 'cloud')  occurs  1
('cloud', 'computing')  occurs  1
('computing', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'We')  occurs  1
('We', 'missed')  occurs  1
('missed', 'you')  occurs  1
('you', 'We')  occurs  1
('We', 'are')  occurs  1
('are', 'doing')  occurs  1
('doing', 'it')  occurs  1
('it', 'again')  occurs  1
('again', 'for') 

In [None]:
bigram_probability("VIT wins Golden Globe Award")

----------- Ham Freuquencies ------------
('<s>', 'VIT')  occurs  1
('VIT', 'win')  occurs  1
('win', 'Golden')  occurs  1
('Golden', 'Globe')  occurs  1
('Globe', 'Award')  occurs  1
('Award', '</s>')  occurs  1


----------- Spam Freuquencies ------------
('<s>', 'VIT')  occurs  1
('VIT', 'win')  occurs  1
('win', 'Golden')  occurs  1
('Golden', 'Globe')  occurs  1
('Globe', 'Award')  occurs  1
('Award', '</s>')  occurs  1


Ham Probability: 4.439640606090059e-27
Spam Probability: 1.9291421151709503e-26


"VIT wins Golden Globe Award" is a Spam message


