In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import RegexpTokenizer,word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import string

In [None]:
# Colab-only step: files.upload() asks the user to pick a local file interactively.
# The recorded output shows "SMSSpamCollection" being saved; a later cell reads a
# file of that name from the working directory.
from google.colab import files
uploaded=files.upload()


Saving SMSSpamCollection to SMSSpamCollection


In [None]:
# NLTK resources fetched by this notebook, in download order.
NLTK_RESOURCES=('stopwords','averaged_perceptron_tagger','wordnet','punkt')
download_results=[nltk.download(resource) for resource in NLTK_RESOURCES]
# Keep the last download's return value as the cell's displayed result.
download_results[-1]

In [None]:
# Load the tab-separated corpus (no header row) and drop exact duplicate rows.
dataset=pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['LABEL','TEXT'],
).drop_duplicates()

# Shuffle deterministically, then cut 80% / 20% into train / test.
random_data=dataset.sample(frac=1,random_state=1)
index=round(0.8*len(random_data))
train_set=random_data.iloc[:index].reset_index(drop=True)
test_set=random_data.iloc[index:].reset_index(drop=True)

In [None]:
# NOTE: this cell repeats the same four nltk.download calls issued in an earlier
# cell; the recorded output reports every package as already up-to-date.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.stem import PorterStemmer
import re
porter_stemmer=PorterStemmer()
# Instantiated here but not used by process_text below.
lemmatizer=WordNetLemmatizer()

# Built once: stopwords.words('english') re-reads the corpus list on every call,
# and membership tests on a frozenset are O(1) instead of a list scan per token.
STOP_WORDS=frozenset(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE=str.maketrans('','',string.punctuation)


def process_text(text):
    """Turn one raw message into a list of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every character found in ``string.punctuation`` (characters are
         removed, not replaced by spaces, so "it's" becomes "its");
      3. split on whitespace;
      4. keep tokens that are longer than 2 characters, purely alphabetic and
         not an English stop word;
      5. Porter-stem the survivors.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates are kept).
    """
    cleaned=text.lower().translate(_PUNCTUATION_TABLE)
    return [
        porter_stemmer.stem(word)
        for word in cleaned.split()
        if len(word)>2 and word.isalpha() and word not in STOP_WORDS
    ]

    
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary (tokens present in at least 2 messages) from the TRAINING
# split only. Fitting on the full dataset would let held-out test messages decide
# which words become features, i.e. train/test leakage.
dataset_count_transform=CountVectorizer(analyzer=process_text,min_df=2)
dataset_count=dataset_count_transform.fit_transform(train_set['TEXT'])

# Vectorise the training messages against that fixed vocabulary; downstream
# cells read text_count_transform.vocabulary_ and text_count.
text_count_transform=CountVectorizer(analyzer=process_text,vocabulary=dataset_count_transform.vocabulary_)
text_count=text_count_transform.fit_transform(train_set['TEXT'])


In [None]:
print(len(text_count_transform.vocabulary_))

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back on older releases.
if hasattr(text_count_transform,'get_feature_names_out'):
    feature_names=list(text_count_transform.get_feature_names_out())
else:
    feature_names=text_count_transform.get_feature_names()

# One column per vocabulary word, aligned on the training index.
word_counts=pd.DataFrame(text_count.toarray(),columns=feature_names,index=train_set.index)

# Attach all count columns in a single concat (inserting thousands of columns one
# by one fragments the frame). Selecting LABEL/TEXT first keeps the cell
# idempotent: re-running it does not duplicate the count columns.
train_set=pd.concat([train_set[['LABEL','TEXT']],word_counts],axis=1)



2702


In [None]:
# Split the training frame by class and estimate the class priors.
spam_mails=train_set[train_set['LABEL']=='spam']
ham_mails=train_set[train_set['LABEL']=='ham']
p_spam=len(spam_mails)/len(train_set)
p_ham=len(ham_mails)/len(train_set)
print(spam_mails.shape,ham_mails.shape)
print(p_spam,p_ham)

# Total number of vocabulary-word occurrences per class. This is the N_class term
# in the smoothed estimate (count + alpha) / (N_class + alpha * |V|), so it must be
# summed from the per-word count columns; len() of the raw TEXT string would count
# characters, not words.
vocab_words=list(text_count_transform.vocabulary_.keys())
num_words_spam=int(spam_mails[vocab_words].to_numpy().sum())
num_words_ham=int(ham_mails[vocab_words].to_numpy().sum())
print(num_words_spam,num_words_ham)

# Vocabulary size |V| and the additive (Laplace) smoothing constant.
num_words_vocab=len(vocab_words)
print(num_words_vocab)
alpha=1

(498, 2704) (3637, 2704)
0.12043530834340992 0.8795646916565901
68784 254443
2702


In [None]:
def cal_probability(word,label):
    """Return the additively smoothed likelihood of ``word`` for one class.

    ``label == 1`` selects the spam statistics (``spam_mails`` and
    ``num_words_spam``); any other value selects the ham statistics
    (``ham_mails`` and ``num_words_ham``).

    The result is
    ``(column_sum + alpha) / (class_total + alpha * num_words_vocab)``
    where ``column_sum`` is the sum of the ``word`` column of the selected frame
    and ``class_total`` is the selected ``num_words_*`` value.
    """
    if label==1:
        class_mails,class_total=spam_mails,num_words_spam
    else:
        class_mails,class_total=ham_mails,num_words_ham
    occurrences=class_mails[word].sum()
    return (occurrences+alpha)/(class_total+(alpha*num_words_vocab))
# Likelihood lookup tables: vocabulary word -> smoothed probability, per class.
keys=text_count_transform.vocabulary_.keys()
dict_p_word_given_spam=dict((token,cal_probability(token,1)) for token in keys)
dict_p_word_given_ham=dict((token,cal_probability(token,0)) for token in keys)
print(dict_p_word_given_ham)
print(dict_p_word_given_spam)

{'point': 5.444399074452157e-05, 'crazi': 2.7221995372260786e-05, 'avail': 4.6666277781018494e-05, 'bugi': 2.3333138890509247e-05, 'great': 0.0002955530926131171, 'world': 7.388827315327927e-05, 'buffet': 1.1666569445254623e-05, 'cine': 2.3333138890509247e-05, 'got': 0.0006999941667152773, 'wat': 0.0003383305139123841, 'lar': 0.0001322211203795524, 'joke': 4.6666277781018494e-05, 'wif': 9.722141204378853e-05, 'oni': 1.9444282408757704e-05, 'free': 0.0001788873981605709, 'entri': 3.888856481751541e-06, 'wkli': 3.888856481751541e-06, 'comp': 7.777712963503082e-06, 'win': 5.055513426277003e-05, 'cup': 1.9444282408757704e-05, 'final': 5.055513426277003e-05, 'tkt': 3.888856481751541e-06, 'may': 0.00012055455093429777, 'text': 0.0002799976666861109, 'receiv': 3.111085185401233e-05, 'txt': 5.444399074452157e-05, 'appli': 1.5555425927006165e-05, 'dun': 0.0001711096851970678, 'say': 0.00042388535651091794, 'earli': 0.0001049991250072916, 'hor': 1.1666569445254623e-05, 'alreadi': 0.0002799976666

In [None]:
def predict_label(test_mail,*,prior_spam=None,prior_ham=None,word_probs_spam=None,word_probs_ham=None):
    """Classify one tokenised message as 'spam' or 'ham' with Naive Bayes.

    Scores are accumulated as sums of log-probabilities. Multiplying the raw
    probabilities underflows to 0.0 for long messages, after which both scores
    compare equal and the message is always labelled 'spam'.

    Parameters
    ----------
    test_mail : iterable of str
        Tokens of the message. Tokens missing from a likelihood table are
        skipped for that class.
    prior_spam, prior_ham : float, optional
        Class priors. Default to the module-level ``p_spam`` / ``p_ham``.
    word_probs_spam, word_probs_ham : mapping of str to float, optional
        Per-word likelihoods. Default to the module-level
        ``dict_p_word_given_spam`` / ``dict_p_word_given_ham``.

    Returns
    -------
    str
        'spam' when the spam score is greater than or equal to the ham score
        (ties go to 'spam'), otherwise 'ham'.
    """
    if prior_spam is None:
        prior_spam=p_spam
    if prior_ham is None:
        prior_ham=p_ham
    if word_probs_spam is None:
        word_probs_spam=dict_p_word_given_spam
    if word_probs_ham is None:
        word_probs_ham=dict_p_word_given_ham

    # A zero probability maps to -inf instead of raising, which keeps the same
    # ordering the plain product would give; silence numpy's divide warning.
    with np.errstate(divide='ignore'):
        log_spam=np.log(prior_spam)
        log_ham=np.log(prior_ham)
        for word in test_mail:
            if word in word_probs_spam:
                log_spam+=np.log(word_probs_spam[word])
            if word in word_probs_ham:
                log_ham+=np.log(word_probs_ham[word])

    return 'spam' if log_spam>=log_ham else 'ham'


# Tokenise the held-out messages in place, then classify them.
# TEXT is overwritten with token lists, so rows that are already lists are left
# untouched: this keeps the cell safe to re-run (re-tokenising the string form of
# a list would silently corrupt the tokens).
test_set['TEXT']=test_set['TEXT'].apply(
    lambda value: value if isinstance(value,list) else process_text(str(value))
)
test_set['PREDICTED']=test_set['TEXT'].apply(predict_label)

# Boolean masks over the test rows: true class and predicted class.
is_spam=test_set['LABEL']=='spam'
is_ham=test_set['LABEL']=='ham'
flagged_spam=test_set['PREDICTED']=='spam'
flagged_ham=test_set['PREDICTED']=='ham'

# Overall tallies ("true" = spam, "false" = ham).
total=test_set.shape[0]
correct=int((test_set['LABEL']==test_set['PREDICTED']).sum())
actual_true=int(is_spam.sum())
actual_false=int(is_ham.sum())

# Confusion-matrix cells.
actual_true_predicted_true=int((is_spam&flagged_spam).sum())
actual_true_predicted_false=int((is_spam&flagged_ham).sum())
actual_false_predicted_true=int((is_ham&flagged_spam).sum())
actual_false_predicted_false=int((is_ham&flagged_ham).sum())

print(test_set[is_spam])
print(test_set[is_ham])
print("Correct:",correct)
print("Total",total)
print(actual_true,actual_true_predicted_true,actual_true_predicted_false)
print(actual_false,actual_false_predicted_true,actual_false_predicted_false)
print("accuracy:",correct/total)

0                 Nothing, i got msg frm tht unknown no..
1       Jason says it's cool if we pick some up from h...
2                               Call me when u're done...
3       Well there's not a lot of things happening in ...
4       I was about to do it when i texted. I finished...
                              ...                        
1029                          Tell me pa. How is pain de.
1030    Actually i deleted my old website..now i m blo...
1031    accordingly. I repeat, just text the word ok o...
1032    New Theory: Argument wins d SITUATION, but los...
1033                    Anything lor. Juz both of us lor.
Name: TEXT, Length: 1034, dtype: object
0                     [noth, got, msg, frm, tht, unknown]
1             [jason, say, cool, pick, place, like, hour]
2                                       [call, ure, done]
3       [well, there, lot, thing, happen, lindsay, new...
4          [text, finish, long, time, ago, shower, eryth]
                              ..