In [7]:
from google.colab import files
uploaded = files.upload()

Saving SMSSpamCollection to SMSSpamCollection (1)


In [8]:
import pandas as pd
import re
from copy import deepcopy
from sklearn.model_selection import cross_val_score
sms_spam = pd.read_csv('SMSSpamCollection', sep='\t',
header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Randomize the dataset
import time
data_randomized = sms_spam.sample(frac=1, random_state=1)


def multinomial(training_set,test_set):

    # Split into training and test sets
    training_set = training_set.reset_index(drop=True)
    test_set = test_set.reset_index(drop=True)
    training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ') # Removes punctuation
    training_set['SMS'] = training_set['SMS'].str.lower()
    training_set['SMS'] = training_set['SMS'].str.split()
    vocabulary = []
    for sms in training_set['SMS']:
        for word in sms:
            vocabulary.append(word)
    vocabulary = list(set(vocabulary))
    
    word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

    for index, sms in enumerate(training_set['SMS']):
        for word in sms:
            word_counts_per_sms[word][index] += 1
    word_counts = pd.DataFrame(word_counts_per_sms)
    training_set_clean = pd.concat([training_set, word_counts], axis=1)
    # Isolating spam and ham messages first
    spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
    ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

    # P(Spam) and P(Ham)
    p_spam = len(spam_messages) / len(training_set_clean)
    p_ham = len(ham_messages) / len(training_set_clean)

    # N_Spam
    n_words_per_spam_message = spam_messages['SMS'].apply(len)
    n_spam = n_words_per_spam_message.sum()

    # N_Ham
    n_words_per_ham_message = ham_messages['SMS'].apply(len)
    n_ham = n_words_per_ham_message.sum()

    # N_Vocabulary
    n_vocabulary = len(vocabulary)

    # Laplace smoothing
    alpha = 1
    # Initiate parameters
    parameters_spam = {unique_word:0 for unique_word in vocabulary}
    parameters_ham = {unique_word:0 for unique_word in vocabulary}
    
    # Calculate parameters
    for word in vocabulary:
        n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
        p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
        parameters_spam[word] = p_word_given_spam

        n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
        p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
        parameters_ham[word] = p_word_given_ham
    predicted_labels=[]
    for message in test_set['SMS']:
      message = re.sub('\W', ' ', message)
      message = message.lower().split()

      p_spam_given_message = p_spam
      p_ham_given_message = p_ham
      
      for word in message:
          if word in parameters_spam:
              p_spam_given_message *= parameters_spam[word]

          if word in parameters_ham:
              p_ham_given_message *= parameters_ham[word]

      if p_ham_given_message >= p_spam_given_message:
          predicted_labels.append('ham')
      elif p_spam_given_message > p_ham_given_message:
          predicted_labels.append('spam')
    test_set['predicted'] = predicted_labels
    correct = 0
    total = test_set.shape[0]

    for row in test_set.iterrows():
        row = row[1]
        if row['Label'] == row['predicted']:
            correct += 1
    return correct/total
def cross_val_score(func,data,cv):
  
  scores=[]
  for i in range(cv):
    temp_init=time.time()
    test_set=data[i*len(data)//cv:min(((i+1)*len(data))//cv,len(data))]
    train_set=data[~data.index.isin(test_set.index)]
    ac=multinomial(deepcopy(train_set),deepcopy(test_set))
    temp_fin=time.time()
    temp_fin=temp_fin-temp_init
    print('iteration ',i+1, ac,end="")
    print("   Time taken is ",temp_fin)
    
  scores.append(ac)
 
  return scores

init=time.time()
scores = cross_val_score(multinomial, data_randomized, cv=10)
final=time.time()
final=final-init

print(scores)
print("Total time taken is",final)


iteration  1 0.9946140035906643   Time taken is  15.232913494110107
iteration  2 0.9874326750448833   Time taken is  15.004887580871582
iteration  3 0.9838420107719928   Time taken is  14.885815382003784
iteration  4 0.9784560143626571   Time taken is  15.034783124923706
iteration  5 0.9874551971326165   Time taken is  15.528115272521973
iteration  6 0.9910233393177738   Time taken is  14.991657495498657
iteration  7 0.9874326750448833   Time taken is  15.03548526763916
iteration  8 0.9892280071813285   Time taken is  15.533244609832764
iteration  9 0.9802513464991023   Time taken is  15.329902410507202
iteration  10 0.992831541218638   Time taken is  15.395689725875854
[0.992831541218638]
Total time taken is 151.98455214500427
