In [1]:
import pandas as pd
import re
import math
from copy import deepcopy
from sklearn.model_selection import cross_val_score
sms_spam = pd.read_csv('SMSSpamCollection', sep='\t',header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Randomize the dataset
import time
data_randomized = sms_spam.sample(frac=1, random_state=1)

def multivariate(training_set,test_set):

    training_set = training_set.reset_index(drop=True)
    test_set = test_set.reset_index(drop=True)
    training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ') # Removes punctuation
    training_set['SMS'] = training_set['SMS'].str.lower()
    training_set['SMS'] = training_set['SMS'].str.split()

    vocabulary = []   #unique words only

    for sms in training_set['SMS']:
        for word in sms:
            vocabulary.append(word)
    vocabulary = list(set(vocabulary)) #list if unique vocab.
    
    word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

    for index, sms in enumerate(training_set['SMS']):
        for word in sms:
            word_counts_per_sms[word][index] += 1
    word_counts = pd.DataFrame(word_counts_per_sms)
    training_set_clean = pd.concat([training_set, word_counts], axis=1)

    # Isolating spam and ham messages first
    spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
    ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

    # P(Spam) and P(Ham)
    p_spam = len(spam_messages) / len(training_set_clean)
    p_ham = len(ham_messages) / len(training_set_clean)

    # N_Spam
    n_words_per_spam_message = spam_messages['SMS'].apply(len)
    n_spam = n_words_per_spam_message.sum()

    # N_Ham
    n_words_per_ham_message = ham_messages['SMS'].apply(len)
    n_ham = n_words_per_ham_message.sum()

    # N_Vocabulary
    n_vocabulary = len(vocabulary)

    # Initiate parameters
    parameters_spam = {unique_word:0 for unique_word in vocabulary}
    parameters_ham = {unique_word:0 for unique_word in vocabulary}
    
  
    len_spam=0
    for message in spam_messages:
      for word in message:
        len_spam+=1

    len_ham=0
    for message in ham_messages:
      for word in message:
        len_ham+=1

    #suppose its your friend
    # Calculate parameters
    count1 =0     #your in spam    #then count of friend in spam
    count2 =0     #your in ham     #then count of friend in ham
    for word in vocabulary:
        for msg in spam_messages:
          if word in msg:
            count1+=1
        p_word_given_spam = (count1 + 1)/(len(vocabulary) + len_spam )   #add1 smoothing
        parameters_spam[word] = p_word_given_spam

        for msg in ham_messages:
          if word in msg:
            count2+=1
        p_word_given_ham = (count2 + 1)/(len(vocabulary) + len_ham)
        parameters_ham[word] = p_word_given_ham
    #####################################

    predicted_labels=[]   #array to store our labels predicted

    for message in test_set['SMS']:   #picking test data to check if our msg is correct
      message = re.sub('\W', ' ', message)
      message = message.lower().split()

      p_spam_given_message = p_spam
      p_ham_given_message = p_ham
      
      for word in message:
          if word in parameters_spam:
              p_spam_given_message *= parameters_spam[word]
          #else:
          #    p_spam_given_message *= 1 - parameters_spam[word]
          
          if word in parameters_ham:
              p_ham_given_message *= parameters_ham[word]
          #else:
          #    p_ham_given_message *= 1 - parameters_spam[word]

      if p_ham_given_message >= p_spam_given_message:
          predicted_labels.append('ham')        ##declaring it ham
      elif p_spam_given_message > p_ham_given_message:
          predicted_labels.append('spam')       ##declaring it spam
    
    ###################################
    test_set['predicted'] = predicted_labels    ##returning training result in variable correct to find accuracy
    correct = 0   #to count number of correct predictions
    total = test_set.shape[0]

    for row in test_set.iterrows():
        row = row[1]
        if row['Label'] == row['predicted']:
            correct += 1
    return correct/total


def cross_val_score(func,data,cv):    #for 5 iterations
  scores=[]
  for i in range(cv):
    temp_init=time.time()
    test_set=data[i*len(data)//cv:min(((i+1)*len(data))//cv,len(data))]
    train_set=data[~data.index.isin(test_set.index)]
    ac=multivariate(deepcopy(train_set),deepcopy(test_set))
    print('iteration ',i+1, ac,end="")
    temp_final=time.time()
    temp_final=temp_final-temp_init
    print("   Time taken is ",temp_final)
  scores.append(ac)
  return scores

init=time.time()
scores = cross_val_score(multivariate, data_randomized, cv=5)
final=time.time()
final=final-init
print(scores)
print("Total Time taken is ",final)

iteration  1 0.8680430879712747   Time taken is  27.747196435928345
iteration  2 0.855475763016158   Time taken is  28.98190402984619
iteration  3 0.8618834080717489   Time taken is  27.837849617004395
iteration  4 0.8761220825852782   Time taken is  28.146554470062256
iteration  5 0.8681614349775785   Time taken is  27.984909296035767
[0.8681614349775785]
Total Time taken is  140.70057606697083
