In [74]:
import numpy as np
import io

def readMatrix(file):
    f = open(file, "r")
    f.readline()
    dim = np.array(f.readline().split()).astype(np.int)
    tokenlist = np.array(f.readline().split())
    matrix = np.zeros((dim[0], dim[1]))
    category = np.zeros(dim[0])
    
    for i in np.arange(dim[0]):
        data = np.array(f.readline().split()).astype(np.int)
        category[i] = data[0]
        
        matrixindex = 0
        dataindex = 1
        while data[dataindex] != -1:
            matrixindex += data[dataindex]
            matrix[i, matrixindex] = data[dataindex+1]
            dataindex += 2

    f.close()
    return(matrix, tokenlist, category)

def nb_train(matrix, category):
    spam_prior = np.sum(category)/len(category) #fraction of training set that is spam
    
    spam_indices = category == 1 #boolean where Trues are spam rows
    spam_data = matrix[spam_indices,:] #split matrix into spam and nonspam matrices
    email_data = matrix[~spam_indices,:] 
    
    spam_token_probs = np.divide(np.sum(spam_data, axis=0) + 1, np.sum(spam_data) + spam_data.shape[0])
    email_token_probs = np.divide(np.sum(email_data, axis=0) + 1, np.sum(email_data) + email_data.shape[0])
        
    return(spam_prior, spam_token_probs, email_token_probs)

def nb_predict(matrix, spam_prior, spam_token_probs, email_token_probs):
    
    spam_odds = np.add(np.dot(matrix, np.log(spam_token_probs)), np.log(spam_prior))
    email_odds = np.add(np.dot(matrix, np.log(email_token_probs)), np.log(1-spam_prior))
      
    predictions = np.zeros(matrix.shape[0])
    predictions[spam_odds>email_odds] = 1
    
    return(predictions)

def main():
    #2a
    #import training data and fit parameters
    matrix, tokenlist, category = readMatrix('spam_data/MATRIX.TRAIN')
    spam_prior, spam_token_probs, email_token_probs = nb_train(matrix, category)
    
    #train_predictions = nb_predict(matrix, spam_prior, spam_token_probs, email_token_probs)
    
    #import test data, predict on test set, and report test error
    test_matrix, test_tokenlist, test_category = readMatrix('spam_data/MATRIX.TEST')
    test_predictions = nb_predict(test_matrix, spam_prior, spam_token_probs, email_token_probs)
    test_error = np.divide(np.sum(np.abs(np.subtract(test_predictions, test_category))), len(test_category))
    print("2a. test error: "+str(test_error*100)+"%")
    
    #2b
    tokenratios = np.log(np.divide(spam_token_probs, email_token_probs))
    top_token_indices = np.argpartition(tokenratios, -5)[-5:]
    top_token_indices = top_token_indices[np.argsort(tokenratios[top_token_indices])]
    print("2b. top 5 predictive tokens: " + np.flip(tokenlist[top_token_indices], 0))
    
    #2c
    
    
if __name__ == '__main__':
    main()

test error: 1.625%
['httpaddr' 'spam' 'unsubscrib' 'ebai' 'valet']
