In [85]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from preprocessing import normalize
from nltk import word_tokenize

### Preprocessing & Read in data

In [87]:
df = pd.read_excel("Naive Bayes Data.xlsx")

features = df['abstract']
labels = df.loc[:3]['class']

In [88]:
#normalize and tokenize then restring text
cleaned_features = list()
for feature in features:
    words = word_tokenize(feature)
    normalised_words = normalize(words)
    abstract = " ".join(normalised_words)
    cleaned_features.append(abstract)
    
features = np.array(cleaned_features)
features

array(['kiwi sheep kiwi', 'kiwi kiwi bird', 'kiwi auckland',
       'munich oktoberfest kiwi', 'kiwi kiwi kiwi munich oktoberfest'],
      dtype='<U33')

### Naive Bayes

In [68]:
#calculate priors
priors = dict()
unique_classes = pd.unique(labels)
for label in unique_classes:
    priors[label] = np.sum(label == labels)/len(labels)
    
priors

{'NZ': 0.75, 'DE': 0.25}

In [167]:
#conditional probabilities

count_vect = CountVectorizer()
#features_count: rows = number of documents, columns = unique word counts
features_count = count_vect.fit_transform(features)

feature_mapping = count_vect.vocabulary_ #words mapped to column indexes
unique_words = np.array(list(feature_mapping.keys())) #unique words in the documents

#using test words to calculate conditional probs
train_features = features_count[:4, :]
test_features = features_count[4, :]

In [146]:
#conditional prob matrix
# row = label
# col = features

number_of_words = len(unique_words) # |V|

#calculate count(c)
number_of_words_per_class = dict() #class is the key and value is the number of unique words
for label in labels:
    #subset by class
    subset_features = train_features.toarray()[label == labels]
    
    word_count = 0
    for abstract in subset_features:
        word_count += np.sum(abstract)
    
    number_of_words_per_class[label] = word_count

conditional_prob = pd.DataFrame()

#calculate count(w,c) and calculate conditional prob
for label in unique_classes:
    
    row = list()
    for word in unique_words:
        
        
        
        #subset by class
        subset_features = train_features.toarray()[label == labels]
        count_of_word_in_class = np.sum(subset_features[:,feature_mapping[word]]) #count(w,c)
        
        #P(w|c)
        prob = (count_of_word_in_class + 1)/(number_of_words_per_class[label] + number_of_words)
        
        row.append(prob)
        
        
    break

In [147]:
row

[0.42857142857142855,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.07142857142857142,
 0.07142857142857142]

In [178]:
unique_words[test_features > 0]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [172]:
test_features.toarray() > 0

array([[False, False,  True,  True,  True, False]])

array(['kiwi', 'sheep', 'bird', 'auckland', 'munich', 'oktoberfest'],
      dtype='<U11')

In [111]:
shuffled_mappings

['bird', 'oktoberfest', 'sheep', 'kiwi', 'auckland', 'munich']

In [112]:
feature_index

{'kiwi': 2,
 'sheep': 5,
 'bird': 1,
 'auckland': 0,
 'munich': 3,
 'oktoberfest': 4}