In [1]:
from naive_bayes import priors,likelihood,predict
import numpy as np
from preprocessing import preprocess_abstracts
from transformations import features_count_transform,feature_occurance_transform
from naive_bayes import priors,likelihood,predict

In [2]:
def BayesClassifier(labels_train,features_train,features_test):
    """Fit the naive Bayes pipeline on the training documents and predict the test ones.

    Steps, in order: preprocess both document sets, restrict the vocabulary to
    training words that also occur in the test set, integer-encode the training
    labels, build count features over that vocabulary, derive weights from the
    training features, compute class priors and likelihoods, then predict.

    Parameters
    ----------
    labels_train : array-like
        One class label per training document. Re-encoded internally as the
        index of each label in the sorted unique labels (``np.unique`` order).
    features_train
        Raw training documents, handed to ``preprocess_abstracts``.
    features_test
        Raw test documents, handed to ``preprocess_abstracts``.

    Returns
    -------
    The value returned by ``predict``. The calling cells in this notebook
    treat it as one integer class code per test document - TODO confirm
    against ``naive_bayes.predict``.
    """
    #preprocess features: each call yields (words, processed documents)
    training_words,features_train = preprocess_abstracts(features_train)
    test_words, features_test = preprocess_abstracts(features_test)

    # Keep only the training words that also appear in the test set. Membership
    # is tested against a set so the filter is linear rather than rescanning
    # test_words for every training word; the order (and any duplicates) of
    # training_words is preserved. Assumes words are hashable (e.g. strings).
    test_vocabulary = set(test_words)
    all_words = [word for word in training_words if word in test_vocabulary]

    #label encoder - training labels only (no test labels are available here).
    # return_inverse gives, for every label, its position among the sorted
    # unique labels, i.e. the integer class code.
    labels_train = np.unique(labels_train, return_inverse=True)[1]

    # count features for both sets over the shared vocabulary
    features_train = features_count_transform(all_words, features_train)
    features_test = features_count_transform(all_words, features_test)

    #IDF: convert counts to idf weights (derived from the training features only)
    weights,word_occurances = feature_occurance_transform(features_train)

    #calculate priors
    class_prior,occurance_class_groups,instance_class_groups = priors(labels_train,features_train,word_occurances)

    #calculate likelihood
    weight_conditional_prob = likelihood(instance_class_groups,occurance_class_groups,weights)

    #predict
    pred = predict(features_test,weight_conditional_prob,class_prior)

    return pred

## Main - Predict unlabelled set

In [3]:
# Read both CSV files as string tables and slice out the columns we need.
# The first row of each file is skipped (presumably a header row). In trg.csv
# column 1 is used as the label and column 2 as the document text; in tst.csv
# column 1 is used as the document text.
data = np.genfromtxt('trg.csv', dtype=str, delimiter=',')
labels_train, features_train = data[1:, 1], data[1:, 2]
features_test = np.genfromtxt('tst.csv', dtype=str, delimiter=',')[1:, 1]

In [4]:
# Classify the unlabelled test documents; the result is handled below as a
# sequence of integer class codes.
predictions = BayesClassifier(labels_train, features_train, features_test)

# Translate every integer code back into its original class label. The codes
# follow the sorted order of the unique training labels, so the same ordering
# is rebuilt here for the reverse lookup.
unique_labels = list(np.unique(labels_train))
integer_labels = list(np.arange(len(unique_labels)))
predictions = list(map(lambda code: unique_labels[integer_labels.index(code)], predictions))

## Main - Model Selection

In [87]:
# Read the labelled CSV as a string table for model selection. The first row
# is skipped (presumably a header row); column 1 is used as the label and
# column 2 as the document text.
data = np.genfromtxt('trg.csv', dtype=str, delimiter=',')
labels, features = data[1:, 1], data[1:, 2]

In [None]:
#setup cross validation
no_of_folds = 10
SEED = 0  # fixed seed so the fold assignment, and thus the reported accuracy, is reproducible

# Shuffle the sample indices with a private generator (the global NumPy RNG is
# left untouched) and cut them into no_of_folds nearly equal parts.
# array_split spreads any remainder over the first folds, so every sample is
# held out exactly once even when the sample count is not a multiple of
# no_of_folds.
idx = np.random.RandomState(SEED).permutation(features.shape[0])
folds = np.array_split(idx, no_of_folds)

acc_cv=list()
for k in range(0,no_of_folds):

    print('Iteration ', k)
    #extract the correct cross-validation subset: fold k is held out,
    #all remaining folds form the training set
    test_idx = folds[k]
    if test_idx.size == 0:
        # fewer samples than folds: nothing to evaluate here, and an empty
        # test set would make the accuracy a division by zero
        continue
    train_idx = np.concatenate(folds[:k] + folds[k+1:])

    features_train = features[train_idx]
    features_test = features[test_idx]
    labels_train = labels[train_idx]
    labels_test = labels[test_idx]

    #label encoder - encode labels as integer, fitted on the training fold only
    #so the codes line up with the encoding BayesClassifier applies internally
    unique_labels = list(np.unique(labels_train))
    integer_labels = list(np.arange(0,len(unique_labels)))
    label_to_code = dict(zip(unique_labels, integer_labels))
    labels_train = np.array([label_to_code[label] for label in labels_train])
    # A class that is missing from the training fold gets code -1, which lies
    # outside the 0..k-1 code range, so those samples count as misclassified
    # instead of aborting the whole run with a ValueError.
    labels_test = np.array([label_to_code.get(label, -1) for label in labels_test])

    #predict
    predictions = BayesClassifier(labels_train,features_train,features_test)

    #accuracy: fraction of held-out samples whose predicted code matches
    acc = np.mean(np.asarray(predictions) == labels_test)
    acc_cv.append(acc)

# mean accuracy over the evaluated folds (displayed as the cell output)
np.mean(acc_cv)

Iteration  0
