In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

%matplotlib inline 

# Project paths. Kept as plain strings because later cells build further
# paths by concatenating onto `projectpath`.
projectpath = 'projectpath/'
train_data_csv = projectpath + 'ready_train.csv'
test_data_csv = projectpath + 'ready_test.csv'

# Load the prepared training file once. It has no header row, so pandas
# labels the columns 0, 1, 2, ...
df_train = pd.read_csv(train_data_csv, header=None)

# Column 1 holds the class label. Take it from the frame that is already in
# memory instead of parsing the same CSV a second time.
df_labels = df_train[[1]].rename(columns={1: 'target'})

# Labels as a plain 1-D array for scikit-learn
labels = df_labels['target'].values

# Everything except columns 0 and 1 is used as a feature.
# NOTE(review): column 0 is presumably a row id -- confirm against the data preparation step.
df_training_data_raw = df_train.drop(columns=[0, 1])

# Hold out 20% of the rows for evaluation. random_state fixes the shuffle so
# the split is reproducible; stratify keeps the class proportions the same in
# both parts.
data_train, data_test, labels_train, labels_test = train_test_split(
    df_training_data_raw,
    labels,
    shuffle=True,
    test_size=0.2,
    random_state=1,
    stratify=df_labels,
)

In [20]:
# Peek at the first five true labels of the held-out split
labels_test[:5]

array(['G', 'F', 'G', 'J', 'G'], dtype=object)

# Naive Bayes (Gaussian)

In [2]:
from sklearn.naive_bayes import GaussianNB

## No GridSearchCV needed here (the default hyperparameters are used), so just create the model

In [3]:
# Gaussian Naive Bayes with scikit-learn's default settings (no hyperparameter search)
optimal_model = GaussianNB()

## fit model on training data

In [4]:
# Fit the classifier on the training split. fit() returns the estimator,
# so the notebook shows its repr as this cell's output.
optimal_model.fit(data_train, labels_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## see how model performs on test data

In [5]:
# Classify the held-out split with the fitted model, then print the first
# 100 predicted labels as a quick sanity check.
labels_pred = optimal_model.predict(data_test)
print(labels_pred[0:100])

['G' 'F' 'E' 'J' 'E' 'J' 'B' 'D' 'C' 'J' 'J' 'C' 'G' 'C' 'G' 'C' 'H' 'E'
 'D' 'C' 'I' 'H' 'C' 'B' 'A' 'B' 'H' 'B' 'E' 'H' 'H' 'I' 'J' 'D' 'I' 'F'
 'J' 'H' 'A' 'B' 'E' 'H' 'I' 'I' 'I' 'D' 'A' 'H' 'G' 'G' 'F' 'G' 'A' 'J'
 'B' 'D' 'B' 'I' 'B' 'E' 'H' 'H' 'A' 'A' 'I' 'C' 'A' 'J' 'B' 'J' 'A' 'H'
 'E' 'I' 'H' 'H' 'F' 'J' 'D' 'E' 'G' 'A' 'B' 'G' 'I' 'C' 'F' 'B' 'A' 'H'
 'A' 'B' 'F' 'I' 'J' 'F' 'J' 'J' 'J' 'C']


In [6]:
# Score the held-out predictions.
# Overall fraction of correctly classified samples:
print("Accuracy:", metrics.accuracy_score(y_true=labels_test, y_pred=labels_pred))
# average=None returns one precision value per class instead of a single average:
print("Precision:", metrics.precision_score(y_true=labels_test, y_pred=labels_pred, average=None))
# ... and likewise one recall value per class:
print("Recall:", metrics.recall_score(y_true=labels_test, y_pred=labels_pred, average=None))

Accuracy: 0.7618856837606838
Precision: [0.77842566 0.6590621  0.87624467 0.87280702 0.85668277 0.79880952
 0.86363636 0.77401894 0.62043796 0.62854251]
Recall: [0.7129506  0.69425901 0.82352941 0.79706275 0.71028037 0.89586115
 0.71028037 0.76368491 0.68090788 0.8302139 ]


Repeat the same steps for the other Naive Bayes variants in scikit-learn that are evaluated below (Multinomial, Complement, Bernoulli).

# Naive Bayes (Multinomial)

In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
# Multinomial Naive Bayes with default settings, trained on the training split.
# fit() returns the estimator itself, so it can be bound in one step; the bare
# name on the last line keeps the estimator repr as the cell output.
optimal_model = MultinomialNB().fit(data_train, labels_train)
optimal_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Predict labels for the held-out split with the multinomial model and
# show the first 100 of them.
labels_pred = optimal_model.predict(data_test)
print(labels_pred[0:100])

['G' 'F' 'E' 'J' 'I' 'I' 'B' 'D' 'G' 'J' 'J' 'C' 'G' 'C' 'G' 'C' 'H' 'E'
 'D' 'G' 'I' 'H' 'C' 'G' 'A' 'B' 'H' 'B' 'E' 'H' 'H' 'I' 'J' 'D' 'I' 'F'
 'J' 'H' 'A' 'B' 'E' 'H' 'I' 'I' 'I' 'D' 'A' 'H' 'C' 'G' 'I' 'G' 'B' 'J'
 'B' 'D' 'B' 'I' 'B' 'E' 'H' 'H' 'A' 'A' 'B' 'C' 'A' 'J' 'B' 'J' 'B' 'H'
 'E' 'I' 'H' 'H' 'F' 'J' 'D' 'F' 'G' 'B' 'D' 'G' 'I' 'E' 'F' 'B' 'B' 'H'
 'A' 'E' 'F' 'I' 'J' 'A' 'I' 'I' 'J' 'G']


In [24]:
# Overall accuracy on the held-out split
print("Accuracy:", metrics.accuracy_score(labels_test, labels_pred))
# Per-class precision and recall (average=None -> one value per class)
for caption, per_class_scorer in (
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(caption, per_class_scorer(labels_test, labels_pred, average=None))

Accuracy: 0.7988782051282052
Precision: [0.89123377 0.74484536 0.90882353 0.85074627 0.89983845 0.87931034
 0.8757485  0.80726257 0.54591837 0.78418231]
Recall: [0.7329773  0.77169559 0.82620321 0.83711615 0.74365821 0.88518024
 0.78104139 0.77169559 0.85714286 0.78208556]


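Multinomial Naive Bayes has the best accuracy of the four variants (about 0.799 on the held-out split), so use it to predict the test data and create the submission file.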

In [27]:
# Guard against hidden notebook state: `optimal_model` is rebound by every
# model section, so make sure the multinomial model (the one chosen for the
# submission) is the one currently in memory before predicting.
if not isinstance(optimal_model, MultinomialNB):
    raise RuntimeError(
        "optimal_model is a %s, but the submission is meant to come from the "
        "fitted MultinomialNB; re-run the Multinomial cells first."
        % type(optimal_model).__name__
    )

# Load the test file; like the training file it is read without a header row.
df_test = pd.read_csv(test_data_csv, sep=',', header=None)

# Drop column 0 before predicting.
# NOTE(review): presumably column 0 is an id and the remaining columns match
# the training features (training drops columns 0 and 1) -- confirm.
test_data_new = df_test.drop(columns=[0])

# predict values for new data
predicted = optimal_model.predict(test_data_new)
# NOTE(review): the recorded output of this cell is dominated by 'A', unlike the
# predictions on the held-out split -- worth checking that the test file was
# prepared the same way as the training file.
print(predicted[0:100])

['A' 'I' 'A' 'H' 'I' 'B' 'H' 'I' 'J' 'H' 'J' 'J' 'A' 'D' 'A' 'A' 'A' 'A'
 'A' 'A' 'J' 'A' 'A' 'I' 'A' 'A' 'A' 'A' 'A' 'A' 'B' 'A' 'A' 'J' 'G' 'H'
 'H' 'H' 'A' 'A' 'A' 'A' 'A' 'J' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'J' 'J' 'J' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'B' 'A' 'A' 'A' 'J' 'A' 'A' 'I' 'E' 'J' 'H' 'A' 'A' 'A' 'A' 'G'
 'A' 'A' 'G' 'A' 'A' 'A' 'A' 'A' 'A' 'A']


In [28]:
# Wrap the predictions in a Series so they carry a default integer index
predicted = pd.Series(predicted)
# Empty frame with the two columns that are later written to the submission
# file; it is filled in a following cell.
submission_columns = ['id', 'target']
df_submission = pd.DataFrame(columns=submission_columns)
df_submission.head()

Unnamed: 0,id,target


In [30]:
# Build the submission table directly from the predictions rather than
# assigning into the empty frame created above. Assigning a Series into an
# empty frame only works because pandas lets the empty frame adopt the Series'
# index; on a re-run with a different number of predictions the assignment
# would align on the stale index instead. Rebuilding makes this cell idempotent.
# NOTE(review): 'id' is the 0-based row position of each prediction, not
# column 0 of the test file (dropped before predicting) -- confirm that this
# is the id the submission expects.
df_submission = pd.DataFrame({
    'id': np.arange(len(predicted), dtype='int64'),
    'target': np.asarray(predicted),
})
submission_file = projectpath + 'submission_naive_bayes.csv'
df_submission.head()

Unnamed: 0,id,target
0,0,A
1,1,I
2,2,A
3,3,H
4,4,I


In [31]:
# Write only the 'id' and 'target' columns to the submission CSV, without the DataFrame index
df_submission.to_csv(submission_file, index=False, columns=['id','target'])

# Naive Bayes (Complement)

In [11]:
from sklearn.naive_bayes import ComplementNB

In [12]:
# Complement Naive Bayes with default settings, trained on the training split.
# fit() returns the estimator, so create-and-fit is chained; the bare name on
# the last line keeps the estimator repr as the cell output.
optimal_model = ComplementNB().fit(data_train, labels_train)
optimal_model

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [13]:
# Predict labels for the held-out split with the complement model and
# show the first 100 of them.
labels_pred = optimal_model.predict(data_test)
print(labels_pred[0:100])

['I' 'F' 'I' 'J' 'J' 'I' 'B' 'D' 'C' 'J' 'I' 'C' 'G' 'C' 'C' 'C' 'H' 'C'
 'D' 'C' 'I' 'H' 'C' 'G' 'A' 'J' 'H' 'A' 'F' 'H' 'H' 'I' 'J' 'D' 'I' 'F'
 'J' 'H' 'A' 'H' 'E' 'H' 'I' 'I' 'I' 'D' 'A' 'H' 'C' 'J' 'I' 'G' 'J' 'J'
 'B' 'D' 'B' 'A' 'H' 'C' 'H' 'H' 'A' 'A' 'I' 'C' 'I' 'I' 'A' 'D' 'J' 'H'
 'I' 'I' 'H' 'H' 'I' 'D' 'D' 'F' 'G' 'A' 'G' 'G' 'I' 'F' 'F' 'D' 'I' 'H'
 'A' 'E' 'I' 'I' 'J' 'A' 'I' 'I' 'J' 'D']


In [14]:
# Score the complement model on the held-out split.
# Overall fraction of correctly classified samples:
print("Accuracy:", metrics.accuracy_score(y_true=labels_test, y_pred=labels_pred))
# One precision value per class (average=None disables averaging):
print("Precision:", metrics.precision_score(y_true=labels_test, y_pred=labels_pred, average=None))
# One recall value per class:
print("Recall:", metrics.recall_score(y_true=labels_test, y_pred=labels_pred, average=None))

Accuracy: 0.6971153846153846
Precision: [0.7994012  0.98305085 0.73684211 0.68627451 0.96261682 0.71210341
 0.82748092 0.81656805 0.43384224 0.6550152 ]
Recall: [0.7129506  0.38718291 0.86096257 0.8411215  0.41255007 0.80907877
 0.72363151 0.73698264 0.9105474  0.57620321]


# Naive Bayes (Bernoulli)

In [15]:
from sklearn.naive_bayes import BernoulliNB

In [16]:
# Bernoulli Naive Bayes with default settings, trained on the training split.
# fit() returns the estimator, so create-and-fit is chained; the bare name on
# the last line keeps the estimator repr as the cell output.
optimal_model = BernoulliNB().fit(data_train, labels_train)
optimal_model

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [17]:
# Predict labels for the held-out split with the Bernoulli model and
# show the first 100 of them.
labels_pred = optimal_model.predict(data_test)
print(labels_pred[0:100])

['G' 'F' 'E' 'J' 'E' 'J' 'B' 'D' 'C' 'J' 'J' 'C' 'G' 'C' 'G' 'C' 'H' 'E'
 'D' 'C' 'I' 'H' 'C' 'G' 'A' 'B' 'H' 'B' 'E' 'H' 'H' 'I' 'J' 'D' 'I' 'F'
 'J' 'H' 'A' 'B' 'E' 'H' 'I' 'I' 'I' 'D' 'A' 'H' 'C' 'G' 'A' 'G' 'J' 'J'
 'B' 'D' 'B' 'I' 'B' 'E' 'H' 'H' 'A' 'A' 'B' 'C' 'A' 'J' 'B' 'J' 'A' 'H'
 'E' 'I' 'H' 'H' 'F' 'J' 'D' 'F' 'G' 'A' 'B' 'G' 'I' 'E' 'F' 'B' 'G' 'H'
 'A' 'E' 'F' 'I' 'J' 'A' 'J' 'J' 'J' 'C']


In [18]:
# Overall accuracy on the held-out split
print("Accuracy:", metrics.accuracy_score(labels_test, labels_pred))
# Per-class precision and recall (average=None -> one value per class)
for caption, per_class_scorer in (
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(caption, per_class_scorer(labels_test, labels_pred, average=None))

Accuracy: 0.7585470085470085
Precision: [0.75397974 0.64030612 0.82526882 0.8774584  0.85572139 0.83121827
 0.87025316 0.82378855 0.63409091 0.60839844]
Recall: [0.69559413 0.67022697 0.82085561 0.77436582 0.68891856 0.87449933
 0.73431242 0.74899866 0.74499332 0.8328877 ]
