In [83]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import LabelEncoder

In [84]:
def conditionalprobability(x,mean,stdev):
    exponent = math.exp(-(math.pow((x-mean),2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent
    

In [85]:
def predict(means,stdev,X,k):
    probabilities = {}
    for j in range(0,k):
        probabilities[j]=1
        for i in range(len(X)):
            probabilities[j]*=conditionalprobability(X[i],means[j,i],stdev[j,i])
   
    return max([(value, key) for key, value in probabilities.items()])[1]
                   
     

In [86]:
def predictData(means,stdev,Xtest,ytest):
    
    k=np.unique(ytest,axis=0).shape[0]
    ypred=np.zeros(len(ytest))
    for i in range(Xtest.shape[0]):
        ypred[i]=predict(means,stdev,Xtest[i],k)
        
    return ypred


In [87]:
data = pd.read_csv('project3_dataset2.txt', sep='\t', header=None)
lc = LabelEncoder()
data.iloc[:,4]= lc.fit_transform(data.iloc[:,4])
rows,col=data.shape
x = data.iloc[:,:col-1].values
y = data.iloc[:,-1].values



In [88]:
# run for K fold
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
precision = np.zeros(10)
recall =  np.zeros(10)
fscore =  np.zeros(10)
accuracy =  np.zeros(10)
support = np.zeros(10)
i = 0
for train_index, test_index in kf.split(x):
    x_train,x_test = x[train_index],x[test_index]
    y_train , y_test = y[train_index], y[test_index]
    x_train = pd.DataFrame(x_train)
    x_test = pd.DataFrame(x_test)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    x_train['labels'] = y_train
    means = x_train.groupby(x_train.iloc[:,-1]).mean().iloc[:,:-1].values
    stdev = x_train.groupby(x_train.iloc[:,-1]).std().iloc[:,:-1].values
    x_train,x_test,y_train,y_test = x_train.iloc[:,:-1].values,x_test.values,y_train.values,y_test.values
    y_pred = predictData(means,stdev,x_test,y_test)
    support = 0
    p,r,f,s = score(y_test,y_pred)
    precision[i],recall[i],fscore[i],accuracy[i]= p.mean(),r.mean(),f.mean(),accuracy_score(y_test,y_pred)
    i +=1
    

In [89]:
print(accuracy)
print('precision:',precision.mean())
print('recall:',recall.mean())
print('f1measure',fscore.mean())
print('accuracy',accuracy.mean())

[0.65957447 0.70212766 0.7826087  0.69565217 0.69565217 0.60869565
 0.7826087  0.73913043 0.58695652 0.63043478]
precision: 0.6737587700831676
recall: 0.6893750777285307
f1measure 0.6697982178043362
accuracy 0.6883441258094357


## Verifying Using sklearn

In [18]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
data = pd.read_csv('project3_dataset1.txt', sep='\t', header=None)
rows,col=data.shape
x=data.iloc[:,:col-1]
y = data.iloc[:,-1]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
x_train['labels'] = y_train
mean_features = x_train.groupby(x_train.iloc[:,-1]).mean().iloc[:,:-1].values
std_features = x_train.groupby(x_train.iloc[:,-1]).std().iloc[:,:-1].values
x_train,x_test,y_train,y_test = x_train.iloc[:,:-1].values,x_test.values,y_train.values,y_test.values
print(mean_features)
NB = GaussianNB()
NB.fit(x_train,y_train)
y_pred = NB.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[1.21387612e+01 1.79541176e+01 7.79722145e+01 4.62700000e+02
  9.21629412e-02 7.81427682e-02 4.45265156e-02 2.49689654e-02
  1.73745675e-01 6.27300692e-02 2.79725260e-01 1.21631073e+00
  1.96689723e+00 2.08203287e+01 7.11047059e-03 2.08482249e-02
  2.55086540e-02 9.66694118e-03 2.03878166e-02 3.56135398e-03
  1.33652664e+01 2.35756401e+01 8.68895156e+01 5.58658131e+02
  1.24832318e-01 1.79596055e-01 1.64003457e-01 7.37568131e-02
  2.69644291e-01 7.91220761e-02]
 [1.75439157e+01 2.16837952e+01 1.15969277e+02 9.86533735e+02
  1.03373855e-01 1.47273133e-01 1.65420663e-01 8.98458434e-02
  1.93970482e-01 6.27310843e-02 6.15719880e-01 1.19955723e+00
  4.32641566e+00 7.33675904e+01 6.87795181e-03 3.28988855e-02
  4.30798193e-02 1.51694157e-02 2.04272831e-02 4.05986145e-03
  2.13030120e+01 2.93239759e+01 1.42555000e+02 1.44593133e+03
  1.44921024e-01 3.78316325e-01 4.60396265e-01 1.84287289e-01
  3.23690964e-01 9.12156024e-02]]
0.9298245614035088
             precision    recall  f1-score   s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
