In [1]:
import pandas as pd

import numpy as np

import scipy.stats as s

from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt

import seaborn as sns

from Rashtriya_Raksha_University_Gaussian_NB import rru_gaussian_nb

from imblearn.over_sampling import SMOTE

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score,precision_score,recall_score

In [2]:
class rru_gaussian_nb_scania(rru_gaussian_nb):
    
    def __init__(xerox_copy,data,non_missing_threshold,split_ratio,apply_pca_or_not,n_principal_components):
        
        data.replace(to_replace='na',value=np.nan,inplace=True)
        
        data.dropna(axis=1,inplace=True,thresh=int(non_missing_threshold*data.shape[0]))
        
        data_labels = data['class']
        
        imputer = SimpleImputer()
        
        data_array = imputer.fit_transform(X=data.iloc[:,1:])
        
        data_columns = data.columns
        
        data = pd.DataFrame(data=data_array,columns=data_columns[1:])
        
        np_array_list = list()
        
        for column in data.columns:
    
            data[column] = pd.qcut(x=data[column],q=10,duplicates='drop').cat.codes
        
            np_array_list.append(np.eye(10,10)[data[column]])
            
        data_array = np.concatenate(np_array_list,axis=1)
        
        data = pd.DataFrame(data=data_array)

        data['class'] = data_labels
        
        xerox_copy.data = data
        
        super().__init__(features=data.iloc[:,0:data.shape[1]-1],labels=data['class'],data_split_ratio=split_ratio,
                         apply_pca=apply_pca_or_not,n_components=n_principal_components)

In [3]:
data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=20)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
column_names = data.iloc[0]

data = pd.read_csv("./aps_failure_training_set.csv",header=None,skiprows=21,names=column_names)

In [5]:
naive_bayes_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.1):
    
    for n_comp in np.arange(20,170,50):
        
        naive_bayes_configs[(non_na_thresh,n_comp)] = rru_gaussian_nb_scania(data,non_na_thresh,(0.8,0.2,0.0),True,n_comp)

In [6]:
naive_bayes_configs

{(0.7, 20): <__main__.rru_gaussian_nb_scania at 0x2a66c598160>,
 (0.7, 70): <__main__.rru_gaussian_nb_scania at 0x2a666db9dc0>,
 (0.7, 120): <__main__.rru_gaussian_nb_scania at 0x2a66c56caf0>,
 (0.7999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x2a66c598310>,
 (0.7999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x2a66c598e20>,
 (0.7999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x2a61e0177f0>,
 (0.8999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x2a61e017400>,
 (0.8999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x2a61e017b20>,
 (0.8999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x2a61e017ac0>,
 (0.9999999999999999, 20): <__main__.rru_gaussian_nb_scania at 0x2a6118bd1f0>,
 (0.9999999999999999, 70): <__main__.rru_gaussian_nb_scania at 0x2a6118bd370>,
 (0.9999999999999999, 120): <__main__.rru_gaussian_nb_scania at 0x2a6118bd3a0>}

In [7]:
naive_bayes = list()

cv_data_list = list()

for obj in naive_bayes_configs.values():
    
    X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_sample(X=obj.X_new,y=data['class'])
    
    data_resampled = pd.DataFrame(data=X_resampled)
    
    data_resampled['class'] = data['class']
    
    train_data,cv_data,test_data = obj.data_splitting(data_resampled)
    
    cv_data_list.append(cv_data)
    
    naive_bayes.append(GaussianNB().fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),y=train_data['label']))

In [8]:
metrics = dict()

for obj,cv_data,config in tuple(zip(naive_bayes,cv_data_list,naive_bayes_configs.keys())):
    
    predicted_category = obj.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))
    
    acc = accuracy_score(y_true=np.array(cv_data['label']),y_pred=predicted_category)
    
    metrics[config] = {'acc':acc}

In [9]:
metrics

{(0.7, 20): {'acc': 0.9361016949152542},
 (0.7, 70): {'acc': 0.9513559322033899},
 (0.7, 120): {'acc': 0.9513559322033899},
 (0.7999999999999999, 20): {'acc': 0.9408474576271186},
 (0.7999999999999999, 70): {'acc': 0.9533898305084746},
 (0.7999999999999999, 120): {'acc': 0.956864406779661},
 (0.8999999999999999, 20): {'acc': 0.9374576271186441},
 (0.8999999999999999, 70): {'acc': 0.9509322033898305},
 (0.8999999999999999, 120): {'acc': 0.9536440677966102},
 (0.9999999999999999, 20): {'acc': 0.9141525423728813},
 (0.9999999999999999, 70): {'acc': 0.9141525423728813},
 (0.9999999999999999, 120): {'acc': 0.9141525423728813}}

# Using the metrics above, you can decide which configuration (which naive bayes classifier configuration in naive_bayes list)is working best for the case of Cross Validation Data.

# Use that trained configuration of naive bayes classifier in naive_bayes list to perform the prediction on testing data

# This is going to be your assignment. 

In [10]:
data = pd.read_csv("aps_failure_test_set.csv") 

In [11]:
#encode labels to 0 and 

from sklearn.preprocessing import LabelEncoder #Encode target labels with value
 
le = LabelEncoder() #importing label encoder

data['class'] = le.fit_transform(data['class']) #picking the values of column 'class'

df = data.copy()

df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,60,0,20,12,0,0,0,0,0,...,1098,138,412,654,78,88,0,0,0,0
1,0,82,0,68,40,0,0,0,0,0,...,1068,276,1620,116,86,462,0,0,0,0
2,0,66002,2,212,112,0,0,0,0,0,...,495076,380368,440134,269556,1315022,153680,516,0,0,0
3,0,59816,na,1010,936,0,0,0,0,0,...,540820,243270,483302,485332,431376,210074,281662,3232,0,0
4,0,1814,na,156,140,0,0,0,0,0,...,7646,4144,18466,49782,3176,482,76,0,0,0


In [12]:
naive_bayes_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.3):
    
    for n_comp in np.arange(15,165,40):
        
        naive_bayes_configs[(non_na_thresh,n_comp)] = rru_gaussian_nb_scania(data,non_na_thresh,(0.7,0.3,0.0),True,n_comp)

In [13]:
naive_bayes_configs

{(0.7, 15): <__main__.rru_gaussian_nb_scania at 0x2a6118bda30>,
 (0.7, 55): <__main__.rru_gaussian_nb_scania at 0x2a666db9490>,
 (0.7, 95): <__main__.rru_gaussian_nb_scania at 0x2a6050fe7c0>,
 (0.7, 135): <__main__.rru_gaussian_nb_scania at 0x2a6050f8850>,
 (1.0, 15): <__main__.rru_gaussian_nb_scania at 0x2a6050f8bb0>,
 (1.0, 55): <__main__.rru_gaussian_nb_scania at 0x2a6050f8c10>,
 (1.0, 95): <__main__.rru_gaussian_nb_scania at 0x2a6050f8be0>,
 (1.0, 135): <__main__.rru_gaussian_nb_scania at 0x2a6051001c0>}

In [14]:
naive_bayes = list()

cv_data_list = list()

for obj in naive_bayes_configs.values():
    
    X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_sample(X=obj.X_new,y=data['class'])
    
    data_resampled = pd.DataFrame(data=X_resampled)
    
    data_resampled['class'] = data['class']
    
    train_data,cv_data,test_data = obj.data_splitting(data_resampled)
    
    cv_data_list.append(cv_data)
    
    naive_bayes.append(GaussianNB().fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),y=train_data['label']))

In [15]:
y_resampled.unique()

array([0, 1])

In [21]:
metrics = dict()

for obj,cv_data,config in tuple(zip(naive_bayes,cv_data_list,naive_bayes_configs.keys())):
    
    prediction = obj.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))

In [26]:
acc = accuracy_score(y_true=np.array(cv_data['label']),y_pred=prediction)

In [27]:
print(acc)

0.9195819112627986
