In [31]:
import numpy as np
import pandas as pd
import scipy.stats as s
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from Gaussian_Naive_Bayes import GaussianNB

In [32]:
class gaussian_nb_scania(GaussianNB):
    """Gaussian Naive Bayes wrapper that preprocesses the Scania APS frame.

    The constructor cleans the raw frame, discretises every feature and then
    hands the result to ``GaussianNB.__init__``.
    """

    def __init__(self, data, non_missing_threshold, split_ratio, apply_pca_or_not, n_principal_components):
        """Clean ``data``, bin its features and initialise the parent model.

        Parameters
        ----------
        data : pandas.DataFrame
            Raw frame holding a ``'class'`` label column plus feature columns
            in which missing readings are written as the string ``'na'``.
            The caller's frame is left untouched; all work happens on copies.
        non_missing_threshold : float
            Fraction of rows that must be non-missing for a column to be kept
            (converted to an absolute count with ``int(...)``).
        split_ratio
            Forwarded to ``GaussianNB.__init__`` as ``data_split_ratio``.
        apply_pca_or_not
            Forwarded to ``GaussianNB.__init__`` as ``apply_pca``.
        n_principal_components
            Forwarded to ``GaussianNB.__init__`` as ``n_components``.

        Raises
        ------
        KeyError
            If ``data`` has no ``'class'`` column.
        """
        # Non-inplace calls return new frames, so the notebook-level
        # DataFrame passed in by the caller is never mutated.
        cleaned = data.replace(to_replace='na', value=np.nan)
        cleaned = cleaned.dropna(axis=1, thresh=int(non_missing_threshold * cleaned.shape[0]))

        # Select the label by name rather than assuming it is the first column.
        data_labels = cleaned['class']
        raw_features = cleaned.drop(columns='class')

        # SimpleImputer() uses its default strategy to fill remaining gaps.
        imputer = SimpleImputer()
        imputed_array = imputer.fit_transform(X=raw_features)

        # Keep the original index so the label assignment below aligns row by
        # row even when the input frame does not carry a default RangeIndex.
        imputed = pd.DataFrame(data=imputed_array, columns=raw_features.columns, index=raw_features.index)

        # Replace each feature with the integer code of its quantile bin
        # (at most 10 bins; duplicate bin edges are dropped).
        binned = imputed.apply(lambda column: pd.qcut(x=column, q=10, duplicates='drop').cat.codes)

        binned['class'] = data_labels
        self.data = binned
        super().__init__(features=binned.drop(columns='class'), labels=binned['class'],
                         data_split_ratio=split_ratio, apply_pca=apply_pca_or_not,
                         n_components=n_principal_components)

In [33]:
# Load the Scania APS failure training set from the local `scania/` folder.
data = pd.read_csv(filepath_or_buffer='scania/aps_failure_training_set.csv')

In [34]:
# Display the raw frame; missing readings show up as the literal string 'na'.
data

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,neg,153002,na,664,186,0,0,0,0,0,...,998500,566884,1290398,1218244,1019768,717762,898642,28588,0,0
59996,neg,2286,na,2130706538,224,0,0,0,0,0,...,10578,6760,21126,68424,136,0,0,0,0,0
59997,neg,112,0,2130706432,18,0,0,0,0,0,...,792,386,452,144,146,2622,0,0,0,0
59998,neg,80292,na,2130706432,494,0,0,0,0,0,...,699352,222654,347378,225724,194440,165070,802280,388422,0,0


In [35]:
# Build the model wrapper: keep columns whose non-missing count reaches 80% of
# the rows, and request PCA with 100 components.
# split_ratio is forwarded to GaussianNB as data_split_ratio
# (presumably train/cv/test fractions -- confirm in Gaussian_Naive_Bayes).
obj = gaussian_nb_scania(
    data=data,
    non_missing_threshold=0.8,
    split_ratio=(0.8, 0.2, 0.0),
    apply_pca_or_not=True,
    n_principal_components=100,
)

In [36]:
# Shape of the `X_new` matrix exposed by the model object
# (presumably the PCA-projected features -- confirm in Gaussian_Naive_Bayes).
obj.X_new.shape

(60000, 100)

In [37]:
from imblearn.over_sampling import SMOTE

In [38]:
# Oversample only the minority class using 5 nearest neighbours.
# random_state is fixed so the synthetic samples are identical on every run;
# without it each execution of the notebook produces different data.
smote_obj = SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=42)

In [39]:
# Resample the model's `X_new` features against the processed 'class' labels.
# NOTE(review): oversampling happens before the data is split, so synthetic
# rows may end up in the validation/test partitions -- confirm this is intended.
x_oversampled, y_oversampled = smote_obj.fit_resample(
    X=obj.X_new,
    y=obj.data['class'],
)

In [40]:
# Wrap the resampled features in a frame and attach the resampled labels
# as a trailing 'class' column.
data_new = pd.DataFrame(data=x_oversampled).assign(**{'class': y_oversampled})

In [41]:
# Display the oversampled frame: resampled feature columns plus 'class'.
data_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,class
0,-31.327601,-0.122594,-4.403098,-0.251647,-1.827702,-1.331550,-1.038895,-7.368728,-1.068359,5.641537,...,0.822743,-0.378506,0.198632,-0.116242,-0.102781,-0.361935,0.226202,0.106559,0.342974,neg
1,-16.501204,-6.955992,5.052323,6.298802,-1.056589,1.047336,6.225875,-5.119912,-0.583316,-4.207829,...,0.004751,0.107463,0.669190,0.220503,-0.027969,-0.272715,-0.765962,0.104616,0.160282,neg
2,-8.100373,-2.981546,-1.889971,-2.243415,-1.786259,3.345850,-2.724027,4.586131,2.888723,-1.502323,...,-0.899019,-0.095472,-0.204153,-0.124346,0.095544,-0.140966,-0.146965,-0.227003,-0.175612,neg
3,34.225848,-0.602825,1.105344,-0.964267,-1.607339,1.192195,0.728082,-1.377275,-3.443745,0.035152,...,-0.089852,-0.356047,0.157814,0.362160,-0.601024,-0.167934,0.019210,-0.620347,0.572816,neg
4,-22.111069,6.719273,-0.351102,-0.732192,-1.640634,2.381492,-7.404205,1.967404,0.953963,-0.762764,...,-0.084046,0.330029,-0.786851,0.136828,-0.016117,-0.013079,-0.142647,-0.097503,0.539250,neg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117995,9.773302,3.037234,6.037937,2.375852,4.882562,0.223590,-0.507227,1.051255,-1.188090,3.155129,...,-0.388321,-0.055035,-0.009102,0.170874,0.055192,-0.060384,-0.124927,-0.192074,-0.383915,pos
117996,-36.186698,0.903216,-0.021889,0.908265,-1.945810,-0.865842,-4.833639,-0.359560,1.098126,3.567501,...,-0.248158,-0.037605,0.094984,-0.075667,0.293104,0.264462,-0.027035,-0.071572,-0.236085,pos
117997,-35.735536,2.564998,5.896452,0.259066,0.352276,-1.820787,3.829907,2.683519,2.558462,0.250937,...,-0.293992,-0.000612,-0.272385,-0.406169,0.111499,0.074291,0.047725,-0.152622,0.302114,pos
117998,-34.482790,8.496670,4.915094,2.220970,0.064055,2.367645,2.973076,11.031184,-3.803123,2.998443,...,-0.131220,0.093036,-0.122126,0.073764,-0.003029,0.104421,0.014303,0.163778,0.152724,pos


In [None]:
# Split the oversampled frame into train / cross-validation / test parts
# using the splitter provided by the model object.
# NOTE(review): the first argument still contains the 'class' column --
# verify that data_splitting expects labels inside the feature frame.
train_data_new, cv_data_new, test_data_new = obj.data_splitting(
    data_new,
    data_new['class'],
)