In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from Rashtriya_Raksha_University_Gaussian_NB import rru_gaussian_nb
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [7]:
class gaussian_nb_scania(rru_gaussian_nb):
    def __init__(xerox_copy,data,non_missing_threshold,split_ratio,apply_pca_or_not,n_principal_components):
        data.replace(to_replace='na',value=np.nan,inplace=True)
        data.dropna(axis=1,inplace=True,thresh=int(non_missing_threshold*data.shape[0]))
        data_labels = data['class']
        imputer = SimpleImputer()
        data_array = imputer.fit_transform(X=data.iloc[:,1:])
        data_columns = data.columns
        data = pd.DataFrame(data=data_array,columns=data_columns[1:])
        np_array_list = list()
        
        for column in data.columns:
            data[column] = pd.qcut(x=data[column],q=10,duplicates='drop').cat.codes
            np_array_list.append(np.eye(10,10)[data[column]])
            
        data_array = np.concatenate(np_array_list,axis=1)
        data = pd.DataFrame(data=data_array)
        data['class'] = data_labels
        xerox_copy.data = data
        
        super().__init__(features=data.iloc[:,0:data.shape[1]-1],labels=data['class'],data_split_ratio=split_ratio,
                         apply_pca=apply_pca_or_not,n_components=n_principal_components)

In [8]:
data=pd.read_csv(r'C:\Users\hemant\Jaskirat Sir\Air pressure system failures in Scania trucks\scania\aps_failure_training_set.csv')

In [9]:
data.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [10]:
logistic_regression_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.1):
    for n_comp in np.arange(20,170,50):
        
        logistic_regression_configs[(non_na_thresh,n_comp)] = gaussian_nb_scania(data,non_na_thresh,
                                                                                     (0.8,0.2,0.0),
                                                                                     True,n_comp)

In [11]:
logistic_regression_configs

{(0.7, 20): <__main__.gaussian_nb_scania at 0x1c3d5cca848>,
 (0.7, 70): <__main__.gaussian_nb_scania at 0x1c3c05610c8>,
 (0.7, 120): <__main__.gaussian_nb_scania at 0x1c3d71fd288>,
 (0.7999999999999999, 20): <__main__.gaussian_nb_scania at 0x1c3d692e0c8>,
 (0.7999999999999999, 70): <__main__.gaussian_nb_scania at 0x1c3de27b988>,
 (0.7999999999999999, 120): <__main__.gaussian_nb_scania at 0x1c3da9e1d08>,
 (0.8999999999999999, 20): <__main__.gaussian_nb_scania at 0x1c3d7409808>,
 (0.8999999999999999, 70): <__main__.gaussian_nb_scania at 0x1c3a76b8cc8>,
 (0.8999999999999999, 120): <__main__.gaussian_nb_scania at 0x1c3a74a1708>,
 (0.9999999999999999, 20): <__main__.gaussian_nb_scania at 0x1c3d5f42788>,
 (0.9999999999999999, 70): <__main__.gaussian_nb_scania at 0x1c3d49ec8c8>,
 (0.9999999999999999, 120): <__main__.gaussian_nb_scania at 0x1c3873e3f48>}

In [12]:
logistic_regression = dict()
cv_data_list = list()

for configs,obj in logistic_regression_configs.items():
    for reg_strength in [0.001,0.01,0.1,1,10,100,1000]:
    
        X_resampled,y_resampled = SMOTE(sampling_strategy='minority').fit_resample(X=obj.X_new,y=data['class'])
        data_resampled = pd.DataFrame(data=X_resampled)
        data_resampled['class'] = data['class']
        train_data,cv_data,test_data = obj.data_splitting(data_resampled)
        cv_data_list.append(cv_data)
        k = tuple(list(configs)+[reg_strength])
        logistic_regression[k] = LogisticRegression(C=reg_strength,verbose=1,n_jobs=-1).fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]),
                                    y=train_data['label'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   15.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   16.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   16.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   17.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

In [13]:
logistic_regression

{(0.7, 20, 0.001): LogisticRegression(C=0.001, n_jobs=-1, verbose=1),
 (0.7, 20, 0.01): LogisticRegression(C=0.01, n_jobs=-1, verbose=1),
 (0.7, 20, 0.1): LogisticRegression(C=0.1, n_jobs=-1, verbose=1),
 (0.7, 20, 1): LogisticRegression(C=1, n_jobs=-1, verbose=1),
 (0.7, 20, 10): LogisticRegression(C=10, n_jobs=-1, verbose=1),
 (0.7, 20, 100): LogisticRegression(C=100, n_jobs=-1, verbose=1),
 (0.7, 20, 1000): LogisticRegression(C=1000, n_jobs=-1, verbose=1),
 (0.7, 70, 0.001): LogisticRegression(C=0.001, n_jobs=-1, verbose=1),
 (0.7, 70, 0.01): LogisticRegression(C=0.01, n_jobs=-1, verbose=1),
 (0.7, 70, 0.1): LogisticRegression(C=0.1, n_jobs=-1, verbose=1),
 (0.7, 70, 1): LogisticRegression(C=1, n_jobs=-1, verbose=1),
 (0.7, 70, 10): LogisticRegression(C=10, n_jobs=-1, verbose=1),
 (0.7, 70, 100): LogisticRegression(C=100, n_jobs=-1, verbose=1),
 (0.7, 70, 1000): LogisticRegression(C=1000, n_jobs=-1, verbose=1),
 (0.7, 120, 0.001): LogisticRegression(C=0.001, n_jobs=-1, verbose=1),
 

In [14]:
metrics = dict()

for obj,cv_data,config in tuple(zip(logistic_regression.values(),cv_data_list,logistic_regression.keys())):
    
    predicted_category = obj.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))
    acc = accuracy_score(y_true=np.array(cv_data['label']),y_pred=predicted_category)
    precision = precision_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    recall = recall_score(y_true=np.array(cv_data['label']),y_pred=predicted_category,
                                pos_label='pos',zero_division=1)
    metrics[config] = {'accuracy':acc,'precision':precision,'recall':recall}

In [16]:
metrics

{(0.7, 20, 0.001): {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0},
 (0.7, 20, 0.01): {'accuracy': 0.9978813559322034,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 20, 0.1): {'accuracy': 0.9973728813559322,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 20, 1): {'accuracy': 0.9972881355932204,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 20, 10): {'accuracy': 0.9972033898305085,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 20, 100): {'accuracy': 0.9972033898305085,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 20, 1000): {'accuracy': 0.9972033898305085,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 70, 0.001): {'accuracy': 0.9997457627118644,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 70, 0.01): {'accuracy': 0.9966949152542373,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 70, 0.1): {'accuracy': 0.9955084745762712,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 70, 1): {'accuracy': 0.9951694915254238,
  'precision': 0.0,
  'recall': 1.0},
 (0.7, 70, 10): {'accuracy': 0.9950847457627119,
