__Libraries__

In [1]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import FactorAnalysis
from helpers import read_csv_with_pandas
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

__Reading CSV__

In [2]:
df_train = pd.read_csv('data/aps_failure_training_set.csv')
df_test = pd.read_csv('data/aps_failure_test_set.csv')

__Replace Nan Values__

In [3]:
df_train['class'] = df_train['class'].replace(['pos','neg'],[1,0])
df_train = df_train.replace('na',np.NaN)

df_test['class'] = df_test['class'].replace(['pos','neg'],[1,0])
df_test = df_test.replace('na',np.NaN)

__Deleting Features With Zero Variance__

In [4]:
df_train = df_train.astype(float)
for i in df_train:
  if df_train[i].std() == 0:
    df_train = df_train.drop([i],axis=1)
    print('The feature with zero variance is : ',i)
df_train.shape

df_test = df_test.astype(float)
for i in df_test:
  if df_test[i].std() == 0:
    df_test = df_test.drop([i],axis=1)
    print('The feature with zero variance is : ',i)
df_test.shape

The feature with zero variance is :  cd_000
The feature with zero variance is :  cd_000


(16000, 170)

__Deleting Duplicates__

In [5]:
df_train = df_train.drop_duplicates(keep = 'first')
df_train = df_train.T.drop_duplicates().T
print(df_train.shape)

df_test = df_test.drop_duplicates(keep = 'first')
df_test = df_test.T.drop_duplicates().T
print(df_test.shape)

(59999, 170)
(16000, 170)


__Calculating Missing Values__

In [6]:
missing_feature_count = dict(df_train.drop('class',axis=1).isnull().sum())
missing_feature_count = dict(sorted(missing_feature_count.items(), key=lambda item:item[1],reverse=True))

__Missing Value Imputation__

We will eliminate features with missing value greater than 60%.

We will perform median imputation of features with missing values less than 20%

For the features between 20%-60% missing values, we will perform model based imputation called MICE imputation.

In [7]:
features_tobe_eliminated = []
median_imp_features = []
model_imp_features = []
for i in missing_feature_count.keys():
  percent = (missing_feature_count[i]/df_train.shape[0])
  if percent > 0.6:
    features_tobe_eliminated.append(i)
  elif percent < 0.2:
    median_imp_features.append(i)
  else:
    model_imp_features.append(i)

print("Features to be eliminated : ",features_tobe_eliminated)
print("Number of features to be eliminated : ",len(features_tobe_eliminated))
print("\nFeatures for model imputation : ",model_imp_features)
print("Number of features for model imputation : ",len(model_imp_features))
print("\nFeatures for median imputaton : ",median_imp_features)
print("Number of features for median imputaton : ",len(median_imp_features))

Features to be eliminated :  ['br_000', 'bq_000', 'bp_000', 'bo_000', 'ab_000', 'cr_000', 'bn_000', 'bm_000']
Number of features to be eliminated :  8

Features for model imputation :  ['bl_000', 'bk_000', 'ad_000', 'cf_000', 'cg_000', 'ch_000', 'co_000', 'ct_000', 'cu_000', 'cv_000', 'cx_000', 'cy_000', 'cz_000', 'da_000', 'db_000', 'dc_000']
Number of features for model imputation :  16

Features for median imputaton :  ['ec_00', 'cm_000', 'cl_000', 'ed_000', 'ak_000', 'ca_000', 'dm_000', 'df_000', 'dg_000', 'dh_000', 'dl_000', 'dj_000', 'dk_000', 'eb_000', 'di_000', 'ac_000', 'bx_000', 'cc_000', 'bd_000', 'ds_000', 'dt_000', 'dp_000', 'dq_000', 'dr_000', 'du_000', 'dv_000', 'bc_000', 'cp_000', 'de_000', 'do_000', 'dy_000', 'ef_000', 'ar_000', 'bz_000', 'dx_000', 'dz_000', 'ea_000', 'eg_000', 'be_000', 'dd_000', 'ce_000', 'ax_000', 'ae_000', 'af_000', 'av_000', 'bf_000', 'bs_000', 'cb_000', 'bu_000', 'bv_000', 'cq_000', 'dn_000', 'ba_000', 'ba_001', 'ba_002', 'ba_003', 'ba_004', 'ba_

__Train & CV Split__

In [8]:
X = df_train.drop('class',axis=1)
y = df_train['class']

In [9]:
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_test = df_test.drop('class',axis=1)
y_test = df_test['class']

X_train.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
X_cv.reset_index(drop=True,inplace=True)
y_cv.reset_index(drop=True,inplace=True)
print(10*'='+" Train Data "+10*'=')
print(X_train.shape)
print(y_train.shape)

print(10*'='+" CV Data "+10*'=')
print(X_cv.shape)
print(y_cv.shape)

print(10*'='+" Test Data "+10*'=')
print(X_test.shape)
print(y_test.shape)

(41999, 169)
(41999,)
(18000, 169)
(18000,)
(16000, 169)
(16000,)


__Median Imputation__

In [10]:
median_imputer = SimpleImputer(strategy='median')
median_imputer.fit(X_train[median_imp_features])

X_train_median = median_imputer.transform(X_train[median_imp_features])
X_cv_median = median_imputer.transform(X_cv[median_imp_features])
X_test_median = median_imputer.transform(X_test[median_imp_features])

In [11]:
X_train_mice = X_train.copy()
X_train_mice[median_imp_features] = X_train_median
X_train_mice = X_train_mice.drop(features_tobe_eliminated,axis=1)
print(X_train_mice.shape)

X_cv_mice = X_cv.copy()
X_cv_mice[median_imp_features] = X_cv_median
X_cv_mice = X_cv_mice.drop(features_tobe_eliminated,axis=1)
print(X_cv_mice.shape)

X_test_mice = X_test.copy()
X_test_mice[median_imp_features] = X_test_median
X_test_mice = X_test_mice.drop(features_tobe_eliminated,axis=1)
print(X_test_mice.shape)

(41999, 161)
(18000, 161)
(16000, 161)


__Median Imputation for Mice Part (Mice runs to slow)__

In [12]:
def median_imputation(df):
    median_imputer = SimpleImputer(strategy='median')
    median_imputer.fit(df)
    df_imputed = median_imputer.transform(df)
    #df_imputed = pd.DataFrame(df_imputed,columns=df.columns)
    return df_imputed, median_imputer

In [13]:
X_train_imputed, median_imputer = median_imputation(X_train_mice)
X_cv_imputed = median_imputer.transform(X_cv_mice)
X_test_imputed = median_imputer.transform(X_test_mice)

X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train_mice.columns)
X_cv_imputed = pd.DataFrame(X_cv_imputed, columns=X_cv_mice.columns)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test_mice.columns)

__Mice Imputation #Do not Use__

In [14]:
"""def normalize(df):
    x = df
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)
    return x_scaled, scaler"""

'def normalize(df):\n    x = df\n    scaler = StandardScaler()\n    x_scaled = scaler.fit_transform(x)\n    return x_scaled, scaler'

In [15]:
"""X_train_scaled, scaler = normalize(X_train_mice)
X_cv_scaled = scaler.transform(X_cv_mice)
X_test_scaled = scaler.transform(X_test_mice)

X_train_mice = pd.DataFrame(X_train_scaled, columns= X_train_mice.columns)
X_cv_mice = pd.DataFrame(X_cv_scaled, columns= X_cv_mice.columns)
X_test_mice = pd.DataFrame(X_test_scaled, columns= X_test_mice.columns)"""

'X_train_scaled, scaler = normalize(X_train_mice)\nX_cv_scaled = scaler.transform(X_cv_mice)\nX_test_scaled = scaler.transform(X_test_mice)\n\nX_train_mice = pd.DataFrame(X_train_scaled, columns= X_train_mice.columns)\nX_cv_mice = pd.DataFrame(X_cv_scaled, columns= X_cv_mice.columns)\nX_test_mice = pd.DataFrame(X_test_scaled, columns= X_test_mice.columns)'

In [16]:
"""def mice_imputation(df):
  mice_imputer = IterativeImputer(random_state=42, max_iter=10)
  mice_imputer.fit(df)
  df_imputed = mice_imputer.transform(df)
  #df_imputed = pd.DataFrame(df_imputed,columns=df.columns)
  return df_imputed, mice_imputer"""

'def mice_imputation(df):\n  mice_imputer = IterativeImputer(random_state=42, max_iter=10)\n  mice_imputer.fit(df)\n  df_imputed = mice_imputer.transform(df)\n  #df_imputed = pd.DataFrame(df_imputed,columns=df.columns)\n  return df_imputed, mice_imputer'

In [17]:
"""X_train_imputed, mice_imputer = mice_imputation(X_train_mice)
X_cv_imputed = mice_imputer.transform(X_cv_mice)
X_test_imputed = mice_imputer.transform(X_test_mice)

X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train_mice.columns)
X_cv_imputed = pd.DataFrame(X_cv_imputed, columns=X_cv_mice.columns)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test_mice.columns)"""

'X_train_imputed, mice_imputer = mice_imputation(X_train_mice)\nX_cv_imputed = mice_imputer.transform(X_cv_mice)\nX_test_imputed = mice_imputer.transform(X_test_mice)\n\nX_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train_mice.columns)\nX_cv_imputed = pd.DataFrame(X_cv_imputed, columns=X_cv_mice.columns)\nX_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test_mice.columns)'

__LDA__

In [18]:
lda = LinearDiscriminantAnalysis(n_components=1)

# Fit the LDA model with the normalized features and target variable
lda.fit(X_train_imputed, y_train)

# Transform the features using the fitted LDA model
x_train_lda = lda.transform(X_train_imputed)
x_test_lda = lda.transform(X_test_imputed)

lda_df = pd.DataFrame(data=x_train_lda, columns=['LDA_Component_1'])

In [19]:
y_pred = lda.predict(X_test_imputed)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Classification Report
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 0.9891875
Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     15625
         1.0       0.80      0.72      0.76       375

    accuracy                           0.99     16000
   macro avg       0.90      0.86      0.88     16000
weighted avg       0.99      0.99      0.99     16000



__Finding Best Parameters For Random Forest__

In [20]:
max_depth = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
n_estimators = [10, 25, 50, 75, 100, 125, 150, 175, 200]
min_samples_split = [2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
param = {'max_depth':max_depth,'n_estimators':n_estimators,'min_samples_split':min_samples_split}
clf = RandomForestClassifier(class_weight = 'balanced' , random_state=42)
tuning = RandomizedSearchCV(estimator=clf,param_distributions=param,cv=5,scoring='f1_macro',n_jobs=-1,return_train_score=True,verbose=10)
tuning.fit(x_train_lda,y_train)
best = tuning.best_params_
print(best)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 35}


__Random Forest Classifier__

In [21]:
best_RF_model = RandomForestClassifier(max_depth = 25, n_estimators =75 ,min_samples_split=10,n_jobs=-1,class_weight = 'balanced' , random_state=42, criterion='gini')
calib_RF = CalibratedClassifierCV(estimator=best_RF_model, cv=3, method='sigmoid')
calib_RF.fit(x_train_lda,y_train)

y_pred = calib_RF.predict(x_train_lda)
f1_scr = f1_score(y_train,y_pred, average = 'macro')
print("Macro average f1-score on Train Data : ", f1_scr)

y_pred = calib_RF.predict(x_test_lda)
f1_scr = f1_score(y_test,y_pred, average = 'macro')
print("Macro average f1-score on Test Data : ", f1_scr)

Macro average f1-score on Train Data :  0.9138580786202264
Macro average f1-score on Test Data :  0.8494162158966629
