### What is ROC-AUC curve?

AUC - ROC curve is a performance measurement for classification problem at various thresholds settings. ROC is a probability curve and AUC represents degree or measure of separability. It tells how much model is capable of distinguishing between classes. Higher the AUC, better the model is at predicting 0s as 0s and 1s as 1s. By analogy, Higher the AUC, better the model is at distinguishing between patients with disease and no disease.


In [1]:
# Importing important libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

In [2]:
data=pd.read_csv('Sentander Dataset//santander_train.csv',nrows=20000)

In [3]:
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


#### Splitting data into Independent and Dependent Variable.

In [4]:
X=data.drop('TARGET',axis=1)
y=data['TARGET']

In [5]:
X.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [6]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

### Splitting data into training and testing set.

In [7]:
## Importing Libraries.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=0,stratify=y)

In [8]:
x_train.shape,x_test.shape,y_test.shape,y_train.shape

((16000, 370), (4000, 370), (4000,), (16000,))

In [9]:
## Importing Important Libraries.
from sklearn.feature_selection import VarianceThreshold
## Importing Important Libraries.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score

### Removing the constant,Quasi constant and Duplicate Features from Dependent features.

In [10]:
constant_filters=VarianceThreshold(threshold=0.01)
constant_filters.fit(x_train)

VarianceThreshold(threshold=0.01)

In [11]:
### Transform all filter values.
x_train_filter=constant_filters.transform(x_train)
x_test_filter=constant_filters.transform(x_test)

In [12]:
x_train_filter.shape,x_test_filter.shape

((16000, 245), (4000, 245))

In [13]:
constant_filters.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True, False,
       False, False, False,  True,  True, False, False, False, False,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
       False, False, False, False, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,

In [14]:
constant_filters.get_support().sum()

245

In [15]:
## Finding the constant columns.
quasi_constant_columns=[columns for columns in x_train.columns
                       if columns not in x_train.columns[constant_filters.get_support()]]

print(len(quasi_constant_columns))

125


In [16]:
## Printing all constant columns.
for columns in quasi_constant_columns:
    print(columns)

ind_var1
ind_var2_0
ind_var2
ind_var6_0
ind_var6
ind_var13_largo_0
ind_var13_largo
ind_var13_medio_0
ind_var13_medio
ind_var14
ind_var17_0
ind_var17
ind_var18_0
ind_var18
ind_var19
ind_var20_0
ind_var20
ind_var27_0
ind_var28_0
ind_var28
ind_var27
ind_var29_0
ind_var29
ind_var30_0
ind_var31_0
ind_var31
ind_var32_cte
ind_var32_0
ind_var32
ind_var33_0
ind_var33
ind_var34_0
ind_var34
ind_var40
ind_var41
ind_var39
ind_var44_0
ind_var44
ind_var46_0
ind_var46
num_var6_0
num_var6
num_var13_medio_0
num_var13_medio
num_var18_0
num_var18
num_op_var40_hace3
num_var27_0
num_var28_0
num_var28
num_var27
num_var29_0
num_var29
num_var33_0
num_var33
num_var34_0
num_var34
num_var41
num_var46_0
num_var46
saldo_var13_medio
saldo_var18
saldo_var28
saldo_var27
saldo_var34
saldo_var41
saldo_var46
delta_imp_amort_var18_1y3
delta_imp_amort_var34_1y3
delta_imp_aport_var33_1y3
delta_imp_reemb_var33_1y3
delta_imp_trasp_var17_out_1y3
delta_imp_trasp_var33_out_1y3
delta_num_aport_var33_1y3
delta_num_reemb_var33_1y3


In [17]:
## Dropping allquasi_constant_features.
x_train.drop(quasi_constant_columns,axis=1)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
8579,17282,2,24,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63200.700000
19085,38270,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88640.610000
15698,31526,2,45,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96314.160000
19307,38737,2,29,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117568.020000
8165,16469,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19117,38333,2,25,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53324.460000
13166,26389,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56028.480000
1024,2025,2,43,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38272.440000
8605,17327,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


#### Duplicate Feature Filters.

In [18]:
### Transpose all x_train_filter and x_test_filter data.
x_train_T=x_train_filter.T
x_test_T=x_train_filter.T

In [19]:
## converting all transpose dataset into pandas dataframe.
x_train_T=pd.DataFrame(x_train_T)
x_test_T=pd.DataFrame(x_test_T)

In [20]:
### Find out the Duplicate rows in Dataset.
x_train_T.duplicated().sum()

18

In [21]:
### Removing all duplicate feature we get.
duplicate_features=x_train_T.duplicated()
duplicate_features

0      False
1      False
2      False
3      False
4      False
       ...  
240    False
241    False
242    False
243    False
244    False
Length: 245, dtype: bool

In [22]:
### Selecting the non_duplicated rows.
non_duplicated_feature=[not index for index in duplicate_features]

In [23]:
x_train_unique=x_train_T[non_duplicated_feature].T
x_test_unique=x_test_T[non_duplicated_feature].T

In [24]:
x_train_unique.shape,x_test_unique.shape,x_train.shape

((16000, 227), (16000, 227), (16000, 370))

### Calculating the ROC-AUC Score.

In [45]:
roc_auc=[]
for feature in x_train_unique.columns:
    clf=RandomForestClassifier(n_estimators=100,random_state=0)
    clf.fit(x_train_unique[feature].to_frame(),y_train)
    y_pred=clf.predict(x_test_unique[feature].to_frame())
    roc_auc.append(roc_auc_score(y_test,y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [4000, 16000]

In [None]:
y_train.shape