In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

DATA_DIR = "C:/Users/16306/Downloads/"

### Consider how we handle NAN if results aren't quite what we want

In [47]:
#Read in data
df = pd.read_csv(DATA_DIR + "application_train.csv")
df.head()
print("Rows initially:", len(df))

#Pull target variable and remove from predictors
target_var = df["TARGET"]
df.drop(columns=["SK_ID_CURR"], inplace=True)

Rows initially: 307511


### Uncomment this chunk if you wish to standardize

In [None]:
standardized = True

numeric_columns = df.drop(columns=["TARGET"])._get_numeric_data().columns

scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

df.head()

### One Hot Encode and Create Variables for Categorical Values with NAN

In [50]:
#One Hot Encode and create variables for categorical values with NAN
df_dummies = pd.get_dummies(df, dummy_na=True)
df_dummies.head()

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,1,-0.577538,0.142129,-0.478095,-0.166146,-0.507236,-0.149452,1.50688,-0.456215,0.379837,...,0,0,0,0,1,0,0,1,0,0
1,0,-0.577538,0.426792,1.72545,0.592666,1.599974,-1.25275,-0.166821,-0.460115,1.078697,...,0,0,0,0,0,0,0,1,0,0
2,0,-0.577538,-0.427196,-1.152888,-1.404649,-1.091895,-0.783451,-0.689509,-0.453299,0.206116,...,0,0,0,0,0,0,1,0,0,1
3,0,-0.577538,-0.142533,-0.71143,0.177865,-0.653401,-0.928991,-0.680114,-0.473217,-1.375829,...,0,0,0,0,0,0,1,0,0,1
4,0,-0.577538,-0.199466,-0.213734,-0.361748,-0.068741,0.56357,-0.892535,-0.47321,0.191639,...,0,0,0,0,0,0,1,0,0,1


### Replace NAN values in numerical variables with mean value of column 
* Too many NANs to drop them all so must impute somehow
* Consider trying median values

In [None]:
for col in df_.columns:
    df_dummies[col] = df_dummies[col].fillna(df_dummies[col].mean())

### Drop Columns That Sum to Zero 
* They're all nan columns for categorical variables that don't have any nans

In [52]:
temp = pd.DataFrame(df_dummies.sum(axis=0))
df_dummies.drop(columns = list(temp[temp[0] == 0].index), inplace = True)

df_dummies.head()

### Save Cleaned Data

In [54]:
if standardized:
    df_dummies.to_csv("Standardized_Cleaned_Data.csv")
else:
    df_dummies.to_csv("Non_Standardized_Cleaned_Data.csv")

## EDA
* correlations

In [55]:
corrs = df_.corr()["TARGET"]
corrs.sort_values()

TARGET                       1.000000
CNT_CHILDREN                 0.019187
AMT_INCOME_TOTAL            -0.003982
AMT_CREDIT                  -0.030369
AMT_ANNUITY                 -0.012817
                               ...   
WALLSMATERIAL_MODE_Wooden    0.007946
WALLSMATERIAL_MODE_nan       0.039393
EMERGENCYSTATE_MODE_No      -0.042201
EMERGENCYSTATE_MODE_Yes      0.004829
EMERGENCYSTATE_MODE_nan      0.041392
Name: TARGET, Length: 251, dtype: float64

# Training

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from math import log2

### Read in Data
* Makes us able to skip the first part for future runs

In [17]:
df = pd.read_csv("Standardized_Cleaned_Data.csv")
# df = pd.read_csv("Non_Standardized_Cleaned_Data.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df.head()

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,...,0,0,0,0,1,0,0,1,0,0
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,...,0,0,0,0,0,0,1,0,0,1
3,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,...,0,0,0,0,0,0,1,0,0,1
4,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,...,0,0,0,0,0,0,1,0,0,1


In [148]:
def gradient_descent_logistic(X, y, learning_rate_val, threshold):
  betas = np.matrix(np.zeros(X.shape[1]))
  learning_rate = np.array([-1 * learning_rate_val] * betas.shape[1])
  stop = False
  iterations = 0
  while not stop:
    y_hat = np.array(X * betas.transpose()).flatten()
    exp = np.exp(y_hat)
    pi = (exp / (1 + exp))
    errors = np.matrix(y-pi)
    gradient = np.array((errors * X) / X.shape[0])[0]
    gradient_total = np.sum(abs(gradient))
    if gradient_total < threshold:
      stop = True
    betas = -1*(gradient * learning_rate) + betas
    iterations += 1
  return betas

def gradient_descent_logistic_with_penalty(X, y, learning_rate_val, threshold, L):
  betas = np.matrix(np.zeros(X.shape[1]))
  learning_rate = np.array([-1 * learning_rate_val] * betas.shape[1])
  stop = False
  iterations = 0
  while not stop:
    y_hat = np.array(X * betas.transpose()).flatten()
    exp = np.exp(y_hat)
    pi = (exp / (1 + exp))
    errors = np.matrix(y-pi)
    gradient = np.array((errors * X) / X.shape[0])[0]
    beta_vals = np.array(betas)[0]
    for x in range(len(gradient)):
      if x < 0:
        gradient[x] -= beta_vals[x] * L
      elif x > 0:
        gradient[x] += beta_vals[x] * L
    gradient_total = np.sum(abs(gradient))
    if (gradient_total < threshold) or (iterations > 1000):
      stop = True
    betas = -1*(gradient * learning_rate) + betas
    iterations += 1
    print(gradient_total)
  return betas

In [87]:
X_train, X_test, y_train, y_test = train_test_split(df_.drop(columns=["TARGET"]), df_["TARGET"], test_size = .2, random_state = 42)
df_.drop(columns=["TARGET"])

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,-0.577538,0.142129,-0.478095,-0.166146,-0.507236,-0.149452,1.506880,-0.456215,0.379837,0.579154,...,0,0,0,0,1,0,0,1,0,0
1,-0.577538,0.426792,1.725450,0.592666,1.599974,-1.252750,-0.166821,-0.460115,1.078697,1.790855,...,0,0,0,0,0,0,0,1,0,0
2,-0.577538,-0.427196,-1.152888,-1.404649,-1.091895,-0.783451,-0.689509,-0.453299,0.206116,0.306869,...,0,0,0,0,0,0,1,0,0,1
3,-0.577538,-0.142533,-0.711430,0.177865,-0.653401,-0.928991,-0.680114,-0.473217,-1.375829,0.369143,...,0,0,0,0,0,0,1,0,0,1
4,-0.577538,-0.199466,-0.213734,-0.361748,-0.068741,0.563570,-0.892535,-0.473210,0.191639,-0.307263,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,-0.577538,-0.047646,-0.855489,0.031008,-0.848287,0.845396,1.537586,-0.453377,-0.984955,0.670578,...,0,0,0,0,1,0,0,1,0,0
307507,-0.577538,-0.408219,-0.818594,-1.042319,-0.848287,0.310593,-1.085707,2.133617,0.169782,-0.725959,...,0,0,0,0,1,0,0,1,0,0
307508,-0.577538,-0.066623,0.195379,0.198046,0.126145,-1.147120,0.245417,-0.507774,-0.497002,-1.428203,...,0,0,0,1,0,0,0,1,0,0
307509,-0.577538,0.009287,-0.568757,-0.476315,-0.592499,-1.124635,0.934008,-0.485583,0.688107,1.366859,...,0,0,0,0,1,0,0,1,0,0


In [88]:
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size = .2, random_state = 42)

In [62]:
y_train

9717      0
203356    0
81757     0
84860     0
234668    0
         ..
163061    1
96173     0
204206    0
219751    0
255       1
Name: TARGET, Length: 196806, dtype: int64

In [149]:
X_train_matrix = np.matrix(X_train)
y_train_matrix = np.matrix(y_train)

new_betas = gradient_descent_logistic_with_penalty(X_train_matrix, y_train, .1, .05, .001)

7.211549485126493
5.895742251779742
4.877657295824253
4.099228522935657
3.500520811793623
3.03263264512893
2.660658319759811
2.359814178535387
2.112484227682689
1.9065642791662967
1.7329436734672214
1.5847880032527706
1.4571780837445454
1.346351953865811
1.249319288548026
1.1637430319001996
1.0877945229891128
1.0200109829461894
0.9592058965255786
0.9044076121962248
0.854813412589738
0.8097658225378525
0.7687002018983551
0.7311333483803721
0.6966623639117293
0.664955065263644
0.6357313722401904
0.6087680200943384
0.5838092075531754
0.5606595637817173
0.5391925844097488
0.5191987014148818
0.5006431278534597
0.48350152002217317
0.4675542911984653
0.4527085276534901
0.43879450846089035
0.4257338288393647
0.4134364996088836
0.4018733425503625
0.3909586659841605
0.3806340120280933
0.3708557344451096
0.3616017127416191
0.35281067425022716
0.3444848312510279
0.33696730717827494
0.32992452537139727
0.3232638211775428
0.3169332597412762
0.31093855811857307
0.30529410672513047
0.299999360519847
0

0.06899724138838798
0.06887000766154575
0.06874317488098701
0.06861674122389577
0.06849080822201176
0.0683653467125313
0.06824027662528956
0.06811559621428075
0.06799130374463441
0.06786739749251786
0.06774387574504025
0.0676207368001579
0.06749797896658002
0.0673756005636761
0.0672535999213846
0.06713197538012072
0.06701072529068827
0.06688984801419005
0.06676963342439854
0.06665009952448353
0.06653092708083025
0.06641211456440524
0.06629366045535032
0.06617556324290533
0.06605782142533109
0.06594043350983332
0.06582339801248882
0.06570671345816968
0.06559037838047155
0.06547478473156051
0.06535967186748569
0.06524489773532158
0.06513046096440453
0.06501636019187064
0.06490259406259166
0.06478916122911291
0.06467606035159164
0.06456348576675755
0.06445133237555928
0.06433950665252372
0.06422800728420394
0.06411683296452525
0.06400598239472682
0.0638954542833025
0.0637852473459451
0.06367536030548795
0.06356579189184998
0.0634565408419793
0.0633476058997987
0.06323898581615109
0.063130

0.03913523530536211
0.03909510356098039
0.03905505206002899
0.03901508058996728
0.03897518893898698
0.03893537689600756
0.03889564425067378
0.0388559907933524
0.0388164163151284
0.03877692060780266
0.03873750346388791
0.0386981646766061
0.03865909261342342
0.03862027881205835
0.03858154099744791
0.0385428789722179
0.038504292539661956
0.03846578150373893
0.03842734566906955
0.038388984840934136
0.038350698825268884
0.03831248742866365
0.03827440349351602
0.03823669832143509
0.038199265696107376
0.038161902595249354
0.038124608860235665
0.03808738433284463
0.038050228855255636
0.0380131422700493
0.03797612442020624
0.03793917514910548
0.03790229430052412
0.03786548171863649
0.03782878295720079
0.037792298064866336
0.037755880272533945
0.037719529428287
0.03768324538059296
0.03764702797830259
0.03761087707064831
0.03757479250724363
0.037538774138082515
0.03750282181353794
0.037466935384361216
0.03743111470168062
0.037395359617001134
0.037359669982203386
0.037324045649541684
0.03728848647

In [154]:
log_odds = np.matrix(X_train) * new_betas.transpose()
exp = np.exp(log_odds)
pi = (exp / (1 + exp))
pd.Series([1 if x > .5 else 0 for x in pi]).value_counts()

0    196473
1       333
dtype: int64

In [90]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train_matrix, y_train)



In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train_matrix, y_train)

In [91]:
pd.Series(clf.predict(X_train_matrix[:, :])).value_counts()

0    196395
1       411
dtype: int64

In [42]:
target_var.value_counts()[1]/target_var.value_counts()[0]

0.08781828601345662

In [79]:
y_train.value_counts()

0    180972
1     15834
Name: TARGET, dtype: int64

In [73]:
#Consider removing variables that are highly correlated with other variables
df_corr = df_.corr()
df_corr

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
CNT_CHILDREN,1.000000,0.012882,0.002145,0.021374,-0.001826,-0.025573,0.330938,-0.239818,0.183395,-0.028019,...,-0.000709,0.001607,-0.002032,-0.020892,-0.025088,0.011036,0.036713,-0.038644,0.004525,0.037885
AMT_INCOME_TOTAL,0.012882,1.000000,0.156870,0.191657,0.159600,0.074796,0.027261,-0.064223,0.027805,0.008506,...,0.006149,0.023886,0.003886,0.032753,0.016523,-0.003369,-0.048745,0.050174,-0.002894,-0.049706
AMT_CREDIT,0.002145,0.156870,1.000000,0.770127,0.986588,0.099738,-0.055436,-0.066838,0.009621,-0.006575,...,0.007987,0.027255,0.005799,0.046644,0.009756,-0.007373,-0.055586,0.058256,-0.004308,-0.057547
AMT_ANNUITY,0.021374,0.191657,0.770127,1.000000,0.774661,0.118424,0.009445,-0.104331,0.038513,0.011268,...,0.010077,0.035318,0.005968,0.054987,0.009188,-0.007711,-0.063930,0.068144,-0.003910,-0.067511
AMT_GOODS_PRICE,-0.001826,0.159600,0.986588,0.774661,1.000000,0.103482,-0.053416,-0.064826,0.011561,-0.009262,...,0.009953,0.028299,0.005877,0.049740,0.012882,-0.009128,-0.060514,0.062849,-0.004846,-0.062050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WALLSMATERIAL_MODE_Wooden,0.011036,-0.003369,-0.007373,-0.007711,-0.009128,-0.056263,0.008758,-0.002846,0.000159,0.012702,...,-0.011554,-0.010162,-0.009710,-0.069666,-0.068843,1.000000,-0.135474,0.078964,0.273271,-0.126455
WALLSMATERIAL_MODE_nan,0.036713,-0.048745,-0.055586,-0.063930,-0.060514,-0.108545,0.019602,0.010224,0.067334,0.014851,...,-0.088204,-0.077575,-0.074123,-0.531832,-0.525545,-0.135474,1.000000,-0.924401,-0.048372,0.933422
EMERGENCYSTATE_MODE_No,-0.038644,0.050174,0.058256,0.068144,0.062849,0.103513,-0.019852,-0.010907,-0.069039,-0.015230,...,0.076786,0.073260,0.060550,0.502903,0.491544,0.078964,-0.924401,1.000000,-0.090624,-0.984943
EMERGENCYSTATE_MODE_Yes,0.004525,-0.002894,-0.004308,-0.003910,-0.004846,-0.029932,0.005434,-0.003571,-0.006273,0.006851,...,0.031648,-0.005178,0.049521,-0.039280,-0.007605,0.273271,-0.048372,-0.090624,1.000000,-0.082907


In [74]:
#Take absolute value of correlations
df_corr_abs = df_corr.abs()

In [93]:
#
temp = df_corr_abs
temp
for col in temp.columns:
    temp[col] = temp[col].sort_values(ascending=False)[1:6]

temp.mean(axis=0).sort_values(ascending=False).head(20)

LIVINGAREA_AVG                   0.887484
LIVINGAREA_MEDI                  0.887324
APARTMENTS_AVG                   0.875282
APARTMENTS_MEDI                  0.871326
LIVINGAREA_MODE                  0.871326
TOTALAREA_MODE                   0.869341
APARTMENTS_MODE                  0.866186
ELEVATORS_MODE                   0.823251
ELEVATORS_AVG                    0.819636
ELEVATORS_MEDI                   0.815055
LIVINGAPARTMENTS_MODE            0.750896
LIVINGAPARTMENTS_MEDI            0.737680
LIVINGAPARTMENTS_AVG             0.732650
WALLSMATERIAL_MODE_nan           0.682478
HOUSETYPE_MODE_nan               0.679362
HOUSETYPE_MODE_block of flats    0.674516
EMERGENCYSTATE_MODE_nan          0.645410
FONDKAPREMONT_MODE_nan           0.645410
EMERGENCYSTATE_MODE_No           0.645381
FLOORSMAX_MODE                   0.638215
dtype: float64

In [98]:
df[["LIVINGAPARTMENTS_MEDI", "LIVINGAPARTMENTS_AVG", "LIVINGAPARTMENTS_MODE"]]

Unnamed: 0,LIVINGAPARTMENTS_MEDI,LIVINGAPARTMENTS_AVG,LIVINGAPARTMENTS_MODE
0,0.0205,0.0202,0.0220
1,0.0787,0.0773,0.0790
2,,,
3,,,
4,,,
...,...,...,...
307506,0.1509,0.1484,0.0882
307507,0.0205,0.0202,0.0220
307508,0.0855,0.0841,0.0918
307509,,,


In [95]:
weird_vars = []
for col in temp.columns:
    if "MODE" in col or "MEDI" in col or "AVG" in col:
        weird_vars.append(col)
        
weird_vars

['APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'FONDKAPREMONT_MODE_not specified',
 'FONDKAPREMONT_MODE_org spec acco

In [96]:
df["EMERGENCYSTATE_MODE"]

0          No
1          No
2         NaN
3         NaN
4         NaN
         ... 
307506     No
307507     No
307508     No
307509     No
307510     No
Name: EMERGENCYSTATE_MODE, Length: 307511, dtype: object

In [82]:
#This is a mess, let's work on it
corr_dict = {}
for col in df_corr_abs.columns:
    corr_dict[col] = (df_corr_abs[col].sort_values(ascending = False).index[1], df_corr_abs[col].sort_values(ascending = False)[1])

highest_corrs = []
for val in corr_dict.values():
    highest_corrs.append(val[1])

sorted(highest_corrs)

[0.009706258820390544,
 0.010530906131240437,
 0.011058928656568849,
 0.011275414105436612,
 0.011466936858909002,
 0.01146984969983328,
 0.013094141708821722,
 0.014453429433199215,
 0.014611141295997464,
 0.014707845334371087,
 0.014878787777508647,
 0.015219257083813132,
 0.01587807600767065,
 0.016142001495559357,
 0.018049363474305677,
 0.019918568572964312,
 0.02242663942850155,
 0.026528840049437407,
 0.029205787940739636,
 0.03251547477822282,
 0.034458206073689,
 0.03498087394722537,
 0.03616073209148479,
 0.03674056481901469,
 0.03785482417533689,
 0.0389845350095574,
 0.04060216190954936,
 0.04084983554835195,
 0.04190139021996996,
 0.042379987828412034,
 0.04553483246007707,
 0.045909366542471054,
 0.045948970033035975,
 0.04689784270300796,
 0.051630035753355036,
 0.0544974753246741,
 0.05568055283133256,
 0.055821449221329235,
 0.058172451205915895,
 0.06096223634421087,
 0.0635192081742089,
 0.06513321547998097,
 0.06619660349783962,
 0.06762236373081022,
 0.067857475486

In [47]:
#Zero binary columns
count = 0
col_list = []
for col in df.columns:
    if len(df[col].value_counts()) == 2:
        if df[col].dtype != np.int64:
            col_list.append(col)
            count += 1
        
col_list

['NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'EMERGENCYSTATE_MODE']

In [49]:
#Zero binary columns
count = 0
col_list = []
for col in df.columns:
    if df[col].dtype != np.int64:
        col_list.append(col)
        count += 1

In [53]:
na_cat_vars = []
for col in df[col_list].columns:
    if df[col_list][col].isna().any():
        na_cat_vars.append(col)
        
na_cat_vars

['AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'NAME_TYPE_SUITE',
 'OWN_CAR_AGE',
 'OCCUPATION_TYPE',
 'CNT_FAM_MEMBERS',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',