In [1]:
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")
import sys
import datetime
import scipy as sp
import statsmodels.stats.api as sms
import statsmodels.api as sm
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import utils.preprocessing as pp 
import utils.correlation as cr
import utils.statsmodel_helper as st

# df_macro = pd.read_csv('./data/macro.csv', parse_dates=['timestamp'])
df_train = pd.read_csv('./data/train_imput_regression.csv', index_col=0)
df_test = pd.read_csv('./data/test_imput_regression.csv', index_col=0)

min_corr = 0.3

In [2]:
def pick_highly_correlated_features(df, columns, min_corr):
    pairs = []
    for col in columns:
        if not np.issubdtype(df[col].dtype, np.number):continue
        corrs = [(col, c, abs(df[col].corr(df[c]))) for c in df.columns.values.tolist() if c != col]
        corrs.sort(key=lambda item: item[2], reverse=True)
        for item in corrs:
            if item[2] > min_corr:
                pairs.append(item)
            else:
                break
    return pd.DataFrame(pairs, columns=['missing_col', 'highest corr with', 'corr'])

def pick_highly_correlated_IVs(df, target_col, min_corr, min_unique_values = 0):
    if not np.issubdtype(df[target_col].dtype, np.number):
        Exception('{}은 numeric data가 아닙니다.'.format(target_col))
    # if len(df[col].value_counts().index) < min_unique_values:
    #     Exception('{}로 상관관계를 계산하기에는 유니크한 값이 너무 작습니다.'.format(col))

    corrs = []
    for col in df._get_numeric_data().drop(target_col, axis=1).columns:
        if len(df[col].value_counts().index) < min_unique_values: continue
        corr = abs(df[target_col].corr(df[col]))
        if corr > min_corr:
            corrs.append((col, corr))
    
    return corrs 

In [4]:
columns = list(df_train._get_numeric_data().columns)

In [5]:
df_train[columns].astype(int).tail()

Unnamed: 0_level_0,usdrub,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,area_m,raion_popul,green_zone_part,indust_part,children_preschool,preschool_quota,preschool_education_centers_raion,children_school,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,hospital_beds_raion,healthcare_centers_raion,university_top_20_raion,sport_objects_raion,additional_education_raion,culture_objects_top_25_raion,shopping_centers_raion,office_raion,full_all,male_f,female_f,young_all,young_male,young_female,work_all,work_male,work_female,ekder_all,ekder_male,ekder_female,0_6_all,0_6_male,0_6_female,7_14_all,7_14_male,7_14_female,0_17_all,0_17_male,0_17_female,16_29_all,16_29_male,16_29_female,0_13_all,0_13_male,0_13_female,raion_build_count_with_material_info,build_count_block,build_count_wood,build_count_frame,build_count_brick,build_count_monolith,build_count_panel,build_count_foam,build_count_slag,build_count_mix,raion_build_count_with_builddate_info,build_count_before_1920,build_count_1921-1945,build_count_1946-1970,build_count_1971-1995,build_count_after_1995,metro_min_avto,metro_km_avto,metro_min_walk,metro_km_walk,kindergarten_km,school_km,park_km,green_zone_km,industrial_km,water_treatment_km,cemetery_km,incineration_km,railroad_station_walk_km,railroad_station_walk_min,railroad_station_avto_km,railroad_station_avto_min,ID_railroad_station_avto,public_transport_station_km,public_transport_station_min_walk,water_km,mkad_km,ttk_km,sadovoe_km,bulvar_ring_km,kremlin_km,big_road1_km,big_road2_km,railroad_km,zd_vokzaly_avto_km,bus_terminal_avto_km,oil_chemistry_km,nuclear_reactor_km,radiation_km,power_transmission_line_km,thermal_power_plant_km,ts_km,big_market_km,market_shop_km,fitness_km,swim_pool_km,ice_rink_km,stadium_km,basketball_km,hospice_morgue_km,detention_facility_km,public_healthcare_km,university_km,workplaces_km,shopping_centers_km,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,mosque_km,theater_km,museum_km,exhibition_km,catering_km,green_part_500,prom_part_500,office_count_500,office_sqm_500,trc_count_500,trc_sqm_500,cafe_count_500,cafe_sum_500_min_price_avg,cafe_sum_500_max_price_avg,cafe_avg_price_500,cafe_count_500_na_price,cafe_count_500_price_500,cafe_count_500_price_1000,cafe_count_500_price_1500,cafe_count_500_price_2500,cafe_count_500_price_4000,cafe_count_500_price_high,big_church_count_500,church_count_500,mosque_count_500,leisure_count_500,sport_count_500,market_count_500,green_part_1000,prom_part_1000,office_count_1000,office_sqm_1000,trc_count_1000,trc_sqm_1000,cafe_count_1000,cafe_sum_1000_min_price_avg,cafe_sum_1000_max_price_avg,cafe_avg_price_1000,cafe_count_1000_na_price,cafe_count_1000_price_500,cafe_count_1000_price_1000,cafe_count_1000_price_1500,cafe_count_1000_price_2500,cafe_count_1000_price_4000,cafe_count_1000_price_high,big_church_count_1000,church_count_1000,mosque_count_1000,leisure_count_1000,sport_count_1000,market_count_1000,green_part_1500,prom_part_1500,office_count_1500,office_sqm_1500,trc_count_1500,trc_sqm_1500,cafe_count_1500,cafe_sum_1500_min_price_avg,cafe_sum_1500_max_price_avg,cafe_avg_price_1500,cafe_count_1500_na_price,cafe_count_1500_price_500,cafe_count_1500_price_1000,cafe_count_1500_price_1500,cafe_count_1500_price_2500,cafe_count_1500_price_4000,cafe_count_1500_price_high,big_church_count_1500,church_count_1500,mosque_count_1500,leisure_count_1500,sport_count_1500,market_count_1500,green_part_2000,prom_part_2000,office_count_2000,office_sqm_2000,trc_count_2000,trc_sqm_2000,cafe_count_2000,cafe_sum_2000_min_price_avg,cafe_sum_2000_max_price_avg,cafe_avg_price_2000,cafe_count_2000_na_price,cafe_count_2000_price_500,cafe_count_2000_price_1000,cafe_count_2000_price_1500,cafe_count_2000_price_2500,cafe_count_2000_price_4000,cafe_count_2000_price_high,big_church_count_2000,church_count_2000,mosque_count_2000,leisure_count_2000,sport_count_2000,market_count_2000,green_part_3000,prom_part_3000,office_count_3000,office_sqm_3000,trc_count_3000,trc_sqm_3000,cafe_count_3000,cafe_sum_3000_min_price_avg,cafe_sum_3000_max_price_avg,cafe_avg_price_3000,cafe_count_3000_na_price,cafe_count_3000_price_500,cafe_count_3000_price_1000,cafe_count_3000_price_1500,cafe_count_3000_price_2500,cafe_count_3000_price_4000,cafe_count_3000_price_high,big_church_count_3000,church_count_3000,mosque_count_3000,leisure_count_3000,sport_count_3000,market_count_3000,green_part_5000,prom_part_5000,office_count_5000,office_sqm_5000,trc_count_5000,trc_sqm_5000,cafe_count_5000,cafe_sum_5000_min_price_avg,cafe_sum_5000_max_price_avg,cafe_avg_price_5000,cafe_count_5000_na_price,cafe_count_5000_price_500,cafe_count_5000_price_1000,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,month,dow,rel_floor,rel_kitch_sq,room_size,avg_price_ID_metro,avg_price_ID_railroad_station_walk,avg_price_ID_big_road1,avg_price_ID_big_road2,avg_price_ID_railroad_terminal,avg_price_ID_bus_terminal,price_doc,avg_price_sub_area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1
30469,55,44,27,7,9,1975,2,6,3,10053051,175518,0,0,9753,5088,4,10311,12721,4,0,940,1,0,10,3,0,9,6,61396,27916,33480,21400,11094,10306,112133,59089,53044,41985,12703,29282,9753,5044,4709,10311,5335,4976,23849,12412,11437,11588,5359,6229,18732,9663,9069,282,35,0,0,20,5,222,0,0,0,282,0,0,14,246,22,1,0,8,0,0,0,1,0,0,22,1,1,3,37,3,5,1,0,1,0,3,8,10,11,12,3,3,1,11,15,15,4,2,2,6,1,8,0,0,2,0,6,3,3,4,1,2,1,0,1,0,0,1,0,1,7,4,4,0,3,0,0,0,2,41970,8,614,1071,842,1,1,4,2,0,0,0,0,0,0,0,1,0,10,7,0,0,3,79970,20,666,1138,902,2,5,7,4,2,0,0,0,1,0,0,3,1,8,10,5,217042,6,109230,28,646,1115,880,2,6,12,6,2,0,0,2,2,0,0,11,2,9,18,6,224342,14,357757,43,635,1102,869,4,11,18,7,2,1,0,2,5,1,0,15,3,13,21,17,410183,22,745130,79,647,1112,880,8,20,30,15,5,1,0,5,12,1,0,29,3,15,17,44,838601,53,2548292,207,689,1156,923,18,63,59,47,15,5,0,15,26,1,2,84,6,6,1,0,0,13,6958287,6816849,6798686,6573570,7265532,8817570,7400000,7127992
30470,55,86,59,3,9,1935,4,10,3,7307410,75377,0,0,4237,1874,4,6398,6772,4,1,1046,3,2,29,16,10,23,141,116742,52836,63906,11272,5470,5802,43921,21901,22020,20184,6644,13540,4237,2079,2158,6398,3094,3304,12508,6065,6443,23480,11491,11989,9955,4835,5120,651,19,27,4,529,25,41,0,5,1,650,263,105,154,71,57,1,0,9,0,0,0,1,0,0,13,1,8,0,10,1,2,83,0,1,1,13,1,1,1,3,1,1,0,1,4,10,7,1,3,5,3,12,1,0,1,2,2,1,0,0,1,0,0,0,0,0,0,0,0,2,0,1,3,0,5,0,14,392972,0,0,34,777,1322,1050,3,7,11,6,7,0,0,0,1,0,1,1,0,1,11,56,1143488,5,89090,160,699,1183,941,7,50,51,30,20,1,1,2,8,0,1,8,0,2,8,105,1815698,7,121090,292,762,1269,1015,17,87,77,65,38,6,2,2,13,0,6,16,1,5,7,152,2186592,10,145077,444,796,1321,1059,29,125,115,100,58,13,4,9,28,0,17,36,2,6,11,299,4208928,28,845372,1039,895,1476,1186,54,268,258,230,155,57,17,35,62,1,48,80,4,8,12,617,9949843,90,4345915,2197,887,1462,1175,136,550,535,511,313,128,24,98,182,1,82,171,15,6,1,0,0,14,10456818,14882299,9577567,6418487,8505242,8609262,25000000,7113850
30471,55,45,28,10,20,2000,1,1,1,25536296,4001,0,0,275,118,0,264,2519,0,0,747,0,0,0,0,0,1,0,17790,8350,9443,574,297,277,2566,1356,1211,861,244,617,275,143,133,264,136,128,646,336,311,3796,2035,1762,506,261,245,123,33,-33,0,70,1,-31,0,-2,0,123,0,-6,109,-6,8,2,1,20,1,0,1,4,0,0,16,2,19,3,44,3,4,24,0,7,0,7,17,19,21,21,2,3,1,25,17,29,17,9,5,10,3,15,6,1,2,2,13,8,7,25,2,12,9,1,4,1,1,1,1,12,13,9,4,0,3,3,0,0,0,0,2,1000,1750,1375,0,0,1,0,1,0,0,0,0,0,0,0,0,36,2,0,0,0,0,2,1000,1750,1375,0,0,1,0,1,0,0,0,0,0,0,0,0,43,1,0,0,0,0,3,833,1500,1166,0,0,2,0,1,0,0,1,1,0,0,0,0,38,3,0,0,2,22000,7,757,1285,1021,0,1,3,2,1,0,0,1,2,0,0,3,0,41,2,0,0,2,22000,9,700,1222,961,0,1,5,2,1,0,0,1,4,0,0,6,0,35,6,1,117300,4,201300,20,747,1263,1005,1,4,8,5,1,1,0,2,12,0,1,11,1,6,1,0,0,17,6166237,5264685,5337575,5815015,6932084,6331644,6970959,6215716
30472,55,64,32,5,15,2003,2,11,2,6050064,78616,0,0,4215,2372,6,4635,6083,8,0,3300,2,1,11,1,0,4,5,83844,36656,47188,9414,4815,4599,51445,25003,26442,17757,5579,12178,4215,2161,2054,4635,2364,2271,10896,5572,5324,15835,7398,8437,8301,4219,4082,185,38,0,0,4,9,134,0,0,0,186,0,0,84,36,66,3,2,24,2,0,0,1,0,2,11,3,9,6,82,8,9,105,0,3,0,2,8,11,12,13,0,2,4,15,5,19,8,0,1,6,3,2,2,0,0,2,2,2,1,24,2,1,2,1,1,1,0,1,0,2,2,4,1,0,14,0,0,0,0,0,3,1000,1500,1250,0,0,0,3,0,0,0,0,1,0,0,3,0,23,0,0,0,0,0,13,753,1269,1011,0,1,6,5,1,0,0,0,5,0,0,10,0,32,0,1,37800,1,28800,42,646,1097,871,1,15,13,8,5,0,0,0,6,0,0,15,0,32,0,2,107800,10,136296,67,704,1195,950,3,17,23,15,9,0,0,1,12,0,2,18,2,30,1,15,473168,25,481350,115,681,1152,917,7,32,37,26,13,0,0,2,17,1,2,33,4,30,9,39,1225712,45,1464521,230,703,1182,942,11,60,77,58,22,1,1,6,31,1,4,65,7,6,1,0,0,16,8917337,10381112,11765799,11270697,7042021,6331644,13500000,13924198
30473,55,43,28,1,9,1968,2,6,2,4395332,94561,0,0,6120,2215,4,6533,5824,4,0,1015,2,0,7,1,0,5,1,72131,34296,37835,13523,6724,6799,56908,27219,29689,24130,7105,17025,6120,3096,3024,6533,3192,3341,14994,7422,7572,17070,7717,9353,11903,5928,5975,304,108,2,0,105,4,85,0,0,0,303,1,2,220,66,14,0,0,5,0,0,0,0,0,0,10,1,8,2,24,2,3,31,0,3,0,1,6,9,10,11,1,2,0,12,5,3,14,0,1,5,1,14,1,0,2,1,3,0,0,8,0,2,3,0,2,0,0,0,0,8,0,0,2,0,0,1,0,0,3,6906,4,400,750,575,0,2,2,0,0,0,0,1,0,0,4,0,0,3,5,0,0,4,25106,11,581,1045,813,0,3,6,1,1,0,0,2,1,0,4,3,0,19,2,0,0,5,39106,17,600,1058,829,0,4,9,3,1,0,0,2,1,0,4,6,1,23,1,0,0,5,39106,26,538,942,740,0,10,11,4,1,0,0,2,3,0,5,12,2,28,3,6,155237,13,545023,47,595,1054,825,1,13,24,6,2,1,0,3,7,0,7,26,4,25,10,15,351244,22,646575,93,664,1127,896,3,26,35,22,5,2,0,7,16,0,9,54,10,6,1,0,0,14,6172015,6032815,7149038,7800643,6311649,6226479,5600000,6168623


In [None]:
pairs = []

for col in columns:
    if not np.issubdtype(df_train[col].dtype, np.number):continue
    corrs = [(col, c, abs(df_train[col].corr(df_train[c]))) for c in df_train.columns.values.tolist() if c != col]
    corrs.sort(key=lambda item: item[2], reverse=True)
    for item in corrs:
        if item[2] > 0.3:
            pairs.append(item)
        else:
            break
df = pd.DataFrame(pairs, column=['missing_col', 'highest corr with', 'corr'])

In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30404 entries, 1 to 30473
Columns: 296 entries, usdrub to avg_price_sub_area
dtypes: float64(129), int64(152), object(15)
memory usage: 68.9+ MB


In [15]:
pairs = []
for col in columns:
    if not np.issubdtype(df_train[col].dtype, np.number):continue
    corrs = [(col, c, abs(df_train[col].corr(df_train[c]))) for c in df_train.columns.values.tolist() if c != col]
    corrs.sort(key=lambda item: item[2], reverse=True)
    for item in corrs:
        if item[2] > min_corr:
            pairs.append(item)
        else:
            break
df = pd.DataFrame(pairs, columns=['missing_col', 'highest corr with', 'corr'])

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [6]:
pick_highly_correlated_features(df_train, columns, 0.3)

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
pick_highly_correlated_IVs(df_train, 'price_doc', 0.3)