In [2]:
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")
import sys
import datetime
import scipy as sp
import statsmodels.stats.api as sms
import statsmodels.api as sm
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import utils.preprocessing as pp 
import utils.correlation as cr
import utils.statsmodel_helper as st

df_macro = pd.read_csv('./data/macro.csv', parse_dates=['timestamp'])
df_train = pd.read_csv('./data/train.csv', index_col=0, parse_dates=['timestamp'])
df_test = pd.read_csv('./data/test.csv', index_col=0, parse_dates=['timestamp'])

min_corr = 0.3

In [2]:
# NOTE(review): everything in this cell is commented-out draft code; nothing here executes.
# Presumably these helpers were moved into utils.correlation (imported as `cr`) — TODO confirm,
# then delete this cell.
# If pick_highly_correlated_IVs is ever revived: its `Exception(...)` is constructed but never
# raised, so the non-numeric guard has no effect; it needs `raise`.
# The Korean messages below read, in English:
#   "{} is not numeric data."
#   "There are too few unique values in {} to compute a correlation."
# def pick_highly_correlated_features(df, columns, min_corr):
#     pairs = []
#     for col in columns:
#         if not np.issubdtype(df[col].dtype, np.number):continue
#         corrs = [(col, c, abs(df[col].corr(df[c]))) for c in df.columns.values.tolist() if c != col]
#         corrs.sort(key=lambda item: item[2], reverse=True)
#         for item in corrs:
#             if item[2] > min_corr:
#                 pairs.append(item)
#             else:
#                 break
#     return pd.DataFrame(pairs, columns=['missing_col', 'highest corr with', 'corr'])

# def pick_highly_correlated_IVs(df, target_col, min_corr, min_unique_values = 0):
#     if not np.issubdtype(df[target_col].dtype, np.number):
#         Exception('{}은 numeric data가 아닙니다.'.format(target_col))
#     # if len(df[col].value_counts().index) < min_unique_values:
#     #     Exception('{}로 상관관계를 계산하기에는 유니크한 값이 너무 작습니다.'.format(col))

#     corrs = []
#     for col in df._get_numeric_data().drop(target_col, axis=1).columns:
#         if len(df[col].value_counts().index) < min_unique_values: continue
#         corr = abs(df[target_col].corr(df[col]))
#         if corr > min_corr:
#             corrs.append((col, corr))
    
#     return corrs 

In [5]:
# Keep a handle on the full set of training column labels (a pandas Index).
# NOTE(review): `cols` is not referenced by any other cell visible here — confirm it is still needed.
cols = df_train.columns

In [12]:
# Names of the numeric feature columns, with the target `price_doc` excluded.
# Uses the public `select_dtypes` API instead of the private
# `DataFrame._get_numeric_data()`; 'bool' is listed alongside 'number' because
# the private helper also treats boolean columns as numeric.
numeric_cols = (
    df_train
    .select_dtypes(include=['number', 'bool'])
    .columns
    .drop('price_doc')
    .tolist()
)
numeric_cols

['full_sq',
 'life_sq',
 'floor',
 'max_floor',
 'material',
 'build_year',
 'num_room',
 'kitch_sq',
 'state',
 'area_m',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'preschool_quota',
 'preschool_education_centers_raion',
 'children_school',
 'school_quota',
 'school_education_centers_raion',
 'school_education_centers_top_20_raion',
 'hospital_beds_raion',
 'healthcare_centers_raion',
 'university_top_20_raion',
 'sport_objects_raion',
 'additional_education_raion',
 'culture_objects_top_25_raion',
 'shopping_centers_raion',
 'office_raion',
 'full_all',
 'male_f',
 'female_f',
 'young_all',
 'young_male',
 'young_female',
 'work_all',
 'work_male',
 'work_female',
 'ekder_all',
 'ekder_male',
 'ekder_female',
 '0_6_all',
 '0_6_male',
 '0_6_female',
 '7_14_all',
 '7_14_male',
 '7_14_female',
 '0_17_all',
 '0_17_male',
 '0_17_female',
 '16_29_all',
 '16_29_male',
 '16_29_female',
 '0_13_all',
 '0_13_male',
 '0_13_female',
 'raion_build_count_with_mater

In [73]:
# Correlated feature pairs: for every numeric feature, list the other numeric
# features whose absolute Pearson correlation with it exceeds the cut-off,
# strongest first.
# NOTE(review): the cut-off is 0.2 here although the setup cell defines
# `min_corr = 0.3`; 0.2 is kept so the recorded output stays valid — confirm
# which value is intended.
pair_corr_cutoff = 0.2

# Compute the whole correlation matrix once rather than calling Series.corr
# for every ordered pair of columns (which evaluated each pair twice).
abs_corr = df_train[numeric_cols].corr().abs()

pairs = []
for col in numeric_cols:
    others = abs_corr[col].drop(col)  # a feature is never paired with itself
    # Filter with a boolean mask rather than sort-then-break: a NaN correlation
    # (e.g. from a constant column) compares False against the cut-off, so the
    # old early `break` could silently drop valid pairs that sorted after it.
    strong = others[others > pair_corr_cutoff].sort_values(ascending=False, kind='mergesort')
    pairs.extend((col, other, value) for other, value in strong.items())

df = pd.DataFrame(pairs, columns=['missing_col', 'highest_corr_with', 'corr'])
df.tail()

Unnamed: 0,missing_col,highest_corr_with,corr
32761,market_count_5000,build_count_slag,0.213024
32762,market_count_5000,ID_big_road2,0.21275
32763,market_count_5000,preschool_quota,0.207418
32764,market_count_5000,cafe_count_500_price_high,0.2019
32765,market_count_5000,cafe_count_1000_price_high,0.201449


In [85]:
# Spot check: correlation (Series.corr, Pearson by default) between the
# `material` and `price_doc` columns; the recorded result is about 0.064.
df_train['material'].corr(df_train['price_doc'])

0.06404720198338729

In [93]:
# Distinct features that have at least one partner with absolute correlation above 0.99.
df.loc[df['corr'].gt(0.99), 'missing_col'].unique()

array(['raion_popul', 'children_preschool', 'children_school', 'full_all',
       'male_f', 'female_f', 'young_all', 'young_male', 'young_female',
       'work_all', 'work_male', 'work_female', 'ekder_all', 'ekder_male',
       'ekder_female', '0_6_all', '0_6_male', '0_6_female', '7_14_all',
       '7_14_male', '7_14_female', '0_17_all', '0_17_male', '0_17_female',
       '16_29_all', '16_29_male', '16_29_female', '0_13_all', '0_13_male',
       '0_13_female', 'raion_build_count_with_material_info',
       'raion_build_count_with_builddate_info', 'metro_min_walk',
       'metro_km_walk', 'school_km', 'railroad_station_walk_km',
       'railroad_station_walk_min', 'public_transport_station_km',
       'public_transport_station_min_walk', 'ttk_km', 'sadovoe_km',
       'bulvar_ring_km', 'kremlin_km', 'preschool_km',
       'cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
       'cafe_avg_price_500', 'cafe_count_1000',
       'cafe_sum_1000_min_price_avg', 'cafe_sum_1000_max_pr

In [67]:
# Count how many strongly-correlated pairs each feature appears in as the
# partner column, most frequent first.
# The column is named 'highest_corr_with' (underscores) in the frame built by
# the pair-finding cell; the previous key 'highest corr with' raised KeyError
# when the notebook was run top to bottom.
df.groupby('highest_corr_with').size().sort_values(ascending=False)

highest corr with
bulvar_ring_km                           214
market_count_5000                        214
zd_vokzaly_avto_km                       213
sadovoe_km                               213
kremlin_km                               213
museum_km                                211
sport_count_5000                         211
trc_sqm_5000                             210
trc_count_5000                           208
sport_count_3000                         208
sport_count_2000                         206
market_count_3000                        205
sport_count_1500                         204
sport_objects_raion                      203
trc_count_3000                           203
ttk_km                                   201
detention_facility_km                    200
green_part_5000                          200
shopping_centers_raion                   198
trc_sqm_3000                             197
ice_rink_km                              197
trc_count_2000                       

In [80]:
# Absolute Pearson correlation of every numeric independent variable with the
# target column `price_doc`.
# Changes from the earlier version of this cell:
#   * the `len(value_counts) < 0` guard was removed — a length is never
#     negative, so it never skipped anything;
#   * numeric columns are selected with the public `select_dtypes` API instead
#     of the private `_get_numeric_data()` ('bool' is included because the
#     private helper also treats boolean columns as numeric).
corrs = []
for col in df_train.select_dtypes(include=['number', 'bool']).columns.drop('price_doc'):
    corr = abs(df_train['price_doc'].corr(df_train[col]))
    corrs.append((col, corr))

df_corrs = pd.DataFrame(corrs, columns=['col_name', 'corr'])
df_corrs

Unnamed: 0,col_name,corr
0,full_sq,0.34184
1,life_sq,0.165606
2,floor,0.117447
3,max_floor,0.094386
4,material,0.064047
5,build_year,0.002161
6,num_room,0.476337
7,kitch_sq,0.028718
8,state,0.121303
9,area_m,0.166981


In [81]:
# Features whose absolute correlation with the target is below 0.1.
df_corrs[df_corrs['corr'].lt(0.1)]

Unnamed: 0,col_name,corr
3,max_floor,0.094386
4,material,0.064047
5,build_year,0.002161
7,kitch_sq,0.028718
11,green_zone_part,0.094486
12,indust_part,0.082614
14,preschool_quota,0.075118
17,school_quota,0.014012
24,additional_education_raion,0.057689
25,culture_objects_top_25_raion,0.044296
