In [1]:
%matplotlib inline
import datetime
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
import statsmodels.stats.api as sms
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

class SMWrapper(BaseEstimator, RegressorMixin):
    """Scikit-learn style adapter around a statsmodels regression class.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(y, X)``; ``fit()`` is then called on the
        returned model and the result is kept for prediction.
    fit_intercept : bool, default True
        When true, ``sm.add_constant`` is applied to ``X`` in both ``fit``
        and ``predict``.

    Attributes
    ----------
    model_ : the model instance built in ``fit``.
    results_ : the object returned by ``model_.fit()``.
    """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit the wrapped model on ``X``/``y`` and return ``self``."""
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        # The scikit-learn estimator contract requires fit() to return the
        # estimator so it can be chained and used inside meta-estimators.
        return self

    def predict(self, X):
        """Return predictions of the fitted results for ``X``."""
        if self.fit_intercept:
            # NOTE(review): add_constant is called with its default settings
            # in both fit and predict; confirm the design matrices keep the
            # same number of columns when X already holds a constant column.
            X = sm.add_constant(X)
        return self.results_.predict(X)

# Load the train/test frames cached as CSV. These two files are written by
# the `to_csv` calls in the merge cell further down, and both names are
# re-assigned there, so this cell only matters when the preprocessing cells
# are skipped.
# NOTE(review): on a fresh checkout these files presumably do not exist yet.
df_train_macro = pd.read_csv('./data/train_macro.csv', index_col=0, parse_dates=['timestamp'])
df_test_macro = pd.read_csv('./data/test_macro.csv', index_col=0, parse_dates=['timestamp'])

# NOTE(review): neither constant is referenced in the visible notebook;
# confirm whether they are still needed.
degree = 2
skewness_limit = 1

In [2]:
# Raw inputs: the first CSV column becomes the index and `timestamp` is parsed as a date.
df_train = pd.read_csv('./data/train.csv', index_col=0, parse_dates=['timestamp'])
df_test = pd.read_csv('./data/test.csv', index_col=0, parse_dates=['timestamp'])
df_macro = pd.read_csv('./data/macro.csv', index_col=0, parse_dates=['timestamp'])

In [3]:
# %%writefile preprocessing.py

def clean_data(df):
    """Null out implausible property values and drop unusable columns, in place.

    All changes are applied to ``df`` itself, so callers that ignore the
    return value still see the cleaned frame; the same object is returned.

    Steps, in order:
      * out-of-range or zero values in build_year, floor, max_floor, full_sq,
        life_sq, kitch_sq and num_room become NaN;
      * ``state == 33`` is recoded to 3;
      * missing ``material`` is filled with 0;
      * columns holding a single distinct non-null value are deleted;
      * the string 'no data' is replaced by 'nodata';
      * three columns with mostly missing data are deleted when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # build_year before 1500 or after 2016 -> NaN
    df.loc[df.build_year < 1500, 'build_year'] = np.nan
    df.loc[df.build_year > 2016, 'build_year'] = np.nan

    # floor == 0 -> NaN
    df.loc[df.floor == 0, 'floor'] = np.nan

    # max_floor == 0 -> NaN
    df.loc[df.max_floor == 0, 'max_floor'] = np.nan

    # a flat cannot sit above the top floor: floor > max_floor -> max_floor NaN
    df.loc[df.floor > df.max_floor, 'max_floor'] = np.nan

    # full_sq / life_sq == 0 -> NaN
    df.loc[df.full_sq == 0, 'full_sq'] = np.nan
    df.loc[df.life_sq == 0, 'life_sq'] = np.nan

    # living area larger than total area -> life_sq NaN
    df.loc[df.life_sq > df.full_sq, 'life_sq'] = np.nan

    # kitchen larger than living area -> kitch_sq NaN
    df.loc[df.kitch_sq > df.life_sq, 'kitch_sq'] = np.nan

    df.loc[df.state == 33, 'state'] = 3

    df.loc[df.num_room < 0, 'num_room'] = np.nan

    # Assign back instead of a chained in-place fillna, which may operate on
    # a temporary copy and leave df unchanged.
    df['material'] = df['material'].fillna(0)

    # Columns with a single distinct non-null value carry no information.
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    for col in constant_cols:
        del df[col]

    # In place: rebinding `df` to the copy returned by replace() would hide
    # this change and the deletions below from callers that ignore the
    # return value.
    df.replace(['no data'], ['nodata'], inplace=True)

    # Drop columns with more than 50% missing data.
    mostly_missing = (
        'provision_retail_space_sqm',
        'theaters_viewers_per_1000_cap',
        'museum_visitis_per_100_cap',
    )
    for col in mostly_missing:
        if col in df:
            del df[col]

    return df


def macro_odd_data(df):
    """Delete three macro columns from ``df`` in place and return ``df``.

    Raises KeyError if one of the columns is absent.
    """
    for unwanted in ('modern_education_share',
                     'old_education_build_share',
                     'child_on_acc_pre_school'):
        del df[unwanted]
    return df



def col_renames(df):
    """Rename the hyphenated ``build_count_*`` columns to underscore form, in place.

    Returns the same ``df`` object.
    """
    hyphen_to_underscore = {
        'build_count_1921-1945': 'build_count_1921_1945',
        'build_count_1946-1970': 'build_count_1946_1970',
        'build_count_1971-1995': 'build_count_1971_1995',
    }
    df.rename(columns=hyphen_to_underscore, inplace=True)
    return df

def del_many_unique(df):
    """Delete identifier-like and high-cardinality columns from ``df`` in place.

    Returns the same ``df`` object; raises KeyError if a listed column is absent.
    """
    # Numerically coded categorical variables with too many unique values.
    id_like = (
        'ID_railroad_station_walk',
        'ID_railroad_station_avto',
        'ID_big_road1',
        'ID_big_road2',
        'ID_railroad_terminal',
        'ID_bus_terminal',
        'ID_metro',
    )
    # sub_area would produce too many dummy variables.
    others = ('sub_area', 'prom_part_3000')
    for name in id_like + others:
        del df[name]
    return df

def categorize(df):
    """Recode the numeric ``material`` column as letter categories, in place.

    Codes 0-6 map to 'a'-'g'; any other value is left unchanged.
    Nothing is returned.
    """
    material_labels = {
        0.0: 'a', 1.0: 'b', 2.0: 'c', 3.0: 'd',
        4.0: 'e', 5.0: 'f',
        # Previously 6.0 was mapped to 'e', silently merging it with code 4.0.
        6.0: 'g',
    }
    # Built-in `object` instead of the `np.object` alias, which no longer
    # exists in current NumPy releases.
    df['material'] = df['material'].astype(object).replace(material_labels)
# def find_missing_data_columns(df):
#     missing_df = df.isnull().sum(axis=0).reset_index()
#     missing_df.columns = ['missing_column', 'missing_count']
#     missing_df = missing_df.loc[missing_df['missing_count'] > 0]
#     return missing_df


def impute_num_mode(df):
    """Fill missing values of every numeric column with that column's mean, in place.

    Despite the name, the fill value is the mean, not the mode.
    Non-numeric columns are left untouched. Nothing is returned.
    """
    numeric = df.select_dtypes(include='number')
    for col in numeric.columns[numeric.isnull().any()]:
        # Assign back rather than a chained in-place fillna, which may act
        # on a temporary copy and leave df unchanged.
        df[col] = df[col].fillna(df[col].mean())

def imput_cat_mode(df):
    """Fill missing values with each column's most frequent value, in place.

    Every column that still contains nulls is processed, so numeric columns
    should be imputed beforehand if they need a different fill value.
    Columns with no non-null value are left as they are. Nothing is returned.
    """
    for col in df.columns[df.isnull().any()].tolist():
        modes = df[col].mode()
        if modes.empty:
            # All values are missing: there is no mode to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])
        
def apply_log(df, numeric_cols):
    """Shift each listed column to be >= 1, then replace it by its natural log, in place.

    A column whose minimum is negative is shifted by ``1 - min``; any other
    column is shifted by 1. The log of the shifted values is written back to
    ``df`` (earlier the log was computed and then discarded). Nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
    numeric_cols : iterable of column labels to transform.
    """
    for col in numeric_cols:
        min_val = df[col].min()  # NaN values are ignored
        shifted = df[col] - min_val + 1 if min_val < 0 else df[col] + 1
        df[col] = np.log(shifted)

def scale_up_positive(df, numeric_cols):
    """Shift each listed column in place so that its smallest value is at least 1.

    Columns with a negative minimum are first moved up by that minimum;
    every column then gets 1 added. Nothing is returned.

    NOTE: the same function is defined again further down in this cell; the
    later definition is the one bound after the cell runs.
    """
    for name in numeric_cols:
        lowest = min(df[name].value_counts().index)
        if lowest < 0:
            df[name] -= lowest
        df[name] += 1
            
def remove_outliers(df, formula, repeat=1):
    """Iteratively drop high-influence rows from ``df`` using Cook's distance.

    Each round fits ``sm.OLS.from_formula(formula, data=df)``, computes
    Cook's distance for every fitted row and drops, in place, the rows whose
    distance exceeds ``4 / (n - k - 1)``. Here ``n`` is the number of fitted
    rows and ``k`` is the number of columns in ``df`` other than
    'timestamp' and 'price_doc'.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    formula : str
        Patsy formula for the OLS model.
    repeat : int, default 1
        Number of fit-and-drop rounds.

    Returns
    -------
    (df, model, result)
        ``model`` and ``result`` come from the last round, i.e. they were
        fitted before that round's rows were dropped. Both are None when
        ``repeat`` is 0.
    """
    model = None
    result = None
    for _ in range(repeat):
        model = sm.OLS.from_formula(formula, data=df)
        result = model.fit()
        distances, _pvalues = result.get_influence().cooks_distance
        # errors='ignore': the two bookkeeping columns may be absent from df.
        n_features = len(df.columns.drop(['timestamp', 'price_doc'], errors='ignore'))
        threshold = 4 / (len(distances) - n_features - 1)
        # Map distances back through the labels of the rows actually used in
        # the fit, so rows skipped by the formula machinery (e.g. missing
        # values) cannot shift the positions.
        row_labels = model.data.row_labels
        if row_labels is None:
            row_labels = df.index
        outlier_labels = row_labels[np.asarray(distances) > threshold]
        df.drop(outlier_labels, inplace=True)
    return df, model, result

def remove_features_by_vif(df):
    """Delete a fixed list of features from ``df`` in place and return ``df``.

    Listed features that are not present in ``df`` are skipped.
    NOTE(review): judging by the function name, the list was presumably
    chosen by variance inflation factor; the selection itself is not
    computed here.
    """
    high_vif_features = (
        'raion_popul',
        'cafe_count_3000',
        'cafe_count_5000',
        'cafe_count_2000',
        'kremlin_km',
        'sadovoe_km',
        'cafe_count_1500',
        '0_17_all',
        'cafe_sum_1500_max_price_avg',
        'bulvar_ring_km',
        'cafe_count_5000_price_1000',
        'school_km',
        'cafe_count_5000_price_2500',
        'cafe_count_5000_price_1500',
        'cafe_count_3000_price_1500',
        'office_count_5000',
        'cafe_count_1000',
        'cafe_count_3000_price_500',
        'office_count_3000',
        'cafe_count_3000_price_2500',
        'ttk_km',
        'cafe_count_2000_price_1500',
        'cafe_count_2000_price_500',
        'cafe_count_5000_price_500',
        'avg_price_ID_railroad_terminal',
        'office_count_2000',
        'church_count_5000',
        'cafe_count_2000_price_2500',
        'cafe_count_3000_price_1000',
        'cafe_count_1500_price_1500',
        'cafe_count_5000_na_price',
        'cafe_count_2000_price_1000',
        'zd_vokzaly_avto_km',
        'church_count_3000',
        'work_all',
        'oil_chemistry_km',
        'cafe_count_1500_price_500',
        'cafe_count_5000_price_4000',
        'avg_price_ID_bus_terminal',
        'cafe_count_3000_na_price',
        'office_count_1500',
        'cafe_count_3000_price_4000',
        'trc_count_5000',
        'leisure_count_5000',
        'cafe_count_1500_price_1000',
        'sport_count_5000',
        'metro_min_walk',
        'big_church_count_3000',
        'radiation_km',
        'cafe_count_2000_na_price',
        'cafe_count_1500_price_2500',
        'avg_price_sub_area',
        'basketball_km',
        'avg_price_ID_big_road1',
        'green_part_3000',
        'preschool_education_centers_raion',
        'cafe_count_500',
        'avg_price_ID_railroad_station_walk',
        'cafe_count_1000_price_1500',
        'exhibition_km',
        'sport_count_3000',
        'avg_price_ID_big_road2',
        'leisure_count_3000',
        'preschool_km',
        'office_sqm_3000',
        'museum_km',
        'power_transmission_line_km',
        'stadium_km',
        'cafe_count_2000_price_high',
        'university_km',
        'cafe_count_1500_na_price',
        'workplaces_km',
        'mosque_km',
        'trc_count_3000',
        'sport_count_2000',
        'office_count_1000',
        'office_sqm_5000',
        'young_all',
        'thermal_power_plant_km',
        'swim_pool_km',
        'railroad_station_avto_min',
        'green_part_5000',
        'cafe_count_3000_price_high',
        'trc_count_2000',
        'cafe_count_1000_price_1000',
        'detention_facility_km',
        'big_church_km',
        'num_room',
        'cafe_avg_price_1500',
        'office_sqm_2000',
        'shopping_centers_km',
        'office_raion',
        'nuclear_reactor_km',
        'avg_price_ID_metro',
        'trc_sqm_5000',
        'park_km',
        'sport_objects_raion',
        'big_road2_km',
        'sport_count_1500',
        'state',
        'public_healthcare_km',
        'big_church_count_5000',
        'ts_km',
        'max_floor',
        'ekder_all',
        'bus_terminal_avto_km',
        'theater_km',
        'area_m',
        'cafe_count_1500_price_high',
        'office_sqm_1500',
        'cafe_count_1000_na_price',
    )
    for name in high_vif_features:
        if name not in df:
            continue
        df.pop(name)
    return df

def scale_up_positive(df, numeric_cols):
    """Shift the listed columns of ``df`` in place so every value is >= 1.

    For a column whose smallest observed value is negative, that value is
    subtracted before adding 1; any other column simply gets 1 added.
    Nothing is returned.
    """
    for column in numeric_cols:
        floor_value = min(df[column].value_counts().index)
        is_negative = floor_value < 0
        if is_negative:
            df[column] = df[column] - floor_value
        df[column] = df[column] + 1

def remove_features_by_high_corr(df):
    """Delete a fixed list of features from ``df`` in place.

    Features missing from ``df`` are skipped instead of raising KeyError,
    matching the behaviour of ``remove_features_by_vif``. Nothing is returned.
    NOTE(review): judging by the function name, the list presumably holds
    features that are highly correlated with others; the correlation itself
    is not computed here.
    """
    features_to_remove = [
        'children_preschool', 'children_school', 'male_f',
        'female_f', 'young_male', 'young_female', 'work_male',
        'work_female', 'ekder_male', 'ekder_female', '16_29_all',
        '0_6_all', '0_6_male', '0_6_female',
        '7_14_all', '7_14_male', '7_14_female', '0_17_male', '0_17_female',
        '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female',
        'metro_km_walk', 'railroad_station_walk_km',
        'railroad_station_avto_km', 'public_transport_station_km',
    ]
    for f in features_to_remove:
        if f in df:
            del df[f]
    

In [4]:
# Preprocess the training frame. Keep the frame returned by clean_data
# rather than relying on in-place mutation alone, so that every cleaning
# step is reflected in df_train.
df_train = clean_data(df_train)
col_renames(df_train)
del_many_unique(df_train)
categorize(df_train)
impute_num_mode(df_train)
# NOTE(review): this selection includes every int/float column, presumably
# also the target price_doc; confirm the target should be shifted too.
df_train_num = df_train.select_dtypes(include=['int', 'float']).columns
apply_log(df_train, df_train_num)
scale_up_positive(df_train, df_train_num)
remove_features_by_vif(df_train)
remove_features_by_high_corr(df_train)

In [5]:
# Preprocess the test frame with the same steps as the training frame.
# Keep the frame returned by clean_data so every cleaning step is reflected
# in df_test.
df_test = clean_data(df_test)
col_renames(df_test)
# Also applied to the training frame; skipping it here leaves train and test
# with different feature sets.
del_many_unique(df_test)
categorize(df_test)
impute_num_mode(df_test)
df_test_num = df_test.select_dtypes(include=['int', 'float']).columns
apply_log(df_test, df_test_num)
scale_up_positive(df_test, df_test_num)
remove_features_by_vif(df_test)
remove_features_by_high_corr(df_test)

In [6]:
# Drop the three columns removed by macro_odd_data, then impute the numeric
# macro columns; both calls modify df_macro in place.
macro_odd_data(df_macro)
impute_num_mode(df_macro)

Unnamed: 0_level_0,oil_urals,gdp_quart,gdp_quart_growth,cpi,ppi,gdp_deflator,balance_trade,balance_trade_growth,usdrub,eurrub,brent,net_capital_export,gdp_annual,gdp_annual_growth,average_provision_of_build_contract,average_provision_of_build_contract_moscow,rts,micex,micex_rgbi_tr,micex_cbi_tr,deposits_value,deposits_growth,deposits_rate,mortgage_value,mortgage_growth,mortgage_rate,grp,grp_growth,income_per_cap,real_dispos_income_per_cap_growth,salary,salary_growth,fixed_basket,retail_trade_turnover,retail_trade_turnover_per_cap,retail_trade_turnover_growth,labor_force,unemployment,employment,invest_fixed_capital_per_cap,invest_fixed_assets,profitable_enterpr_share,unprofitable_enterpr_share,share_own_revenues,overdue_wages_per_cap,fin_res_per_cap,marriages_per_1000_cap,divorce_rate,construction_value,invest_fixed_assets_phys,pop_natural_increase,pop_migration,pop_total_inc,childbirth,mortality,housing_fund_sqm,lodging_sqm_per_cap,water_pipes_share,baths_share,sewerage_share,gas_share,hot_water_share,electric_stove_share,heating_share,old_house_share,average_life_exp,infant_mortarity_per_1000_cap,perinatal_mort_per_1000_cap,incidence_population,rent_price_4+room_bus,rent_price_3room_bus,rent_price_2room_bus,rent_price_1room_bus,rent_price_3room_eco,rent_price_2room_eco,rent_price_1room_eco,load_of_teachers_preschool_per_teacher,load_of_teachers_school_per_teacher,students_state_oneshift,provision_doctors,provision_nurse,load_on_doctors,power_clinics,hospital_beds_available_per_cap,hospital_bed_occupancy_per_year,provision_retail_space_sqm,provision_retail_space_modern_sqm,turnover_catering_per_cap,theaters_viewers_per_1000_cap,seats_theather_rfmin_per_100000_cap,museum_visitis_per_100_cap,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1
2010-01-01,76.1000,,,,,,,,,,,,38807.2,-0.078209,5.00,,,,,,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-02,76.1000,,,,,,,,,,,,38807.2,-0.078209,5.00,,,,,175.15,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-03,76.1000,,,,,,,,,,,,38807.2,-0.078209,5.00,,,,,178.66,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-04,76.1000,,,,,,,,29.9050,43.4054,80.12,,38807.2,-0.078209,5.00,,,,,183.44,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-05,76.1000,,,,,,,,29.8360,42.9600,80.59,,38807.2,-0.078209,5.00,,,,,183.44,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-06,76.1000,,,,,,,,29.7150,42.9138,81.89,,38807.2,-0.078209,5.00,,,,,184.87,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-07,76.1000,,,,,,,,29.7750,42.7143,81.51,,38807.2,-0.078209,5.00,,,,,187.97,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-08,76.1000,,,,,,,,29.7750,42.9359,81.37,,38807.2,-0.078209,5.00,,,,,187.97,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-09,76.1000,,,,,,,,29.7750,42.9359,81.37,,38807.2,-0.078209,5.00,,,,,190.77,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2010-01-10,76.1000,,,,,,,,29.7750,42.9359,81.37,,38807.2,-0.078209,5.00,,,,,192.39,7484970,,,142968,,13.72,8375.8638,,30789.2,0.038,38410.5,,11443.63,2882.4169,251.484,106.8,6436.244,0.018,0.69,63932.0,732.760604,0.716,0.284,0.962915,12865.0,233.351529,8.0,3.9,553874.9,95.8,-0.3,14.1,13.8,10.7,11.0,216.0,18.715883,99.9,99.8,99.6,44.4,95.7,55.0,99.9,0.3,74.2,6.1,5.87,696.6,,,,,,,,721.477765,1356.112607,,18.0,99.4,7872.848285,162.9,,,,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,


In [9]:
# Attach the macro indicators of each sale date to the train/test rows,
# keeping the original row index.
df_train_macro = df_train.merge(df_macro, left_on='timestamp', right_on='timestamp', how='left').set_index(df_train.index)
df_test_macro = df_test.merge(df_macro, left_on='timestamp', right_on='timestamp', how='left').set_index(df_test.index)


def _formula_safe(col):
    """Strip characters that patsy formulas would parse as operators."""
    # Same mapping as the commented-out clean_column_names helper below;
    # needed for macro columns such as 'rent_price_4+room_bus'.
    return col.replace('-', '_').replace('+', '').replace(':', '')


df_train_macro = df_train_macro.rename(columns=_formula_safe)
df_test_macro = df_test_macro.rename(columns=_formula_safe)

# Move the target to the last column.
cols = list(df_train_macro.columns.values)
cols.remove('price_doc')
df_train_macro = df_train_macro[cols + ['price_doc']]

# Persist the merged frames. Writing df_train / df_test here would store the
# data without any macro columns under the *_macro file names.
df_train_macro.to_csv('./data/train_macro.csv', header=True, index=True)
df_test_macro.to_csv('./data/test_macro.csv', header=True, index=True)

In [None]:
# Preview the last rows of df_train_macro.
df_train_macro.tail()

In [None]:
# df_train.select_dtypes(include=['int', 'float']).columns
# df_train.select_dtypes(include=['object']).columns

In [None]:
# df_train_num_ls = list(df_train.select_dtypes(include=['int', 'float']).columns)

In [None]:
# # 다중공선성 확인
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# X = df_train[df_train_num_ls]

# vif = pd.DataFrame()
# vif["VifFactor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
# vif["features"] = X.columns
# vif.sort_values(by="VifFactor", ascending=False)

In [None]:
# model1= sm.OLS.from_formula("np.log(df_trian['price_doc']) ~ scale(vrank) + C(taster_name)\
# + scale(points)", wine_france)
# result1= model1.fit()
# print(result1.summary())

In [None]:
# Split the training columns by dtype: int/float columns versus object columns.
df_train_num_col = df_train.select_dtypes(include=['int', 'float']).columns
df_train_cat_col = df_train.select_dtypes(include=['object']).columns

# Sub-frames holding only the numeric / only the object-typed columns.
df_train_numeric = df_train[df_train_num_col]
df_train_categoric = df_train[df_train_cat_col]

In [None]:
# Split the test columns by dtype: int/float columns versus object columns.
df_test_num_col = df_test.select_dtypes(include=['int', 'float']).columns
df_test_cat_col = df_test.select_dtypes(include=['object']).columns

# Sub-frames holding only the numeric / only the object-typed columns.
df_test_numeric = df_test[df_test_num_col]
df_test_categoric = df_test[df_test_cat_col]

In [None]:
# def clean_column_names(cols):
#     cleaned_cols = [col.replace('-', '').replace('+', '').replace(':', '') for col in cols]
#     cleaned_cols = ['c_' + col if col[0].isdigit() else col for col in cleaned_cols]
#     return cleaned_cols

In [None]:
# def clean_column_names(cols):
#     cleaned_cols = [col.replace('-', '_').replace('+', '').replace(':', '') for col in cols]
#     cleaned_cols = ['c_' + col if col[0].isdigit() else col for col in cleaned_cols]
#     return cleaned_cols

In [None]:
# df_train_clean_num_col = clean_column_names(df_train_num_col)
# df_train_clean_num_col

In [None]:
# feature_names = list(boston.feature_names)
# feature_names.remove("CHAS") 
# feature_names = ["scale({})".format(name) for name in feature_names] + ["CHAS"]
# model3 = sm.OLS.from_formula("MEDV ~ " + "+".join(feature_names), data=df2)
# result3 = model3.fit()
# print(result3.summary())

In [None]:
# feature_names = ["scale({})".format(col) for col in df_train_num_col.drop]
# feature_names

In [None]:
# model2 = sm.OLS.from_formula("price_doc ~ " + "+".join([col for col in df_train_numeric.drop(columns=['price_doc']).columns]), data=df_train)
# model2.fit().summary()

In [None]:
# formula_numeric = ' + '.join([col for col in df_train_numeric.drop(columns=['price_doc']).columns])
# formula_numeric

In [None]:
# Baseline linear regression on df_train: every numeric column except the
# target enters as-is, every object column enters as a categorical C(...) term.
formula_numeric = ' + '.join(df_train_numeric.drop(columns=['price_doc']).columns)
formula_cate = ' + '.join('C(' + col + ')' for col in df_train_categoric.columns)

formula = ''.join(['price_doc ~ ', formula_numeric, ' + ', formula_cate])

model = sm.OLS.from_formula(formula, data=df_train)
model.fit().summary()

In [None]:
# Re-load the training frame from the CSV written by the merge cell above
# and preview its last rows.
df_train_macro = pd.read_csv('./data/train_macro.csv', index_col=0, parse_dates=['timestamp'])
df_train_macro.tail()
# df_test_macro = pd.read_csv('./data/test_macro.csv', index_col=0, parse_dates=['timestamp'])

In [None]:
# Split df_train_macro's columns by dtype: int/float columns versus object columns.
df_train_macro_num_col = df_train_macro.select_dtypes(include=['int', 'float']).columns
df_train_macro_cat_col = df_train_macro.select_dtypes(include=['object']).columns

# Sub-frames holding only the numeric / only the object-typed columns.
df_train_macro_numeric = df_train_macro[df_train_macro_num_col]
df_train_macro_categoric = df_train_macro[df_train_macro_cat_col]

In [None]:
# Preview the last rows of the numeric sub-frame.
df_train_macro_numeric.tail()

In [None]:
# OLS on df_train_macro: every numeric column except the target enters
# as-is, every object column enters as a categorical C(...) term.
formula_numeric = ' + '.join(df_train_macro_numeric.drop(columns=['price_doc']).columns)
formula_cate = ' + '.join('C(' + col + ')' for col in df_train_macro_categoric.columns)

formula = ''.join(['price_doc ~ ', formula_numeric, ' + ', formula_cate])

model = sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
result.summary()

In [None]:
# Type-2 ANOVA table (typ=2) for the fitted model.
anova = sm.stats.anova_lm(result, typ=2)
anova

Features with a p-value greater than or equal to 0.05 can be removed, since they have no statistically significant influence on the dependent variable.

# F-test and Feature Influence

In [None]:
import statsmodels.api as sm
import numpy as np

def by_f_test(df, formula, repeat=10, log_dv = True, alpha=0.05, dv='price_doc'):
    """Iteratively keep only the terms that are significant in a type-2 ANOVA.

    Each round fits ``formula`` with OLS, runs ``anova_lm(typ=2)`` and
    rebuilds the formula from the terms whose p-value is below ``alpha``.
    The loop stops early when no term is significant, instead of building
    an empty (invalid) formula.

    Parameters
    ----------
    df : pandas.DataFrame
        Data for the model.
    formula : str
        Starting patsy formula; it is used unchanged in the first round.
    repeat : int, default 10
        Maximum number of rounds.
    log_dv : bool, default True
        When true, rebuilt formulas use ``np.log(<dv>)`` as the response.
    alpha : float, default 0.05
        Significance threshold applied to the ANOVA p-values.
    dv : str, default 'price_doc'
        Name of the dependent variable used in rebuilt formulas.

    Returns
    -------
    (result, selected_ivs, formula)
        ``result`` is the last fitted model, ``selected_ivs`` the terms that
        were significant in that fit, and ``formula`` the formula rebuilt
        from them. If the last fit had no significant term, ``formula`` is
        the one that produced ``result``. ``result`` is None when ``repeat``
        is 0.
    """
    result = None
    selected_ivs = []
    response = 'np.log({})'.format(dv) if log_dv else dv
    for _ in range(repeat):
        result = sm.OLS.from_formula(formula, data=df).fit()
        anova = sm.stats.anova_lm(result, typ=2)
        # Select by column label rather than position; the Residual row has
        # a NaN p-value and therefore never passes the comparison.
        p_values = anova['PR(>F)']
        selected_ivs = list(p_values.index[p_values < alpha])
        if not selected_ivs:
            break
        formula = response + ' ~ ' + ' + '.join(selected_ivs)
    return result, selected_ivs, formula

In [None]:
# Run up to 5 rounds of ANOVA-based screening, starting from the current
# `formula`; `formula` is rebound to the formula returned by by_f_test.
result, sms_vars, formula = by_f_test(df_train_macro, formula, repeat=5)
result.summary()

# 3. Outlier

## Cook's Distance

Identify observations with high leverage and large residuals by computing Cook's distance.

In [None]:
def remove_outliers(df, formula, repeat=1):
    """Iteratively drop high-influence rows from ``df`` using Cook's distance.

    Each round fits ``sm.OLS.from_formula(formula, data=df)``, computes
    Cook's distance for every fitted row and drops, in place, the rows whose
    distance exceeds ``4 / (n - k - 1)``. Here ``n`` is the number of fitted
    rows and ``k`` is the number of columns in ``df`` other than
    'timestamp' and 'price_doc'.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    formula : str
        Patsy formula for the OLS model.
    repeat : int, default 1
        Number of fit-and-drop rounds.

    Returns
    -------
    (df, model, result)
        ``model`` and ``result`` come from the last round, i.e. they were
        fitted before that round's rows were dropped. Both are None when
        ``repeat`` is 0.
    """
    model = None
    result = None
    for _ in range(repeat):
        model = sm.OLS.from_formula(formula, data=df)
        result = model.fit()
        distances, _pvalues = result.get_influence().cooks_distance
        # errors='ignore': the two bookkeeping columns may be absent from df.
        n_features = len(df.columns.drop(['timestamp', 'price_doc'], errors='ignore'))
        threshold = 4 / (len(distances) - n_features - 1)
        # Map distances back through the labels of the rows actually used in
        # the fit, so rows skipped by the formula machinery (e.g. missing
        # values) cannot shift the positions.
        row_labels = model.data.row_labels
        if row_labels is None:
            row_labels = df.index
        outlier_labels = row_labels[np.asarray(distances) > threshold]
        df.drop(outlier_labels, inplace=True)
    return df, model, result

In [None]:
# Keep a deep copy of the full data (it is used again in the
# cross-validation cell), because remove_outliers drops rows from
# df_train_macro in place.
df_train_macro_with_outliers = df_train_macro.copy(deep=True)
df_train_macro, model, result = remove_outliers(df_train_macro, formula, repeat=5)
result.summary()

# 4. Regularization

## Lasso

Find variables with zero coefficient when Lasso regularization is applied.

In [None]:
# Regularized fit of the model returned by remove_outliers; L1_wt=1 selects
# a pure L1 (Lasso) penalty, with penalty weight alpha=0.001.
result_lasso = model.fit_regularized(alpha=0.001, L1_wt=1)

Let's remove features with zero coefficient to reduce dimensionality.

In [None]:
# Collect the formula terms whose Lasso coefficient is non-zero.
sms_vars = []
for feature, coef in result_lasso.params.items():
    if coef == 0 or feature == 'Intercept':
        continue
    # Strip a bracketed level suffix such as '[T.level]' so that all dummy
    # columns of one categorical variable map back to the same term.
    startDelPos = feature.find('[')
    endDelPos = feature.find(']')
    if startDelPos != -1 and endDelPos > startDelPos:
        feature = feature[:startDelPos] + feature[endDelPos + 1:]
    # A multi-level categorical yields several non-zero coefficients; list
    # its term only once.
    if feature not in sms_vars:
        sms_vars.append(feature)

In [None]:
# Refit plain OLS on log(price_doc) using only the terms collected in sms_vars.
formula = 'np.log(price_doc) ~ ' + " + ".join(sms_vars)
model = sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
result.summary()

# 5. Diagnosis of Regression

## Residual Normality Test

In [None]:
# Probability plot of the model residuals, drawn through pyplot.
sp.stats.probplot(result.resid, plot=plt)
plt.show()

In [None]:
# Omnibus normality test on the residuals; print the statistic and its p-value.
test = sms.omni_normtest(result.resid)
for label, value in zip(['Chi^2', 'P-value'], test):
    print("%-12s: %6.3f" % (label, value))

## Partial Regression Plot

Let's visualize the influence of a single independent variable.

In [None]:
# Grid of partial regression plots for the fitted model, on a tall figure.
fig = plt.figure(figsize=(10,70))
sm.graphics.plot_partregress_grid(result, fig=fig)
fig.suptitle("")  # blank out the figure-level title
plt.show()

# 6. Cross Validation

In [None]:
# Build one design matrix holding both the selected terms and the log
# target, from the data that still contains the outliers.
dm = dmatrix(" + ".join(sms_vars) + ' + np.log(price_doc)', df_train_macro_with_outliers, return_type="dataframe")
y = dm['np.log(price_doc)']
X = dm.drop(columns=['np.log(price_doc)'])

# Shuffled K-fold cross-validation with a fixed seed; report the mean R^2.
cv = KFold(n_splits=1000, shuffle=True, random_state=0)
r2s = cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2', cv=cv)
r2s.mean()