In [1]:
!mkdir -p utils

In [2]:
!tree utils

[01;34mutils[00m

0 directories, 0 files


In [3]:
!touch utils/__init__.py

In [4]:
!tree utils

[01;34mutils[00m
└── __init__.py

0 directories, 1 file


In [5]:
%%writefile utils/preprocessing.py

import numpy as np
import pandas as pd
import utils.correlation as corr
import statsmodels.api as sm
import utils.statsmodel_helper as smh
from sklearn.preprocessing import OneHotEncoder

def merge(df_1, df_2, on_col, move_idx=290):
    """Inner-join two frames on ``on_col`` and move one column to the end.

    Args:
        df_1: Left frame.
        df_2: Right frame.
        on_col: Name of the key column present in both frames.
        move_idx: Position (in the merged frame) of the column that is
            moved to the last position. The default of 290 is the value
            that used to be hard-coded here.
            NOTE(review): presumably this is the position of the target
            column in the training data -- confirm against the caller.

    Returns:
        The merged frame with the selected column moved to the end.

    Raises:
        IndexError: If the merged frame has no column at ``move_idx``.
    """
    # The key used to be passed twice (``on=[on_col, on_col]``); a single
    # key expresses the same join.
    merged = pd.merge(df_1, df_2, on=on_col)
    cols = merged.columns.tolist()
    cols.append(cols.pop(move_idx))
    return merged[cols]

def clean_data(df):
    """Blank out implausible values and drop unusable columns, in place.

    The frame must contain the columns ``build_year``, ``floor``,
    ``max_floor``, ``full_sq``, ``life_sq``, ``kitch_sq``, ``state``,
    ``num_room`` and ``material``. All changes are applied to ``df`` itself;
    the same object is also returned so the call can be chained.

    Args:
        df: Frame to clean (mutated).

    Returns:
        The cleaned frame (the same object as ``df``).
    """
    # build_year outside 1500..2016 is treated as missing.
    df.loc[df.build_year < 1500, 'build_year'] = np.nan
    df.loc[df.build_year > 2016, 'build_year'] = np.nan

    # A floor or max_floor of 0 is treated as missing.
    df.loc[df.floor == 0, 'floor'] = np.nan
    df.loc[df.max_floor == 0, 'max_floor'] = np.nan

    # A max_floor below the flat's own floor is treated as missing.
    df.loc[df.floor > df.max_floor, 'max_floor'] = np.nan

    # Areas of 0 are treated as missing.
    df.loc[df.full_sq == 0, 'full_sq'] = np.nan
    df.loc[df.life_sq == 0, 'life_sq'] = np.nan
    df.loc[df.kitch_sq == 0, 'kitch_sq'] = np.nan

    # life_sq larger than full_sq is treated as missing.
    df.loc[df.life_sq > df.full_sq, 'life_sq'] = np.nan

    # kitch_sq larger than life_sq is treated as missing.
    df.loc[df.kitch_sq > df.life_sq, 'kitch_sq'] = np.nan

    df.loc[df.state == 33, 'state'] = 3

    # The two original statements here (> 210 and > 200) used ``==`` and
    # therefore assigned nothing. As assignments the > 200 rule subsumes
    # the > 210 one, so a single statement is kept.
    df.loc[df.full_sq > 200, 'full_sq'] = np.nan

    df.loc[df.num_room < 0, 'num_room'] = np.nan

    # Assign back instead of ``fillna(inplace=True)`` on a column selection,
    # which does not reliably write through to the frame.
    df['material'] = df['material'].fillna(0)

    # Columns holding malformed numbers such as "45,34".
    for col in ('modern_education_share',
                'old_education_build_share',
                'child_on_acc_pre_school'):
        if col in df:
            del df[col]

    # Columns with a single distinct non-null value carry no information.
    consts = [col for col in df.columns if df[col].nunique() == 1]
    for const in consts:
        del df[const]

    # In place, so that this step and the deletions below reach the caller's
    # frame (rebinding ``df`` to a new frame used to lose them).
    df.replace(['no data'], ['nodata'], inplace=True)

    # Columns in which more than half of the values are missing.
    for col in ('provision_retail_space_sqm',
                'theaters_viewers_per_1000_cap',
                'museum_visitis_per_100_cap'):
        if col in df:
            del df[col]

    return df


def col_renames(df):
    """Rename the hyphenated ``build_count_*`` columns, in place.

    The three year-range columns get their hyphen replaced by an underscore.
    Columns that are not present are ignored.

    Args:
        df: Frame whose columns are renamed (mutated).

    Returns:
        The same frame object.
    """
    hyphen_to_underscore = {
        'build_count_1921-1945': 'build_count_1921_1945',
        'build_count_1946-1970': 'build_count_1946_1970',
        'build_count_1971-1995': 'build_count_1971_1995',
    }
    df.rename(columns=hyphen_to_underscore, inplace=True)
    return df

def del_many_unique(df):
    """Delete identifier-like columns with too many distinct values, in place.

    Args:
        df: Frame to modify (mutated). Every listed column must be present.

    Returns:
        The same frame object.

    Raises:
        KeyError: If one of the listed columns is missing; columns listed
            before it have already been deleted at that point.
    """
    doomed = (
        # Numeric-coded categorical variables with too many unique values.
        'ID_railroad_station_walk',
        'ID_railroad_station_avto',
        'ID_big_road1',
        'ID_big_road2',
        'ID_railroad_terminal',
        'ID_bus_terminal',
        'ID_metro',
        # Would produce too many dummy variables.
        'sub_area',
        'prom_part_3000',
    )
    for name in doomed:
        del df[name]
    return df

def categorize(df):
    """Turn the numeric ``material`` codes into letter categories, in place.

    Codes 0..6 are mapped to letters; any other value is left unchanged.

    Args:
        df: Frame with a ``material`` column (mutated).

    Returns:
        The same frame object.
    """
    # ``np.object`` was removed in NumPy 1.24; the builtin ``object`` is the
    # supported spelling of the same dtype.
    material = df['material'].astype(object)
    # NOTE(review): both 4.0 and 6.0 map to 'e' -- confirm whether 6.0 was
    # meant to get its own letter.
    df['material'] = material.replace(
        [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        ['a', 'b', 'c', 'd', 'e', 'f', 'e'],
    )
    return df
# def find_missing_data_columns(df):
#     missing_df = df.isnull().sum(axis=0).reset_index()
#     missing_df.columns = ['missing_column', 'missing_count']
#     missing_df = missing_df.loc[missing_df['missing_count'] > 0]
#     return missing_df


def impute_num_mode(df):
    """Fill missing values of every numeric column with the column mean.

    Despite the name, the fill value is the mean, not the mode. The frame is
    modified in place and nothing is returned.

    Args:
        df: Frame to impute (mutated).
    """
    numeric = df.select_dtypes(include='number')
    for col in numeric.columns[numeric.isnull().any()]:
        # Assign back rather than ``fillna(inplace=True)`` on a column
        # selection, which does not reliably write through to the frame.
        df[col] = df[col].fillna(df[col].mean())

def imput_cat_mode(df):
    """Fill missing values of every column with that column's mode.

    Every column containing a missing value is processed; when several
    values tie for most frequent, the first one returned by ``Series.mode``
    is used. Columns with no non-null value are left untouched. The frame is
    modified in place and nothing is returned.

    Args:
        df: Frame to impute (mutated).
    """
    # ``df.column`` (missing "s") raised AttributeError, and the mean is not
    # defined for categorical data, so the mode is used as the name says.
    for col in df.columns[df.isnull().any()].tolist():
        modes = df[col].mode()
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])
        
def apply_log(df, numeric_cols):
    """Shift the given columns to be >= 1 and take their natural log, in place.

    Each column is shifted so its minimum becomes 0 if that minimum is
    negative, then 1 is added, then the logarithm is applied. Nothing is
    returned.

    Args:
        df: Frame to transform (mutated).
        numeric_cols: Names of the numeric columns to transform.
    """
    cols = list(numeric_cols)
    for col in cols:
        min_val = df[col].min()
        if min_val < 0:
            df[col] -= min_val
        df[col] += 1
    if cols:
        # The result of ``apply(np.log)`` used to be discarded, so the log
        # was never applied; write it back into the frame.
        df[cols] = np.log(df[cols])

def scale_up_positive(df, numeric_cols):
    """Shift the given columns so that every value is at least 1, in place.

    A column whose smallest value is negative is first shifted so that value
    becomes 0; every column then has 1 added. Nothing is returned.

    Args:
        df: Frame to transform (mutated).
        numeric_cols: Names of the numeric columns to shift.
    """
    for name in numeric_cols:
        lowest = min(df[name].value_counts().index)
        if lowest < 0:
            df[name] -= lowest
        df[name] += 1
            
def remove_outliers(df, formula, repeat=1):
    """Repeatedly fit an OLS model and drop rows with a large Cook's distance.

    On every pass the model is fitted on the current frame and the rows whose
    Cook's distance exceeds ``4 / (n - k - 1)`` are dropped in place, where
    ``n`` is the number of distances and ``k`` the number of columns other
    than ``_price_doc``.

    Args:
        df: Frame to fit on (mutated: outlier rows are dropped). It must
            contain a ``_price_doc`` column.
        formula: Model formula passed to ``sm.OLS.from_formula``.
        repeat: Number of fit-and-drop passes.

    Returns:
        Tuple ``(df, model, result)`` with the model and fit of the last
        pass; both are ``None`` when ``repeat`` is 0.
    """
    # ``model`` used to be unbound when the loop never ran (repeat=0).
    model = None
    result = None
    for _ in range(repeat):
        model = sm.OLS.from_formula(formula, data=df)
        result = model.fit()
        influence = result.get_influence()
        distances, _pvalues = influence.cooks_distance
        threshold = 4 / (len(distances) - len(df.columns.drop(['_price_doc'])) - 1)
        # NOTE(review): positions in ``distances`` are used as row positions
        # of ``df``; this presumes the fit used every row -- confirm.
        outliers = [idx for idx, d in enumerate(distances) if d > threshold]
        df.drop(df.index[outliers], inplace=True)
    return df, model, result

def remove_features_by_vif(df):
    """Delete a fixed list of features from ``df``, in place.

    Judging by the function name, the list was selected by variance
    inflation factor. Listed columns that are not present are skipped.
    Nothing is returned.

    Args:
        df: Frame to modify (mutated).
    """
    # NOTE(review): 'build_count_1946-1970' is the hyphenated name; if
    # ``col_renames`` ran first the column is called 'build_count_1946_1970'
    # and is not removed here -- confirm the intended call order.
    features_to_remove = [
        'raion_popul',
        'preschool_education_centers_raion',
        'school_education_centers_raion',
        'sport_objects_raion',
        'office_raion',
        'young_all',
        'work_all',
        'ekder_all',
        '0_17_all',
        'raion_build_count_with_material_info',
        'raion_build_count_with_builddate_info',
        'build_count_1946-1970',
        'metro_min_avto',
        'metro_km_avto',
        'metro_min_walk',
        'school_km',
        'park_km',
        'railroad_station_walk_min',
        'railroad_station_avto_min',
        'ttk_km',
        'sadovoe_km',
        'bulvar_ring_km',
        'kremlin_km',
        'zd_vokzaly_avto_km',
        'bus_terminal_avto_km',
        'oil_chemistry_km',
        'nuclear_reactor_km',
        'radiation_km',
        'power_transmission_line_km',
        'thermal_power_plant_km',
        'ts_km',
        'swim_pool_km',
        'ice_rink_km',
        'stadium_km',
        'basketball_km',
        'detention_facility_km',
        'public_healthcare_km',
        'university_km',
        'workplaces_km',
        'shopping_centers_km',
        'preschool_km',
        'big_church_km',
        'mosque_km',
        'theater_km',
        'museum_km',
        'exhibition_km',
        'cafe_count_500',
        'cafe_sum_500_min_price_avg',
        'cafe_avg_price_500',
        'office_count_1000',
        'cafe_count_1000',
        'cafe_sum_1000_min_price_avg',
        'cafe_sum_1000_max_price_avg',
        'cafe_avg_price_1000',
        'cafe_count_1000_na_price',
        'cafe_count_1000_price_1000',
        'cafe_count_1000_price_1500',
        'office_count_1500',
        'cafe_count_1500',
        'cafe_sum_1500_max_price_avg',
        'cafe_avg_price_1500',
        'cafe_count_1500_na_price',
        'cafe_count_1500_price_500',
        'cafe_count_1500_price_1000',
        'cafe_count_1500_price_1500',
        'cafe_count_1500_price_2500',
        'cafe_count_1500_price_high',
        'leisure_count_1500',
        'sport_count_1500',
        'green_part_2000',
        'office_count_2000',
        'office_sqm_2000',
        'trc_count_2000',
        'cafe_count_2000',
        'cafe_sum_2000_min_price_avg',
        'cafe_sum_2000_max_price_avg',
        'cafe_avg_price_2000',
        'cafe_count_2000_na_price',
        'cafe_count_2000_price_500',
        'cafe_count_2000_price_1000',
        'cafe_count_2000_price_1500',
        'cafe_count_2000_price_2500',
        'cafe_count_2000_price_high',
        'sport_count_2000',
        'green_part_3000',
        'office_count_3000',
        'office_sqm_3000',
        'trc_count_3000',
        'cafe_count_3000',
        'cafe_count_3000_na_price',
        'cafe_count_3000_price_500',
        'cafe_count_3000_price_1000',
        'cafe_count_3000_price_1500',
        'cafe_count_3000_price_2500',
        'cafe_count_3000_price_4000',
        'cafe_count_3000_price_high',
        'big_church_count_3000',
        'church_count_3000',
        'leisure_count_3000',
        'sport_count_3000',
        'green_part_5000',
        'office_count_5000',
        'office_sqm_5000',
        'trc_count_5000',
        'trc_sqm_5000',
        'cafe_count_5000',
        'cafe_count_5000_na_price',
        'cafe_count_5000_price_500',
        'cafe_count_5000_price_1000',
        'cafe_count_5000_price_1500',
        'cafe_count_5000_price_2500',
        'cafe_count_5000_price_4000',
        'cafe_count_5000_price_high',
        'big_church_count_5000',
        'church_count_5000',
        'leisure_count_5000',
        'sport_count_5000',
        'market_count_5000',
        'avg_price_ID_metro',
        'avg_price_ID_railroad_station_walk',
        'avg_price_ID_big_road1',
        'avg_price_ID_big_road2',
        'avg_price_ID_railroad_terminal',
        'avg_price_ID_bus_terminal',
        'avg_price_sub_area',
    ]
    # The loop used to reference the undefined name ``df_train``; it must
    # operate on the frame that was passed in.
    for f in features_to_remove:
        if f in df:
            del df[f]

def scale_up_positive(df, numeric_cols):
    """Shift the given columns so that every value is at least 1, in place.

    A column whose minimum is negative is first shifted so the minimum
    becomes 0; every column then has 1 added. Nothing is returned.

    This is the second definition of this name in the module and therefore
    the one that is in effect after import.

    Args:
        df: Frame to transform (mutated).
        numeric_cols: Names of the numeric columns to shift.
    """
    for col in numeric_cols:
        # ``Series.min`` skips NaN and returns NaN for an all-NaN column,
        # where ``min(value_counts().index)`` raised ValueError on the empty
        # index (and sorted the counts only to read one value).
        min_val = df[col].min()
        if min_val < 0:
            df[col] -= min_val
        df[col] += 1

def remove_features_by_high_corr(df):
    """Delete a fixed list of population and distance features, in place.

    Judging by the function name, the list holds features that are highly
    correlated with others. Every listed column must be present. Nothing is
    returned.

    Args:
        df: Frame to modify (mutated).

    Raises:
        KeyError: If a listed column is missing; columns listed before it
            have already been deleted at that point.
    """
    redundant = (
        'children_preschool', 'children_school',
        'male_f', 'female_f',
        'young_male', 'young_female',
        'work_male', 'work_female',
        'ekder_male', 'ekder_female',
        '16_29_all',
        '0_6_all', '0_6_male', '0_6_female',
        '7_14_all', '7_14_male', '7_14_female',
        '0_17_male', '0_17_female',
        '16_29_male', '16_29_female',
        '0_13_all', '0_13_male', '0_13_female',
        'metro_km_walk',
        'railroad_station_walk_km',
        'railroad_station_avto_km',
        'public_transport_station_km',
    )
    for name in redundant:
        del df[name]
        
def find_missing_data_columns(df):
    """List the columns of ``df`` that contain missing values.

    Args:
        df: Frame to inspect (not modified).

    Returns:
        A frame with one row per affected column and the columns
        ``missing_column`` (column name) and ``missing_count`` (number of
        nulls). Row labels are the positions of the columns in ``df``.
    """
    report = df.isna().sum(axis=0).reset_index()
    report.columns = ['missing_column', 'missing_count']
    has_missing = report['missing_count'] > 0
    return report[has_missing]

def imput_by_interpolate(df):
    """Fill missing values of numeric columns by linear interpolation.

    Only numeric columns that contain missing values are touched;
    non-numeric columns are skipped because linear interpolation is not
    defined for them.

    Args:
        df: Frame to impute (mutated).

    Returns:
        The same frame object.
    """
    for col in df.columns[df.isnull().any()]:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        # The keyword used to be misspelled ``mathod``.
        df[col] = df[col].interpolate(method='linear')
    return df

def impute_by_regression(df, repeat, corr_thresh):
    """Fill missing numeric values from a simple regression on a correlated column.

    For every numeric column with missing values, the other numeric columns
    whose absolute correlation with it is at least ``corr_thresh`` are
    taken, strongest first. For each such pair a one-predictor OLS model
    with intercept is fitted on the rows where both values are present, and
    its predictions fill the remaining gaps. Weaker predictors are used only
    while the column still has missing values.

    Args:
        df: Input frame. It is not modified; a copy is imputed.
        repeat: Number of passes over the frame.
        corr_thresh: Minimum absolute correlation for a predictor column.

    Returns:
        A new frame with the imputed values.
    """
    # Work on a copy so that writing the filled column back does not touch
    # the caller's frame.
    df = df.copy()
    for _ in range(repeat):
        pairs = []
        numeric_cols = df._get_numeric_data().columns
        missing_df = find_missing_data_columns(df)
        for missing_col in missing_df['missing_column']:
            # Test the dtype, not the Series object itself.
            if not np.issubdtype(df[missing_col].dtype, np.number):
                continue
            corrs = [
                (missing_col, c, abs(df[missing_col].corr(df[c])))
                for c in numeric_cols if c != missing_col
            ]
            # Filter before sorting: a NaN correlation fails the comparison
            # and is dropped here, instead of disturbing the sort order and
            # ending the scan early.
            strong = [item for item in corrs if item[2] >= corr_thresh]
            strong.sort(key=lambda item: item[2], reverse=True)
            pairs.extend(strong)

        for nan_col, high_corr_col, _corr in pairs:
            if df[nan_col].isnull().sum() <= 0:
                continue

            # Rows where both predictor and target are observed.
            observed = df[[high_corr_col, nan_col]].dropna()
            X = sm.add_constant(observed[[high_corr_col]], has_constant='add')
            result = sm.OLS(observed[nan_col], X).fit()

            dfX = sm.add_constant(df[[high_corr_col]], has_constant='add')
            predicted = result.predict(dfX)

            # Assign back rather than ``fillna(inplace=True)`` on a column
            # selection, which does not reliably write through.
            df[nan_col] = df[nan_col].fillna(predicted)
    return df

def impute_by_regression2(df, repeat, corr_thresh):
    """Variant of ``impute_by_regression`` that differs in the response column.

    On each of ``repeat`` passes, every column with missing values is paired
    with the numeric columns whose absolute correlation with it is at least
    ``corr_thresh``; an OLS model is fitted per pair and its predictions
    fill the missing values of the first column.

    NOTE(review): the response ``y`` below is taken from column 1 of the
    design matrix, which is also the second column of ``X``. Assuming
    ``sm.add_constant`` prepends the constant, that is the predictor column
    itself, so the fitted values would simply reproduce the predictor.
    Confirm whether this is intended or whether index 2 was meant.

    Args:
        df: Input frame.
        repeat: Number of passes.
        corr_thresh: Minimum absolute correlation for a predictor column.

    Returns:
        The frame after imputation (a new object once any merge has run).
    """
    for i in range(repeat):
        pairs = []
        missing_df = find_missing_data_columns(df)
        for missing_col in missing_df['missing_column']:
            # NOTE(review): a Series (not its dtype) is passed to np.issubdtype -- confirm this works on the installed NumPy.
            if not np.issubdtype(df[missing_col], np.number) : continue
            corrs = [ (missing_col, c, abs(df[missing_col].corr(df[c]))) for c in df._get_numeric_data().columns if c != missing_col ]
            # Strongest correlation first; the scan below stops at the first
            # value under the threshold.
            corrs.sort(key=lambda item : item[2], reverse=True)
            for item in corrs:
                if item[2] >= corr_thresh:
                    pairs.append(item)
                else:
                    break
        df_nan_col_with_high_corr_col = pd.DataFrame(pairs, columns=['missing_col', 'highest corr with', 'corr'])
        for row in df_nan_col_with_high_corr_col.iterrows():
            # Skip pairs whose target column has no missing values left.
            if df[row[1][0]].isnull().sum() <= 0 : continue
            nan_col = row[1][0]
            high_corr_col = row[1][1]
            # NOTE(review): this reads position 1 (the column name), not the
            # correlation at position 2; the variable is not used afterwards.
            corr = row[1][1]
            
            # Keep only the rows where both columns are present.
            df_temp = pd.DataFrame(df[[high_corr_col, nan_col]], columns=[high_corr_col, nan_col])
            df_temp = df_temp.dropna()
            
            # presumably columns are now [const, high_corr_col, nan_col] -- TODO confirm
            df_temp = sm.add_constant(df_temp)
            X = df_temp.values[:, :2]
            y = df_temp.values[:, 1]
            result = sm.OLS(y, X).fit()
            
            dfX = sm.add_constant(df[high_corr_col])
            predicted = result.predict(dfX)
            
            # Attach the predictions by index, fill the gaps from them, then
            # drop the helper column again.
            df = pd.merge(df, predicted.to_frame('predicted'), left_index=True, right_index=True)
            df[nan_col].fillna(df['predicted'], inplace=True)
            del df['predicted']
    return df
            


Writing utils/preprocessing.py


In [6]:
%%writefile utils/feature_selection.py

import statsmodels.api as sm
import numpy as np

def by_f_test(df, formula, repeat=10, log_dv = True):
    """Iteratively keep only the terms that are significant in a type-2 ANOVA.

    On each pass an OLS model is fitted from ``formula``, the terms with an
    F-test p-value below 0.05 are selected, and the formula is rebuilt from
    them with ``_price_doc`` (or its log) as the dependent variable. The
    loop stops early when no term is significant.

    Args:
        df: Frame holding the model variables, including ``_price_doc``.
        formula: Initial model formula.
        repeat: Maximum number of passes.
        log_dv: If true, rebuilt formulas use ``np.log(_price_doc)`` as the
            dependent variable, otherwise ``_price_doc``.

    Returns:
        Tuple ``(result, selected_ivs, formula)``: the last fit (``None``
        when ``repeat`` is 0), the terms selected on the last pass, and the
        formula rebuilt from them. When a pass selects no term, ``formula``
        is the formula that pass was fitted with.
    """
    result = None
    selected_ivs = []
    for _ in range(repeat):
        model = sm.OLS.from_formula(formula, data=df)
        result = model.fit()
        anova = sm.stats.anova_lm(result, typ=2)
        # Select by column label rather than by position in the row.
        selected_ivs = anova.index[anova['PR(>F)'] < 0.05].tolist()
        # The old guard ``len(selected_ivs) >= 0`` was always true, so an
        # empty selection produced a formula with an empty right-hand side.
        if not selected_ivs:
            break
        dv = 'np.log(_price_doc)' if log_dv else '_price_doc'
        formula = dv + ' ~ ' + ' + '.join(selected_ivs)
    # Always a 3-tuple; the old unreachable branch returned only two values.
    return result, selected_ivs, formula

Writing utils/feature_selection.py


In [7]:
%%writefile utils/statsmodel_helper.py

import statsmodels as sm

def make_statsmodels_ols_formula(numeric_ivs, categorical_ivs, dv, log_vs=[], degree=1, scale=True):
    """Build a formula string for a statsmodels OLS model.

    Terms appear on the right-hand side in this order: categorical
    variables, numeric variables, then polynomial terms by ascending power.

    Args:
        numeric_ivs: Names of the numeric independent variables.
        categorical_ivs: Names of the categorical independent variables,
            each wrapped in ``C(...)``. When there are no numeric variables
            each is emitted as ``C(name)-1``.
        dv: Name of the dependent variable.
        log_vs: Names (independent or dependent) to wrap in ``np.log(...)``.
            The default list is only read, never modified.
        degree: Highest polynomial power; powers 2..degree of every numeric
            variable are added as ``I(x**p)`` terms.
        scale: If true, numeric and polynomial terms are wrapped in
            ``scale(...)``.

    Returns:
        The formula string, e.g. ``"y ~ C(c) + scale(x)"``.
    """
    numeric_terms = [
        "np.log({})".format(iv) if iv in log_vs else iv for iv in numeric_ivs
    ]

    # Every power from 2 up to ``degree`` contributes terms. The old loop
    # overwrote the list on each pass, so only the highest power survived.
    poly_template = 'scale(I({}**{}))' if scale else 'I({}**{})'
    polynomials = [
        poly_template.format(term, power)
        for power in range(2, degree + 1)
        for term in numeric_terms
    ]

    if scale:
        numeric_terms = ["scale({})".format(term) for term in numeric_terms]

    lhs = 'np.log({})'.format(dv) if dv in log_vs else '{}'.format(dv)

    cat_template = 'C({})' if numeric_terms else 'C({})-1'
    categorical_terms = [cat_template.format(iv) for iv in categorical_ivs]

    # Joining one flat list avoids the dangling " + " that used to appear
    # when categorical variables were given without numeric ones.
    rhs = categorical_terms + numeric_terms + polynomials
    return lhs + ' ~ ' + ' + '.join(rhs)

Writing utils/statsmodel_helper.py


In [8]:
%%writefile utils/error_calculator.py
from sklearn.metrics import make_scorer
import numpy as np
  
def rmsle(pred, real):
    """Return the root mean squared logarithmic error of ``pred`` against ``real``.

    Computes ``sqrt(mean((log(1 + pred) - log(1 + real)) ** 2))``.

    Args:
        pred: Predicted values (pandas Series, NumPy array or sequence).
        real: Actual values, same length as ``pred``.

    Returns:
        The error as a float.
    """
    # ``np.asarray`` accepts Series, arrays and plain sequences, whereas the
    # former ``.values`` access failed on anything but pandas objects.
    predicted_values = np.asarray(pred, dtype=float)
    actual_values = np.asarray(real, dtype=float)

    # log1p(x) is log(1 + x), computed accurately for small x.
    log_difference = np.log1p(predicted_values) - np.log1p(actual_values)

    return np.sqrt(np.mean(np.square(log_difference)))

Writing utils/error_calculator.py


In [9]:
%%writefile utils/correlation.py
import numpy as np

def pick_highly_correlated_features(df, columns, min_corr):
    """Find, for each given column, the numeric columns strongly correlated with it.

    Args:
        df: Frame holding the data (not modified).
        columns: Names of the columns to examine; non-numeric ones are
            skipped.
        min_corr: Absolute correlation a pair must exceed to be reported.

    Returns:
        A frame with the columns ``missing_col`` (examined column),
        ``highest corr with`` (partner column) and ``corr`` (absolute
        correlation), ordered by examined column and then by descending
        correlation.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    # Local import: this module only imports numpy at file level, so the
    # former bare ``pd`` reference raised NameError.
    import pandas as pd

    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    pairs = []
    for col in columns:
        series = df[col]  # raises KeyError for an unknown column
        if col not in numeric_cols:
            continue
        # Only numeric partners are correlated. A NaN correlation fails the
        # comparison and is dropped here rather than disturbing the sort.
        strong = []
        for other in numeric_cols:
            if other == col:
                continue
            value = abs(series.corr(df[other]))
            if value > min_corr:
                strong.append((col, other, value))
        strong.sort(key=lambda item: item[2], reverse=True)
        pairs.extend(strong)
    return pd.DataFrame(pairs, columns=['missing_col', 'highest corr with', 'corr'])

def pick_highly_correlated_IVs(df, target_col, min_corr, min_unique_values = 0):
    """List the numeric columns strongly correlated with ``target_col``.

    Args:
        df: Frame holding the data (not modified).
        target_col: Name of the numeric column to correlate against.
        min_corr: Absolute correlation a column must exceed to be returned.
        min_unique_values: Columns with fewer distinct non-null values than
            this are skipped.

    Returns:
        A list of ``(column_name, absolute_correlation)`` tuples in column
        order.

    Raises:
        TypeError: If ``target_col`` is not numeric.
    """
    if not np.issubdtype(df[target_col].dtype, np.number):
        # The exception used to be constructed but never raised, so the
        # failure only surfaced later as a KeyError from ``drop``.
        raise TypeError('{}은 numeric data가 아닙니다.'.format(target_col))

    corrs = []
    candidates = df.select_dtypes(include=['number', 'bool']).drop(target_col, axis=1)
    for col in candidates.columns:
        if df[col].nunique() < min_unique_values:
            continue
        corr = abs(df[target_col].corr(df[col]))
        if corr > min_corr:
            corrs.append((col, corr))

    return corrs


Writing utils/correlation.py


In [10]:
!tree utils

[01;34mutils[00m
├── correlation.py
├── error_calculator.py
├── feature_selection.py
├── __init__.py
├── preprocessing.py
└── statsmodel_helper.py

0 directories, 6 files
