# 준비

## 라이브러리

In [20]:
import pandas as pd
import numpy as np
import scipy as sp

import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

pd.options.display.max_columns = 50 # 칼럼수 50개까지

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization

from tqdm import tqdm

RANDOM_SEED = 42

## 데이터

In [2]:
train_df = pd.read_csv("kaggle-kakr-housing-data/train.csv")
test_df = pd.read_csv("kaggle-kakr-housing-data/test.csv")
sub = pd.read_csv("kaggle-kakr-housing-data/sample_submission.csv")

1. ID : 집을 구분하는 번호
2. date : 집을 구매한 날짜
3. price : 타겟 변수인 집의 가격
4. bedrooms : 침실의 수
5. bathrooms : 침실당 화장실 개수
6. sqft_living : 주거 공간의 평방 피트
7. sqft_lot : 부지의 평방 피트
8. floors : 집의 층수
9. waterfront : 집의 전방에 강이 흐르는지 유무 (a.k.a. 리버뷰)
10. view : 집이 얼마나 좋아 보이는지의 정도
11. condition : 집의 전반적인 상태
12. grade : King County grading 시스템 기준으로 매긴 집의 등급
13. sqft_above : 지하실을 제외한 평방 피트
14. sqft_basement : 지하실의 평방 피트
15. yr_built : 집을 지은 년도
16. yr_renovated : 집을 재건축한 년도
17. zipcode : 우편번호
18. lat : 위도
19. long : 경도
20. sqft_living15 : 근처 15 가구의 주거 공간, 평방 피트
21. sqft_lot15 : 근처 15가구의 부지, 평방 피트

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             15035 non-null  int64  
 1   date           15035 non-null  object 
 2   price          15035 non-null  float64
 3   bedrooms       15035 non-null  int64  
 4   bathrooms      15035 non-null  float64
 5   sqft_living    15035 non-null  int64  
 6   sqft_lot       15035 non-null  int64  
 7   floors         15035 non-null  float64
 8   waterfront     15035 non-null  int64  
 9   view           15035 non-null  int64  
 10  condition      15035 non-null  int64  
 11  grade          15035 non-null  int64  
 12  sqft_above     15035 non-null  int64  
 13  sqft_basement  15035 non-null  int64  
 14  yr_built       15035 non-null  int64  
 15  yr_renovated   15035 non-null  int64  
 16  zipcode        15035 non-null  int64  
 17  lat            15035 non-null  float64
 18  long  

## helper

In [2]:
def show_price_corr(data):
    df = data
    df = df[~df["price"].isna()]
    cor_abs = abs(df.corr(method='spearman')) 
    cor_cols = cor_abs.nlargest(n=10, columns='price').index
    cor = np.array(sp.stats.spearmanr(df[cor_cols].values))[0] # 10 x 10
    print(cor_cols.values)
    plt.figure(figsize=(8,8))
    sns.set(font_scale=1)
    sns.heatmap(cor, fmt='.2f', annot=True, square=True , annot_kws={'size' : 6} ,xticklabels=cor_cols.values, yticklabels=cor_cols.values)
    plt.show()

In [3]:
def rmse_exp(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.expm1(y_true), np.expm1(y_pred)))

In [4]:
def TTS(data, do_ohe=True):
    df = data.drop(['id','price','data'], axis=1).copy()
    cat_cols = df.select_dtypes('object').columns
    for col in cat_cols:
        if do_ohe:
            ohe_df = pd.get_dummies(df[[col]], prefix='ohe_'+col)
            df.drop(col, axis=1, inplace=True)
            df = pd.concat([df, ohe_df], axis=1)
        else:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])

    train_len = data[data['data'] == 'train'].shape[0]
    X_train = df.iloc[:train_len]
    X_test = df.iloc[train_len:]
    y_train = data[data['data'] == 'train']['price']
    
    return X_train, X_test, y_train

def get_oof_lgb(X_train, y_train, X_test, lgb_param, verbose_eval=False, return_cv_score_only=False):

    folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    feature_importance_df = pd.DataFrame()
    params = {'verbose': -1}
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, y_train.values)):
        if verbose_eval > 0: print(f'Fold : {fold_ + 1}')
        trn_data = lgb.Dataset(X_train.iloc[trn_idx], label=y_train.iloc[trn_idx], params=params)
        val_data = lgb.Dataset(X_train.iloc[val_idx], label=y_train.iloc[val_idx])

        num_round = 100000
        clf = lgb.train(lgb_param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        verbose_eval=verbose_eval, early_stopping_rounds=200)
        oof[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration)
        predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        
        cv_fold_score = rmse_exp(y_train.iloc[val_idx], oof[val_idx])
        
        if verbose_eval > 0: print(f'Fold {fold_ + 1} / CV-Score: {cv_fold_score:.6f}')
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = X_train.columns.tolist()
        fold_importance_df['importance'] = clf.feature_importance('gain')
        fold_importance_df['fold'] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    cv_score = rmse_exp(y_train, oof)
    print(f'CV-Score: {cv_score:.6f}')
    if return_cv_score_only: return cv_score
    else: return oof, predictions, cv_score, feature_importance_df
    
def plot_feature_importance(fi_df, num_feature=20):
    cols = (fi_df[['feature', 'importance']]
            .groupby('feature')
            .mean()
            .sort_values(by='importance', ascending=False)[:num_feature].index)
    best_features = fi_df.loc[fi_df.feature.isin(cols)]

    sns.barplot(x='importance', y='feature', data=best_features.sort_values(by='importance', ascending=False))
    plt.title('Feature Importances (averaged over folds)')
    plt.tight_layout()
    plt.show()
    
def plot_numeric_for_regression(df, field, target_field='price'):
    df = df[df[field].notnull()]

    fig = plt.figure(figsize = (16, 7))
    ax1 = plt.subplot(121)
    
    sns.distplot(df[df['data'] == 'train'][field], label='Train', hist_kws={'alpha': 0.5}, ax=ax1)
    sns.distplot(df[df['data'] == 'test'][field], label='Test', hist_kws={'alpha': 0.5}, ax=ax1)

    plt.xlabel(field)
    plt.ylabel('Density')
    plt.legend()
    
    ax2 = plt.subplot(122)
    
    df_copy = df[df['data'] == 'train'].copy()

    sns.scatterplot(x=field, y=target_field, data=df_copy, ax=ax2)
    
    plt.show()
    
def plot_categorical_for_regression(df, field, target_field='price', show_missing=True, missing_value='NA'):
    df_copy = df.copy()
    if show_missing: df_copy[field] = df_copy[field].fillna(missing_value)
    df_copy = df_copy[df_copy[field].notnull()]

    ax1_param = 121
    ax2_param = 122
    fig_size = (16, 7)
    if df_copy[field].nunique() > 30:
        ax1_param = 211
        ax2_param = 212
        fig_size = (16, 10)
    
    fig = plt.figure(figsize = fig_size)
    ax1 = plt.subplot(ax1_param)
    
    sns.countplot(x=field, hue='data', order=np.sort(df_copy[field].unique()), data=df_copy)
    plt.xticks(rotation=90, fontsize=11)
    
    ax2 = plt.subplot(ax2_param)
    
    df_copy = df_copy[df_copy['data'] == 'train']

    sns.boxplot(x=field, y=target_field, data=df_copy, order=np.sort(df_copy[field].unique()), ax=ax2)
    plt.xticks(rotation=90, fontsize=11)
    
    plt.show()
    
def load_original_data():
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')

    train_copy = train.copy()
    train_copy['data'] = 'train'
    test_copy = test.copy()
    test_copy['data'] = 'test'
    test_copy['price'] = np.nan

    # remove outlier
    train_copy = train_copy[~((train_copy['sqft_living'] > 12000) & (train_copy['price'] < 3000000))].reset_index(drop=True)

    # concat train, test data to preprocess
    data = pd.concat([train_copy, test_copy], sort=False).reset_index(drop=True)
    data = data[train_copy.columns]

    data.drop('date', axis=1, inplace=True)
    data['zipcode'] = data['zipcode'].astype(str)

    # fix skew feature
    skew_columns = ['price']

    for c in skew_columns:
        data[c] = np.log1p(data[c])
        
    return data

# 전처리

In [164]:
# 데이터 합치기
train_df["data"]="train"
test_df["data"]="test"
df = pd.concat([train_df, test_df], axis=0)
df.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,data
0,0,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,train
1,1,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,train
2,2,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,train


In [165]:
# price
df["price"] = df["price"].apply(lambda x: np.log1p(x))

In [166]:
# date
df["date"] = df["date"].apply(lambda x: str(x)[:8])
df["year"] = df["date"].apply(lambda x: x[:4])
df["month"] = df["date"].apply(lambda x: int(x[4:6]))

In [167]:
# backup
df = df.reset_index(drop=True)
original_df = df.copy()

In [175]:
# basemodel
X_train, X_test, y_train = TTS(df)
print(X_train.shape, X_test.shape)

lgb_param = {
    'objective': 'regression',
    'learning_rate': 0.05,
    'num_leaves': 15,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'feature_fraction': 0.7,
    'seed': RANDOM_SEED,
    'metric': ['rmse'],
    'verbose': -1,
}

oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

(15035, 394) (6468, 394)
CV-Score: 119571.897666


In [178]:
# live / lot False
df = original_df.copy()
df["live_per_lot"] = df["sqft_living"] / df["sqft_lot"]
X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 119571.897666


In [185]:
# renovated
df = original_df.copy()
last_built = []
for idx, row in df.iterrows():
    if row["yr_built"] < row["yr_renovated"]:
        last_built.append(row["yr_renovated"])
    else:
        last_built.append(row["yr_built"])
       
df["last_built"] = last_built
df = df.drop(["yr_built","yr_renovated"], axis=1)
X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 118434.275475


In [177]:
# yr_built
df = original_df.copy()
df["yr_built"] = 2015-df["yr_built"]
X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 117817.094319


In [54]:
# zipcode seperate
df = original_df.copy()

df["zipcode"] = df["zipcode"].apply(lambda x: str(x)[2:])
df["zipcode-1"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[:1])
df["zipcode-2"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[1:2])
df["zipcode-3"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[2:])

X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 116842.144618


In [55]:
# zipcode combination
df["zipcode-12"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[:2])
df["zipcode-13"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[:1]+str(x)[2:])
df["zipcode-23"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[1:])

X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 116415.573613


In [53]:
# lat long - PCA component:1 <=
df = original_df.copy()

coord = df[['lat','long']]
pca = PCA(n_components=1)
pca.fit(coord)
coord_pca = pca.transform(coord)
df['coord_pca1'] = coord_pca[:, 0]

X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 116249.246033


In [52]:
# lat long - PCA component: 2 => False
df = original_df.copy()

coord = df[['lat','long']]
pca = PCA(n_components=2)
pca.fit(coord)
coord_pca = pca.transform(coord)
df['coord_pca1'] = coord_pca[:, 0]
df['coord_pca2'] = coord_pca[:, 1]

X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 117021.270829


In [65]:
# lat long - Kmeans K:39 <=
df = original_df.copy()

k_range = range(3, 50)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(coord)
    coord_cluster = kmeans.predict(coord)
    df['coord_cluster'] = coord_cluster
    df['coord_cluster'] = df['coord_cluster'].map(lambda x: str(x).rjust(2, '0'))
    
    X_train, X_test, y_train = TTS(df)

    print('K :', k)
    get_oof_lgb(X_train, y_train, X_test, lgb_param)
    print()

K : 3
CV-Score: 115380.173709

K : 4
CV-Score: 116272.473179

K : 5
CV-Score: 116677.051501

K : 6
CV-Score: 116172.262792

K : 7
CV-Score: 114936.908439

K : 8
CV-Score: 115856.724413

K : 9
CV-Score: 116398.236962

K : 10
CV-Score: 115772.735695

K : 11
CV-Score: 118542.069559

K : 12
CV-Score: 116696.261773

K : 13
CV-Score: 117202.111614

K : 14
CV-Score: 116946.552685

K : 15
CV-Score: 117844.281331

K : 16
CV-Score: 118026.728299

K : 17
CV-Score: 117036.451056

K : 18
CV-Score: 116229.235762

K : 19
CV-Score: 115908.161469

K : 20
CV-Score: 116367.811538

K : 21
CV-Score: 116443.820134

K : 22
CV-Score: 116884.102800

K : 23
CV-Score: 116997.806480

K : 24
CV-Score: 116546.456531

K : 25
CV-Score: 117445.533713

K : 26
CV-Score: 116033.046960

K : 27
CV-Score: 115849.854419

K : 28
CV-Score: 117088.229659

K : 29
CV-Score: 116761.047245

K : 30
CV-Score: 117584.581981

K : 31
CV-Score: 116544.435343

K : 32
CV-Score: 116197.244265

K : 33
CV-Score: 115619.382725

K : 34
CV-Score

In [138]:
# haversine distance 
def haversine_array(lat1, lng1, lat2, lng2): 
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2)) 
    AVG_EARTH_RADIUS = 6371 # in km 
    lat = lat2 - lat1 
    lng = lng2 - lng1 
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2 
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d)) 
    return h

In [198]:
# distence 0.5
df = original_df.copy()
last_built = []
for idx, row in df.iterrows():
    if row["yr_built"] < row["yr_renovated"]:
        last_built.append(row["yr_renovated"])
    else:
        last_built.append(row["yr_built"])

df["last_built"] = last_built
df = df.drop(["yr_built","yr_renovated"], axis=1)

count10_list=[]
condi10_list=[]
grad10_list=[]
yr_built10_list = []

print("distance : ",dist*2)
for i in tqdm(range(len(df))):
    lat2 = df['lat'].values
    long2 = df['long'].values

    lat1 = df.loc[i, 'lat'] # id = 0 house lat
    long1 = df.loc[i, 'long'] # id = 0 house long
    dist_arr = haversine_array(lat1, long1, lat2, long2)
    neighbor_df = pd.DataFrame({
        'id': np.tile(np.array([df.loc[i, 'id']]), df.shape[0]),
        'neighbor_id': df['id'],
        'neighbor_lat': lat2,
        'neighbor_long': long2,
        'distance': dist_arr,
    })

    condi10_list.append(df["condition"][neighbor_df["distance"]<0.5].mean())
    grad10_list.append(df["grade"][neighbor_df["distance"]<0.5].mean())
    yr_built10_list.append(df["condition"][neighbor_df["distance"]<0.5].mean())
    count10_list.append(sum(neighbor_df["distance"]<=0.5))
df[f"count_1km"]=count10_list
df[f"conditon_1km"]=condi10_list
df[f"grade_1km"]=grad10_list
df[f"yr_built_1km"]=yr_built10_list

distance :  10.0


100%|████████████████████████████████████████████████████████████████████████████| 21503/21503 [00:58<00:00, 366.60it/s]


In [206]:
# 1km house count
temp = df.drop(["yr_built_1km","conditon_1km","grade_1km"],axis=1)
X_train, X_test, y_train = TTS(temp)

oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 117639.268041


In [203]:
# 1km yr_lastbuilt mean => False
temp = df.drop(["count_1km","conditon_1km","grade_1km"],axis=1)
X_train, X_test, y_train = TTS(temp)

oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 120004.360741


In [204]:
# 1km grade mean
temp = df.drop(["count_1km","conditon_1km","yr_built_1km"],axis=1)
X_train, X_test, y_train = TTS(temp)

oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 116922.818722


In [205]:
# 1km condition mean => False
temp = df.drop(["count_1km","yr_built_1km","grade_1km"],axis=1)
X_train, X_test, y_train = TTS(temp)

oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 120004.360741


In [207]:
distance_1km = df[["count_1km","grade_1km"]].copy()

In [212]:
# all

# last_built
df = original_df.copy()
last_built = []
for idx, row in df.iterrows():
    if row["yr_built"] < row["yr_renovated"]:
        last_built.append(row["yr_renovated"])
    else:
        last_built.append(row["yr_built"])
       
df["last_built"] = last_built
df = df.drop(["yr_built","yr_renovated"], axis=1)

# 2015 - built 
df["last_built"] = 2015-df["last_built"]

# zipcode
df["zipcode"] = df["zipcode"].apply(lambda x: str(x)[2:])
df["zipcode-1"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[:1])
df["zipcode-2"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[1:2])
df["zipcode-3"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[2:])
df["zipcode-12"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[:2])
df["zipcode-13"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[:1]+str(x)[2:])
df["zipcode-23"] = df["zipcode"].apply(lambda x: "zip_"+str(x)[1:])

# lat long : PCA
coord = df[['lat','long']]
pca = PCA(n_components=1)
pca.fit(coord)
coord_pca = pca.transform(coord)
df['coord_pca1'] = coord_pca[:, 0]

# lat long : K means
kmeans = KMeans(n_clusters=39, random_state=RANDOM_SEED).fit(coord)
coord_cluster = kmeans.predict(coord)
df['coord_cluster'] = coord_cluster
df['coord_cluster'] = df['coord_cluster'].map(lambda x: str(x).rjust(2, '0'))

# distance 1km : haversine distance 
df["count_1km"]=distance_1km["count_1km"]
df["grade_1km"]=distance_1km["grade_1km"]

In [215]:
df.to_csv("kaggle-kakr-housing-data/prep_df.csv",index=False)

In [213]:
# all
X_train, X_test, y_train = TTS(df)
oof, pred, cv_score, fi_df = get_oof_lgb(X_train, y_train, X_test, lgb_param)

CV-Score: 115337.970625


# 모델링

In [6]:
df = pd.read_csv("kaggle-kakr-housing-data/prep_df.csv")
X_train, X_test, y_train = TTS(df)

In [23]:
pbounds = { 'learning_rate': (0.0005, 0.2),
            'n_estimators': (16, 1024),
            'max_depth': (3,10),   
            'subsample': (0.5,1), 
            'colsample_bytree': (0.5,1),   
            'num_leaves': (2,16),
            'min_child_weight': (1, 10)}


def lgbm_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, num_leaves, min_child_weight):

    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree' : colsample_bytree,
        'num_leaves' : int(round(num_leaves)),
        'min_child_weight' : min_child_weight,
        'n_jobs' : -1
    }
    
    lgbm = lgb.LGBMRegressor(**params)
    
    kf = KFold(n_splits=5 , shuffle=True, random_state=50)

    score = cross_val_score(lgbm, X_train, y_train, scoring='neg_mean_squared_error', cv=kf, n_jobs=-1)
    
    return np.mean(score)
                   
BO_lgbm = BayesianOptimization(f = lgbm_opt, pbounds = pbounds, random_state=RANDOM_SEED, verbose=2) 

BO_lgbm.maximize(init_points=10, n_iter = 200)

BO_lgbm.max

params = BO_lgbm.max['params']
params['max_depth'] = int(round(params['max_depth']))
params['n_estimators'] = int(round(params['n_estimators']))
params['num_leaves'] = int(round(params['num_leaves']))

lgbm = lgb.LGBMRegressor(seed = RANDOM_SEED, **params)
lgbm.fit(X_train, y_train)


pred_lgbm = np.expm1(lgbm.predict(X_test))

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.03241 [0m | [0m0.6873   [0m | [0m0.1902   [0m | [0m8.124    [0m | [0m6.388    [0m | [0m173.3    [0m | [0m4.184    [0m | [0m0.529    [0m |
| [95m2        [0m | [95m-0.02667 [0m | [95m0.9331   [0m | [95m0.1204   [0m | [95m7.957    [0m | [95m1.185    [0m | [95m993.7    [0m | [95m13.65    [0m | [95m0.6062   [0m |
| [0m3        [0m | [0m-0.03252 [0m | [0m0.5909   [0m | [0m0.03709  [0m | [0m5.13     [0m | [0m5.723    [0m | [0m451.4    [0m | [0m6.077    [0m | [0m0.8059   [0m |
| [0m4        [0m | [0m-0.02819 [0m | [0m0.5697   [0m | [0m0.05878  [0m | [0m5.565    [0m | [0m5.105    [0m | [0m807.5    [0m | [0m4.795    [0m | [0m0.7571   [0m |
| [0m5        [0m | [0m-0.1208  [0m | 

| [0m45       [0m | [0m-0.02688 [0m | [0m0.8431   [0m | [0m0.2      [0m | [0m10.0     [0m | [0m10.0     [0m | [0m720.9    [0m | [0m7.895    [0m | [0m0.8434   [0m |
| [0m46       [0m | [0m-0.02661 [0m | [0m0.5      [0m | [0m0.1249   [0m | [0m10.0     [0m | [0m10.0     [0m | [0m348.3    [0m | [0m16.0     [0m | [0m0.8676   [0m |
| [0m47       [0m | [0m-0.03395 [0m | [0m0.8211   [0m | [0m0.2      [0m | [0m3.0      [0m | [0m10.0     [0m | [0m629.4    [0m | [0m2.0      [0m | [0m1.0      [0m |
| [0m48       [0m | [0m-0.02764 [0m | [0m0.7243   [0m | [0m0.1906   [0m | [0m9.963    [0m | [0m1.038    [0m | [0m621.1    [0m | [0m15.57    [0m | [0m0.6246   [0m |
| [0m49       [0m | [0m-0.02985 [0m | [0m0.7157   [0m | [0m0.0251   [0m | [0m9.899    [0m | [0m1.08     [0m | [0m390.5    [0m | [0m14.7     [0m | [0m0.6175   [0m |
| [0m50       [0m | [0m-0.03771 [0m | [0m1.0      [0m | [0m0.1584   [0m | [0m10.0

| [0m90       [0m | [0m-0.02662 [0m | [0m0.9692   [0m | [0m0.09848  [0m | [0m4.928    [0m | [0m6.57     [0m | [0m851.2    [0m | [0m5.668    [0m | [0m0.9325   [0m |
| [0m91       [0m | [0m-0.02638 [0m | [0m0.9235   [0m | [0m0.08543  [0m | [0m7.914    [0m | [0m6.077    [0m | [0m838.8    [0m | [0m8.278    [0m | [0m0.9118   [0m |
| [0m92       [0m | [0m-0.02698 [0m | [0m0.6903   [0m | [0m0.1562   [0m | [0m5.514    [0m | [0m8.97     [0m | [0m817.0    [0m | [0m13.14    [0m | [0m0.7186   [0m |
| [0m93       [0m | [0m-0.02772 [0m | [0m1.0      [0m | [0m0.1679   [0m | [0m10.0     [0m | [0m3.258    [0m | [0m805.6    [0m | [0m16.0     [0m | [0m0.9561   [0m |
| [95m94       [0m | [95m-0.02621 [0m | [95m0.6106   [0m | [95m0.09897  [0m | [95m3.342    [0m | [95m8.65     [0m | [95m829.5    [0m | [95m15.59    [0m | [95m0.884    [0m |
| [0m95       [0m | [0m-0.02684 [0m | [0m0.894    [0m | [0m0.08103  [0m |

| [0m135      [0m | [0m-0.03377 [0m | [0m0.5      [0m | [0m0.2      [0m | [0m3.0      [0m | [0m1.0      [0m | [0m658.1    [0m | [0m2.0      [0m | [0m0.5      [0m |
| [0m136      [0m | [0m-0.2363  [0m | [0m1.0      [0m | [0m0.0005   [0m | [0m10.0     [0m | [0m1.0      [0m | [0m571.9    [0m | [0m2.0      [0m | [0m1.0      [0m |
| [0m137      [0m | [0m-0.02663 [0m | [0m1.0      [0m | [0m0.2      [0m | [0m3.0      [0m | [0m9.22     [0m | [0m741.8    [0m | [0m16.0     [0m | [0m0.5      [0m |
| [0m138      [0m | [0m-0.1015  [0m | [0m0.8237   [0m | [0m0.002931 [0m | [0m9.689    [0m | [0m5.662    [0m | [0m423.0    [0m | [0m8.217    [0m | [0m0.5088   [0m |
| [0m139      [0m | [0m-0.02669 [0m | [0m0.7459   [0m | [0m0.1912   [0m | [0m3.366    [0m | [0m1.816    [0m | [0m723.8    [0m | [0m6.795    [0m | [0m0.7454   [0m |
| [0m140      [0m | [0m-0.0267  [0m | [0m0.6304   [0m | [0m0.06885  [0m | [0m3.20

| [0m180      [0m | [0m-0.02623 [0m | [0m0.5      [0m | [0m0.2      [0m | [0m3.0      [0m | [0m10.0     [0m | [0m733.8    [0m | [0m9.232    [0m | [0m0.5      [0m |
| [0m181      [0m | [0m-0.02622 [0m | [0m0.5      [0m | [0m0.2      [0m | [0m3.0      [0m | [0m10.0     [0m | [0m823.7    [0m | [0m9.17     [0m | [0m0.5      [0m |
| [0m182      [0m | [0m-0.0351  [0m | [0m0.9677   [0m | [0m0.1038   [0m | [0m10.0     [0m | [0m1.661    [0m | [0m949.6    [0m | [0m2.0      [0m | [0m0.9112   [0m |
| [0m183      [0m | [0m-0.02665 [0m | [0m1.0      [0m | [0m0.2      [0m | [0m3.0      [0m | [0m2.081    [0m | [0m630.1    [0m | [0m16.0     [0m | [0m1.0      [0m |
| [0m184      [0m | [0m-0.02666 [0m | [0m1.0      [0m | [0m0.2      [0m | [0m3.0      [0m | [0m10.0     [0m | [0m624.9    [0m | [0m16.0     [0m | [0m1.0      [0m |
| [0m185      [0m | [0m-0.0265  [0m | [0m0.9458   [0m | [0m0.07578  [0m | [0m9.01

In [24]:
pred_lgbm

array([ 516412.82681629,  484104.37714833, 1313749.20186284, ...,
        464876.10463881,  322488.57858796,  435345.49672596])

In [26]:
sub = pd.read_csv("kaggle-kakr-housing-data/sample_submission.csv")
sub["price"] = pred_lgbm
sub.to_csv("submission_last2.csv",index=False)