In [54]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under /kaggle/input.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [55]:
# Dataset directory and the four files read from it.
root = '/kaggle/input/house-prices-advanced-regression-techniques'
ss_path, desc_path, train_path, test_path = (
    os.path.join(root, file_name)
    for file_name in (
        'sample_submission.csv',
        'data_description.txt',
        'train.csv',
        'test.csv',
    )
)


In [56]:
# Read the sample submission, training table and test table (in that order).
ss, train, test = (
    pd.read_csv(csv_path) for csv_path in (ss_path, train_path, test_path)
)

In [57]:
# Split the label from the features: `target` holds SalePrice,
# `train` keeps every other column.
target = train.loc[:, 'SalePrice']
train = train.drop(columns=['SalePrice'])

In [58]:
# Distinct dtypes that occur among the feature columns.
column_dtypes = train.dtypes
column_dtypes.unique()

### 1. Imputation

In [59]:
# Percentage of missing values per column, shown from most to least missing.
missing_pct = train.isnull().sum().div(len(train)).mul(100)
missing_pct.sort_values(ascending=False)

In [60]:
# Columns removed from both frames.
# NOTE(review): presumably chosen for their high share of missing values — confirm.
drop_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
train, test = (frame.drop(columns=drop_columns) for frame in (train, test))

In [61]:
# Names of the training columns in which more than 5% of the values are missing.
missing_pct = train.isnull().sum().div(len(train)).mul(100)
null_features = missing_pct[missing_pct > 5].index.tolist()

In [62]:
# Add a 0/1 "<column>_NULL" indicator to train and test for every feature in
# `null_features` (original note: LotFrontage and GarageYrBlt are float
# columns, the others are object columns).
#
# The flag is computed with isnull(): NaN never compares equal to anything,
# including itself, so an `== np.nan` test would leave every indicator at 0.
# The vectorised form also removes the per-row Python loop.
for frame in (train, test):
    for c in null_features:
        # int64 matches the dtype of the constant-0 column this replaces
        frame[c + '_NULL'] = frame[c].isnull().astype('int64')

In [63]:
# dtypes of the columns that have more than 5% missing values.
train.dtypes.loc[null_features]

In [64]:
# Partition the high-missing columns by dtype: float columns go to the
# numeric list, object columns to the categorical list.
num_null_features, cat_null_features = [], []
for feature in null_features:
    kind = train[feature].dtypes
    if kind == 'float':
        num_null_features.append(feature)
    if kind == 'object':
        cat_null_features.append(feature)

In [65]:
# Fill missing numeric values with the median.
# The median is computed on the training data only and reused for the test
# set, so both frames receive the same fill value and no statistic is
# derived from the test data.
for c in num_null_features:
    train_median = train[c].median()
    train[c] = train[c].fillna(train_median)
    test[c] = test[c].fillna(train_median)

In [66]:
# Fill missing categorical values with the most frequent value.
# The mode is computed on the training data only and reused for the test set,
# so both frames receive the same fill value; this also avoids indexing an
# empty mode() result if a test column happened to be entirely missing.
for c in cat_null_features:
    train_mode = train[c].mode()[0]
    train[c] = train[c].fillna(train_mode)
    test[c] = test[c].fillna(train_mode)

In [67]:
# Remaining share of missing values in the imputed columns, train first, then test.
for frame in (train, test):
    remaining_ratio = frame[null_features].isnull().sum() / len(frame)
    print(remaining_ratio)

### 2. Encoding

In [68]:
# Sort the training columns into categorical (object dtype) and numeric
# (float64 / int64) name lists; any other dtype is left out of both.
cat_columns, num_columns = [], []
for name, kind in train.dtypes.items():
    if kind == 'object':
        cat_columns.append(name)
        continue
    if kind in ['float64', 'int64']:
        num_columns.append(name)
        

In [69]:
# Show the categorical column names on one line, separated by spaces.
print(' '.join(cat_columns))

Inspect the categorical columns

In [70]:
# Number of distinct (non-missing) values in each categorical column.
for name, n_levels in train[cat_columns].nunique().items():
    print(name, n_levels)

In [71]:
# Display the categorical part of the training frame.
train.loc[:, cat_columns]

In [72]:
# For each categorical column, remember the levels that cover more than 1% of
# the training rows (most frequent first) ...
encoding_map = {}
for c in cat_columns:
    level_share = train.groupby(c)[c].count().sort_values(ascending=False) / len(train) * 100
    encoding_map[c] = list(level_share[level_share > 1].index)

# ... and add one 0/1 column "<column>_<level>" per remembered level to both
# frames. Levels below the threshold get no column of their own.
for c, levels in encoding_map.items():
    for level in levels:
        flag_name = c + '_' + level
        train[flag_name] = np.where(train[c] == level, 1, 0)
        test[flag_name] = np.where(test[c] == level, 1, 0)

In [73]:
# The original categorical columns are no longer needed once the 0/1 flags exist.
train, test = (frame.drop(columns=cat_columns) for frame in (train, test))

In [74]:
# Shapes of the train and test frames, printed side by side.
print(*(frame.shape for frame in (train, test)))

In [82]:
# Sanity check: list any column that is still object-typed (expected: none).
remaining_object_dtypes = train.dtypes.loc[train.dtypes == 'object']
remaining_object_dtypes

### 3. Outlier Removal

In [83]:
import matplotlib.pyplot as plt

# One box plot per numeric column, two plots per row.
# The row count is rounded up so an odd number of columns still gets a panel
# (plain `len // 2` would index past the last row), and squeeze=False keeps
# `ax` two-dimensional even when there is only one row.
n_rows = max((len(num_columns) + 1) // 2, 1)
fig, ax = plt.subplots(nrows = n_rows, ncols = 2, figsize = (12,100), squeeze = False)
for i in range(len(num_columns)):
    # drop missing values so they do not turn the box statistics into NaN
    ax[i//2][i%2].boxplot(train[num_columns[i]].dropna())
    ax[i//2][i%2].set_xlabel(num_columns[i])
# hide the panels that have no column assigned
for j in range(len(num_columns), n_rows * 2):
    ax[j//2][j%2].set_visible(False)
    

In [87]:
# One histogram per numeric column, two plots per row.
# The row count is rounded up so an odd number of columns still gets a panel
# (plain `len // 2` would index past the last row), and squeeze=False keeps
# `ax` two-dimensional even when there is only one row.
n_rows = max((len(num_columns) + 1) // 2, 1)
fig, ax = plt.subplots(nrows = n_rows, ncols = 2, figsize = (12,100), squeeze = False)
for i in range(len(num_columns)):
    # missing values carry no information for the histogram, so leave them out
    ax[i//2][i%2].hist(train[num_columns[i]].dropna())
    ax[i//2][i%2].set_xlabel(num_columns[i])
# hide the panels that have no column assigned
for j in range(len(num_columns), n_rows * 2):
    ax[j//2][j%2].set_visible(False)

### 4. Removing Correlated Features

In [94]:
import seaborn as sns

# Annotated correlation heat map of the numeric columns on a 60x60 figure,
# written to correlation.pdf.
corr_fig, corr_ax = plt.subplots(figsize = (60,60))
corr_matrix = train[num_columns].corr()
sns.heatmap(corr_matrix, annot = True, cmap = 'Greens', ax = corr_ax)
corr_fig.savefig('correlation.pdf')

In [96]:
# GarageCars and GarageArea are highly correlated, so GarageCars is dropped
# from both frames and GarageArea is kept.
train, test = (frame.drop(columns='GarageCars') for frame in (train, test))

### 5. XGBoost Model

In [97]:
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score

In [98]:
# Hold out 10% of the training rows for evaluation. The fixed random_state
# makes the split, and the feature importances derived from this model,
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.1, random_state=42)
xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

print(len(X_train), len(X_test))
# NOTE(review): every remaining column of `train` is used as a feature here,
# including any identifier column — confirm that is intended.
xgb_model.fit(X_train,y_train)

In [108]:
# Rank the features by importance (highest first) and keep the names of those
# whose importance exceeds 0.001.
importances = xgb_model.feature_importances_
sorted_idx = np.argsort(importances)[::-1]

important_features = []
for position in sorted_idx:
    if importances[position] > 0.001:
        important_features.append(train.columns[position])
print(important_features)

In [109]:
new_train = train[important_features]
new_test = test[important_features]
X_train, X_test, y_train, y_test = train_test_split(train, target ,test_size=0.1)
xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

print(len(X_train), len(X_test))
xgb_model.fit(X_train,y_train)
predictions = xgb_model.predict(X_test)
r_sq = xgb_model.score(X_train, y_train)
print(r_sq)
print(explained_variance_score(predictions,y_test))

In [114]:
ss['SalePrice'] = xgb_model.predict(test)
ss.to_csv('sample_submission.csv', index = False)