In [69]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [70]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

In [71]:
# Read train.csv of the house-prices dataset from the Kaggle input directory.
train_csv_path = "../input/house-prices-advanced-regression-techniques/train.csv"
df = pd.read_csv(train_csv_path)

In [72]:
# Preview the first rows of the loaded frame.
df.head()

In [73]:
# (rows, columns) of the loaded frame.
df.shape

In [74]:
# Per-column dtype and non-null count summary.
df.info()

In [75]:
# Summary statistics, transposed so that each described column becomes one row.
df.describe().T

In [76]:
# Heatmap of the pairwise correlations between the numeric columns.
# The frame is restricted to numeric dtypes explicitly: on pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# when it meets a string column.
numeric_corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(10,8))
sns.heatmap(numeric_corr, cmap="RdBu")
plt.title("Correlations between variables", size=15)
plt.show()

`df.corr()`는 수치형 변수들 사이의 쌍별 상관계수 행렬(기본값: 피어슨 상관계수)을 계산한다.

In [77]:
# Correlation of every numeric column with the target column "SalePrice".
# Non-numeric columns are excluded up front because DataFrame.corr() raises
# on string columns with pandas >= 2.0.
df.select_dtypes(include="number").corr()["SalePrice"]

In [78]:
# Keep the numeric columns whose correlation with "SalePrice" is stronger
# than 0.50 in absolute value. "SalePrice" itself (correlation 1.0) is part of
# the result. The correlation matrix is computed once, on numeric columns
# only, because DataFrame.corr() raises on string columns with pandas >= 2.0.
saleprice_corr = df.select_dtypes(include="number").corr()["SalePrice"]
important_num_cols = list(saleprice_corr[saleprice_corr.abs() > 0.50].index)

> ## 중요한 특성 찾기

In [79]:
# Show the numeric columns that passed the correlation threshold.
important_num_cols

In [80]:
# Categorical columns kept in addition to the selected numeric columns.
cat_cols = [
    "MSZoning",
    "Utilities",
    "BldgType",
    "Heating",
    "KitchenQual",
    "SaleCondition",
    "LandSlope",
]
# Full list of columns retained for modelling: numeric first, then categorical.
important_cols = [*important_num_cols, *cat_cols]
important_cols

In [81]:
# Narrow the frame to the retained columns; `df` is rebound to this subset.
df = df.loc[:, important_cols]
df

## Missing Values 처리하기

In [82]:
# Count missing values per column once, then report the per-column counts and the grand total.
missing_by_col = df.isna().sum()
print("Missing Values by Column")
print(missing_by_col)
print("Total Missing Values: ", missing_by_col.sum())

## Data Visualization

In [83]:
# Pairwise scatter/distribution grid of the selected numeric columns.
# important_num_cols still contains "SalePrice" at this point, so the target
# appears in the grid as well.
sns.pairplot(df[important_num_cols])

## X,y Split

In [84]:
# Separate the target column from the feature columns.
target_col = "SalePrice"
y = df[target_col]
X = df.drop(columns=[target_col])

## One-hot Encoding

In [85]:
# Replace each categorical column listed in cat_cols with its one-hot indicator columns.
X = pd.get_dummies(data=X, columns=cat_cols)

## Standardizing Data

In [86]:
# Drop the target from the numeric column list. A filtering comprehension is
# used instead of list.remove() so the cell can be re-run: remove() raises
# ValueError once "SalePrice" is already gone.
important_num_cols = [col for col in important_num_cols if col != "SalePrice"]

# Standardize the numeric feature columns in place.
# NOTE(review): the scaler is fit on all rows of X, before the train/test
# split below, so statistics of the hold-out rows influence the training
# features. Fitting on the training split only would avoid that leakage.
scaler = StandardScaler()
X[important_num_cols] = scaler.fit_transform(X[important_num_cols])

In [87]:
# Preview the encoded and scaled feature matrix.
X.head()

## Train-Test Split

In [88]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [89]:
def rmse_cv(model):
    """Return the mean cross-validated RMSE of ``model``.

    Runs 5-fold cross-validation on the notebook-level ``X`` and ``y`` with
    the ``neg_mean_squared_error`` scorer, converts each fold's score to an
    RMSE, and averages the per-fold RMSE values.

    Args:
        model: Estimator accepted by ``cross_val_score``.

    Returns:
        Mean of the per-fold RMSE values.
    """
    # The scorer reports negated MSE, so flip the sign before the square root.
    fold_mse = -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    rmse = np.sqrt(fold_mse).mean()
    # Previously this returned `mse`, which is not defined in this function:
    # it either raised NameError or silently picked up a notebook-global `mse`
    # left over from an unrelated hold-out evaluation.
    return rmse

In [90]:
def evaluation(y_hat, predictions):
    """Score predictions with four regression metrics.

    Args:
        y_hat: Reference values, passed as the first argument of every metric.
            Callers in this notebook pass ``y_test`` here.
        predictions: Predicted values compared against ``y_hat``.

    Returns:
        Tuple ``(mae, mse, rmse, r_squared)``.
    """
    mse = mean_squared_error(y_hat, predictions)
    scores = (
        mean_absolute_error(y_hat, predictions),
        mse,
        np.sqrt(mse),  # RMSE derived from the MSE computed above
        r2_score(y_hat, predictions),
    )
    return scores

## Machine Learning Models

In [91]:
# Empty results table; each model cell below adds one row of scores to it.
model_report_columns = [
    "Model",
    "MAE",
    "MSE",
    "RMSE",
    "R2 Score",
    "RMSE(Cross-Validation)",
]
models = pd.DataFrame(columns=model_report_columns)

### Linear Regression

In [92]:
# Fit ordinary least squares on the training split and score it on the hold-out split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
predictions = lin_reg.predict(X_test)
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print(mae, mse, rmse, r_squared)
# Cross-validated score computed by rmse_cv on the full X, y.
rmse_cross_val = rmse_cv(lin_reg)
print(rmse_cross_val)
new_row = {"Model":"LinearRegression","MAE":mae, "MSE":mse, "RMSE":rmse, "R2 Score":r_squared, "RMSE(Cross-Validation)":rmse_cross_val}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

### Ridge Regression

In [93]:
# Fit ridge regression (default settings) on the training split and score it on the hold-out split.
ridge = Ridge()
ridge.fit(X_train, y_train)
predictions = ridge.predict(X_test)
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print(mae, mse, rmse, r_squared)
# Cross-validated score computed by rmse_cv on the full X, y.
rmse_cross_val = rmse_cv(ridge)
print(rmse_cross_val)
new_row = {"Model":"RidgeRegression","MAE":mae, "MSE":mse, "RMSE":rmse, "R2 Score":r_squared, "RMSE(Cross-Validation)":rmse_cross_val}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

### Lasso Regression

In [94]:
# Fit lasso regression (default settings) on the training split and score it on the hold-out split.
lasso = Lasso()
lasso.fit(X_train, y_train)
predictions = lasso.predict(X_test)
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print(mae, mse, rmse, r_squared)
# Cross-validated score computed by rmse_cv on the full X, y.
rmse_cross_val = rmse_cv(lasso)
print(rmse_cross_val)
new_row = {"Model":"LassoRegression","MAE":mae, "MSE":mse, "RMSE":rmse, "R2 Score":r_squared, "RMSE(Cross-Validation)":rmse_cross_val}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

### Elastic Net

In [95]:
# Fit elastic net (default settings) on the training split and score it on the hold-out split.
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
predictions = elastic_net.predict(X_test)
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print(mae, mse, rmse, r_squared)
# Cross-validated score computed by rmse_cv on the full X, y.
rmse_cross_val = rmse_cv(elastic_net)
print(rmse_cross_val)
new_row = {"Model":"ElasticNet","MAE":mae, "MSE":mse, "RMSE":rmse, "R2 Score":r_squared, "RMSE(Cross-Validation)":rmse_cross_val}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

### SVM

In [96]:
# Fit a support vector regressor on the training split and score it on the hold-out split.
# NOTE(review): the very large C is presumably needed because the target
# (SalePrice) is left unscaled - confirm and consider naming the constant.
svr = SVR(C=100000)
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print(mae, mse, rmse, r_squared)
# Cross-validated score computed by rmse_cv on the full X, y.
rmse_cross_val = rmse_cv(svr)
print(rmse_cross_val)
new_row = {"Model":"SVR","MAE":mae, "MSE":mse, "RMSE":rmse, "R2 Score":r_squared, "RMSE(Cross-Validation)":rmse_cross_val}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

### Random Forest Regressor

In [97]:
# Fit a 100-tree random forest on the training split and score it on the hold-out split.
# random_state is pinned (same seed as the train/test split) so the reported
# scores are reproducible across runs; without it every run grows different trees.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
predictions = random_forest.predict(X_test)
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print(mae, mse, rmse, r_squared)
# Cross-validated score computed by rmse_cv on the full X, y.
rmse_cross_val = rmse_cv(random_forest)
print(rmse_cross_val)
new_row = {"Model":"RandomForestRegressor","MAE":mae, "MSE":mse, "RMSE":rmse, "R2 Score":r_squared, "RMSE(Cross-Validation)":rmse_cross_val}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

### XGBoost Regressor

In [98]:
# Fit gradient-boosted trees (1000 rounds, learning rate 0.01) on the training
# split and score them on the hold-out split.
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print(mae, mse, rmse, r_squared)
# Cross-validated score computed by rmse_cv on the full X, y.
rmse_cross_val = rmse_cv(xgb)
print(rmse_cross_val)
new_row = {"Model":"XGBRegressor","MAE":mae, "MSE":mse, "RMSE":rmse, "R2 Score":r_squared, "RMSE(Cross-Validation)":rmse_cross_val}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
models = pd.concat([models, pd.DataFrame([new_row])], ignore_index=True)

## Model Comparison
RMSE가 작을수록 모델의 성능이 더 좋다.

In [99]:
# Order the results table by the "RMSE(Cross-Validation)" column, smallest value first.
models.sort_values(by=["RMSE(Cross-Validation)"], ascending=True)

In [101]:
# Bar chart with one bar per model, showing its "RMSE(Cross-Validation)" value.
fig, ax = plt.subplots(figsize=(12,8))
sns.barplot(data=models, x="Model", y="RMSE(Cross-Validation)", ax=ax)
ax.set_title("Evaluation of not-optimized models based RMSE(Cross-Validation)", size=15)
# Tilt the model names so the longer labels do not overlap.
plt.xticks(rotation=30, size=12)
plt.show()