In [None]:
from sklearn.ensemble import RandomForestRegressor as rf

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random

# Global notebook setup: apply the "ggplot" matplotlib style and pin both the
# stdlib RNG and NumPy's global RNG to the same fixed seed.
plt.style.use("ggplot")
random.seed(1234)
np.random.seed(1234)

In [None]:
# Read the training set, the test set and the sample-submission template
# (in that order) from the local ./data folder.
train_df, test_df, submission = map(
    pd.read_csv,
    ("./data/train.csv", "./data/test.csv", "./data/sample_submission.csv"),
)

In [None]:
# Stack train on top of test so preprocessing is applied to both at once;
# the combined frame gets a fresh 0..n-1 row index.
all_df = pd.concat([train_df, test_df], axis=0, sort=False, ignore_index=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Names of every column whose dtype is "object"; print the names followed by
# how many there are.
categories = all_df.columns[all_df.dtypes.eq("object")]
print(categories, len(categories), sep="\n")

In [None]:
# Label-encode every object-dtype column of the combined train+test frame.
for cat in categories:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the `all_df[cat]` selection: the chained in-place form emits a
    # FutureWarning on recent pandas and is a no-op under Copy-on-Write, which
    # would leave NaN values in the data handed to the encoder.
    all_df[cat] = all_df[cat].fillna("missing")

    # Fit the encoder on this column (with "missing" as its own label), then
    # replace the strings by their integer codes.
    le = LabelEncoder()
    le = le.fit(all_df[cat])
    all_df[cat] = le.transform(all_df[cat])

    # Store the integer codes with pandas' categorical dtype.
    all_df[cat] = all_df[cat].astype("category")

In [None]:
# Engineered feature: basement + first-floor + second-floor square footage.
all_df["TotalSF"] = (
    all_df["TotalBsmtSF"].add(all_df["1stFlrSF"]).add(all_df["2ndFlrSF"])
)

# Engineered feature: sum of the four bathroom-count columns
# (full, half, basement full, basement half).
all_df["Total_bathrooms"] = (
    all_df["FullBath"]
    .add(all_df["HalfBath"])
    .add(all_df["BsmtFullBath"])
    .add(all_df["BsmtHalfBath"])
)

In [None]:
# Collect (and print, with their null counts) all columns that still contain
# missing values. "SalePrice" is excluded from the list.
hasnan_cat = []
for col in all_df.columns:
    tmp_null_count = all_df[col].isna().sum()
    if col == "SalePrice" or tmp_null_count <= 0:
        continue
    hasnan_cat.append(col)
    print(col, tmp_null_count)

In [None]:
# Summary statistics for the columns that were found to contain missing values.
all_df.loc[:, hasnan_cat].describe()

In [None]:
# Median-impute every column that still has missing values, except the
# "SalePrice" target.
for col in all_df.columns:
    tmp_null_count = all_df[col].isnull().sum()
    # Plain boolean `and` (both operands are scalars, not arrays).
    if tmp_null_count > 0 and col != "SalePrice":
        print(col, tmp_null_count)
        # Assign the result back rather than using fillna(inplace=True) on the
        # `all_df[col]` selection: the chained in-place form raises a
        # FutureWarning on recent pandas and silently does nothing under
        # Copy-on-Write, so the NaNs would survive into model training.
        all_df[col] = all_df[col].fillna(all_df[col].median())

In [None]:
# Split the combined frame back apart: rows with a SalePrice are training
# rows, rows without one are test rows. Take explicit copies so that adding a
# column below writes to an independent frame instead of a slice of `all_df`
# (which triggers SettingWithCopyWarning).
train_df_le = all_df[~all_df["SalePrice"].isnull()].copy()
test_df_le = all_df[all_df["SalePrice"].isnull()].copy()

# The model is trained on the natural log of the sale price.
train_df_le["SalePrice_log"] = np.log(train_df_le["SalePrice"])

# Features: everything except the raw target, the log target and the Id column.
train_X = train_df_le.drop(["SalePrice", "SalePrice_log", "Id"], axis=1)
train_Y = train_df_le["SalePrice_log"]

In [None]:
from sklearn.model_selection import KFold

# Cross-validation splitter with `folds` splits. No shuffle or random_state
# argument is passed, so scikit-learn's defaults apply.
folds = 3
kf = KFold(folds)

In [None]:
from pyexpat import model
from sklearn.metrics import mean_squared_error

# K-fold training of a random-forest regressor on the log target.
#   models_rf - one fitted model per fold
#   rmses_rf  - validation RMSE per fold (in log-price units, since train_Y is logged)
#   oof_rf    - out-of-fold prediction for every training row
models_rf, rmses_rf = [], []
oof_rf = np.zeros(len(train_X))

for train_index, val_index in kf.split(train_X):
    # Positional row selection for this fold's train / validation parts.
    X_train, y_train = train_X.iloc[train_index], train_Y.iloc[train_index]
    X_vaild, y_valid = train_X.iloc[val_index], train_Y.iloc[val_index]

    # fit() returns the estimator itself, so it can be chained.
    model_rf = rf(n_estimators=50, random_state=1234).fit(X_train, y_train)
    y_pred = model_rf.predict(X_vaild)

    tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(tmp_rmse)

    models_rf.append(model_rf)
    rmses_rf.append(tmp_rmse)
    # Store this fold's predictions at the validation rows' positions.
    oof_rf[val_index] = y_pred

In [None]:
# Average of the per-fold RMSE values collected in `rmses_rf`
# (displayed as the cell output).
sum(rmses_rf) / len(rmses_rf)

In [None]:
# Test features: drop the (all-null) target and the Id column.
test_X = test_df_le.drop(columns=["SalePrice", "Id"])

In [None]:
# One prediction array on the test features per fold model, in fold order.
preds_rf = [fold_model.predict(test_X) for fold_model in models_rf]

In [None]:

# Stack the per-model predictions (one row per model) and average across models.
preds_array_rf = np.array(preds_rf)
preds_mean_rf = preds_array_rf.mean(axis=0)

# The models were trained on log(SalePrice); exponentiate to get back to the
# price scale and write the result into the submission frame.
preds_exp_rf = np.exp(preds_mean_rf)
submission["SalePrice"] = preds_exp_rf

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing; otherwise a fresh checkout would
# fail at the very last step of the run.
os.makedirs("./submit", exist_ok=True)
submission.to_csv("./submit/submission_rf.csv", index=False)