In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn
# deprecation warnings raised by later cells — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Load the combined train/validation data, parsing the sale date as a datetime.
# A forward-slash path works on Windows, macOS and Linux alike (the previous
# backslash-separated path only resolved on Windows).
df = pd.read_csv("assets/trainandvalid.csv", low_memory=False, parse_dates=["saledate"])

# Work on a chronologically sorted copy so the raw frame `df` stays untouched.
# sort_values returns a new DataFrame, so no explicit copy + in-place sort is needed.
df_tmp = df.sort_values(by="saledate", ascending=True)

In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Count missing values per column in the raw frame.
df.isna().sum()

In [None]:
# Quick look at sale price against sale date for the first 1000 rows of the raw frame.
fig, ax = plt.subplots()
ax.scatter(df["saledate"].iloc[:1000], df["SalePrice"].iloc[:1000])
# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the repr of the returned artists.
ax.set(title="SalePrice vs. saledate (first 1000 rows)",
       xlabel="saledate",
       ylabel="SalePrice");

In [None]:
# Break the sale timestamp into numeric calendar features.
# Maps each new column name to the `.dt` accessor attribute that provides it.
sale_dt = df_tmp["saledate"].dt
date_parts = {
    "saleYear": "year",
    "saleMonth": "month",
    "saleDay": "day",
    "saleDayOfWeek": "dayofweek",
    "saleDayOfYear": "dayofyear",
}
for feature_name, dt_attr in date_parts.items():
    df_tmp[feature_name] = getattr(sale_dt, dt_attr)

In [None]:
# The raw timestamp column is no longer needed once the calendar features exist.
df_tmp = df_tmp.drop(columns=["saledate"])

In [None]:
# Convert every object-dtype column to pandas' categorical dtype.
object_cols = [name for name in df_tmp.columns
               if pd.api.types.is_object_dtype(df_tmp[name])]
for name in object_cols:
    df_tmp[name] = df_tmp[name].astype("category")

In [None]:
# Time-based split: sales from 2012 form the validation set, everything else trains.
is_validation_year = df_tmp["saleYear"] == 2012

# Take explicit copies: later cells assign columns on these frames, and writing
# into a filtered selection of df_tmp raises SettingWithCopyWarning.
df_train = df_tmp[~is_validation_year].copy()
df_valid = df_tmp[is_validation_year].copy()

In [None]:
# Replace each categorical column with its integer codes.
# pandas encodes missing values as -1, so the +1 shift maps "missing" to 0 and
# real categories to 1..k.
# The dtype check uses isinstance(..., pd.CategoricalDtype) because
# pd.api.types.is_categorical_dtype is deprecated.
for frame in (df_train, df_valid):
    for col in frame.columns:
        if isinstance(frame[col].dtype, pd.CategoricalDtype):
            frame[col] = frame[col].cat.codes + 1

In [None]:
# Fill missing values in non-categorical columns with the TRAINING median.
# The validation set is imputed with the training statistic as well, so no
# information from the validation period leaks into preprocessing and both sets
# go through the same transformation.
for col in df_train.columns:
    if isinstance(df_train[col].dtype, pd.CategoricalDtype):
        continue

    train_has_na = df_train[col].isna().any()
    valid_has_na = df_valid[col].isna().any()
    if not (train_has_na or valid_has_na):
        continue

    # Computed before any filling, from training rows only.
    train_median = df_train[col].median()
    if train_has_na:
        df_train[col] = df_train[col].fillna(train_median)
    if valid_has_na:
        df_valid[col] = df_valid[col].fillna(train_median)

In [None]:
# Separate the target column from the feature matrix for both splits.
TARGET = "SalePrice"

X_train = df_train.drop(columns=[TARGET])
y_train = df_train[TARGET]

X_valid = df_valid.drop(columns=[TARGET])
y_valid = df_valid[TARGET]

In [None]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def rmsle(y_test, y_preds):
    """Return the root mean squared log error between targets and predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_preds : array-like
        Predicted target values.

    Returns
    -------
    float
        Square root of scikit-learn's ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

def show_scores(model):
    """Evaluate a fitted regressor on the notebook's train and validation splits.

    Reads the notebook-level ``X_train``/``y_train`` and ``X_valid``/``y_valid``.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.

    Returns
    -------
    dict
        MAE, RMSLE and R^2 for the training and validation sets, keyed by name.
    """
    # Local import: only this function needs r2_score.
    from sklearn.metrics import r2_score

    # Predict once per split and reuse the predictions for every metric.
    # Calling model.score() would run a second full prediction pass over each
    # split just to compute the same R^2 value.
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)

    return {
        "Training MAE": mean_absolute_error(y_train, train_preds),
        "Valid MAE": mean_absolute_error(y_valid, val_preds),
        "Training RMSLE": rmsle(y_train, train_preds),
        "Valid RMSLE": rmsle(y_valid, val_preds),
        "Training R^2": r2_score(y_train, train_preds),
        "Valid R^2": r2_score(y_valid, val_preds),
    }

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter search space for the random forest.
# max_samples caps each tree at 10,000 bootstrap rows to keep the search fast.
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1.0, "sqrt"],
           "max_samples": [10000]}

# Seed the search itself as well as the forest: without random_state on
# RandomizedSearchCV the sampled parameter combinations differ on every run.
# n_jobs=-1 lets each forest use all cores; it does not change the results.
# NOTE(review): cv=5 is plain K-fold on time-ordered sales data — confirm
# whether a time-aware splitter would be more appropriate here.
rs_model = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=42),
    param_distributions=rf_grid,
    n_iter=20,
    cv=5,
    verbose=True,
    random_state=42,
)

rs_model.fit(X_train, y_train)

In [None]:
# Refit a forest on the full training set with fixed hyperparameters.
# NOTE(review): these values are presumably copied from an earlier search
# run's best_params_ — confirm they are still current.
best_params = {
    "n_estimators": 90,
    "min_samples_leaf": 1,
    "min_samples_split": 14,
    "max_features": 0.5,
    "max_samples": None,  # use every training row for each tree
}
rs_model_best = RandomForestRegressor(random_state=42, **best_params)
rs_model_best.fit(X_train, y_train)

In [None]:
# Report MAE, RMSLE and R^2 on the training and validation sets for the refit model.
show_scores(rs_model_best)

In [None]:
# Load the test set the same way as the training data: portable forward-slash
# path (the backslash form only resolved on Windows) and low_memory=False so
# column dtypes are inferred from the whole file, as for the training load.
df_test = pd.read_csv("assets/test.csv", low_memory=False, parse_dates=["saledate"])

# Derive the same calendar features that were added to the training frame.
test_sale_date = df_test["saledate"]
df_test["saleYear"] = test_sale_date.dt.year
df_test["saleMonth"] = test_sale_date.dt.month
df_test["saleDay"] = test_sale_date.dt.day
df_test["saleDayOfWeek"] = test_sale_date.dt.dayofweek
df_test["saleDayOfYear"] = test_sale_date.dt.dayofyear

# The raw timestamp is dropped once its parts are extracted.
df_test = df_test.drop(columns=["saledate"])

In [None]:
# Convert test columns to categoricals using the TRAINING category sets.
# Inferring categories from the test file alone gives each value a different
# integer code than the model saw during training, which silently corrupts
# predictions. df_tmp still holds the categorical dtypes built from the
# training data, so those dtypes are reused here; values unseen in training
# become missing.
for col in df_test.columns:
    train_dtype = df_tmp[col].dtype if col in df_tmp.columns else None
    if isinstance(train_dtype, pd.CategoricalDtype):
        df_test[col] = df_test[col].astype(train_dtype)
    elif pd.api.types.is_object_dtype(df_test[col]):
        # No training counterpart to align with: fall back to local categories.
        df_test[col] = df_test[col].astype("category")

In [None]:
# Replace each categorical test column with its integer codes.
# pandas encodes missing values as -1, so the +1 shift maps "missing" to 0.
# isinstance(..., pd.CategoricalDtype) replaces the deprecated
# pd.api.types.is_categorical_dtype.
for col in df_test.columns:
    if isinstance(df_test[col].dtype, pd.CategoricalDtype):
        df_test[col] = df_test[col].cat.codes + 1

In [None]:
# Fill missing values in the test set with the TRAINING median of each column,
# so the test data goes through the same transformation the model was fitted on.
# This also keeps a column that is entirely missing in the test file from
# staying NaN, which filling with its own (NaN) median would leave.
for col in df_test.columns:
    if isinstance(df_test[col].dtype, pd.CategoricalDtype):
        continue
    if not df_test[col].isna().any():
        continue

    if col in X_train.columns:
        fill_value = X_train[col].median()
    else:
        # No training counterpart: fall back to the test column's own median.
        fill_value = df_test[col].median()
    df_test[col] = df_test[col].fillna(fill_value)

In [None]:
# Predict sale prices for the test set.
# Selecting X_train.columns puts the features in the exact order the model was
# fitted on, and raises a KeyError naming any training feature that is missing.
# Note that it also drops any test-only columns.
test_preds = rs_model_best.predict(df_test[X_train.columns])
test_preds

In [None]:
import seaborn as sns

def plot_features(columns, importances, n=20):
    """Draw a horizontal bar chart of the ``n`` most important features.

    Parameters
    ----------
    columns : sequence
        Feature names.
    importances : sequence
        Importance score for each feature, in the same order as ``columns``.
    n : int, default 20
        Number of top-ranked features to plot.
    """
    importance_table = pd.DataFrame({"features": columns,
                                     "feature_importance": importances})
    # Rank from most to least important and renumber the rows.
    ranked = (importance_table
              .sort_values("feature_importance", ascending=False)
              .reset_index(drop=True))

    sns.barplot(x="feature_importance", y="features", data=ranked[:n])

In [None]:
# Plot the refit forest's feature importances against the training feature
# names (top 20 by default).
plot_features(X_train.columns, rs_model_best.feature_importances_)