# Main Notebook

## Libraries

In [None]:
import math

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

In [None]:
import datetime
def timestamp():
    """Return the current local date and time as a filename-safe string.

    The result has the form ``YYYY-MM-DD--HH-MM-SS``: second resolution, with
    dashes in place of colons so it can be embedded in a file name.
    """
    # Read the clock once so the date and time parts always describe the same
    # instant; two separate now() calls can straddle midnight.
    now = datetime.datetime.now().replace(microsecond=0)
    return f"{now.date()}--{now.time()}".replace(":", "-")

## Load data and describe

In [None]:
# Training data; later cells read its "id" and "target" columns.
df = pd.read_csv("../data/train.csv")
# Data to predict for in the submission; later cells read its "id" column.
df_test = pd.read_csv("../data/test.csv")

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Remove the "id" column from the training frame.
df = df.drop(labels="id", axis="columns")

In [None]:
# Per-column count of missing entries (author's note: there are none).
df.isna().sum()

In [None]:
# Display summary statistics of the training frame.
df.describe()

## Visualize distributions and relationships

In [None]:
# Correlate the numeric columns only. The frame is later treated as having
# object (categorical) columns, and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr = df.select_dtypes(include="number").corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# One histogram per float64 column of the training frame.
numerical_features = df.select_dtypes(include="float64").columns
for column in numerical_features:
    sns.histplot(data=df, x=column)
    plt.show()

In [None]:
# Hex-bin joint plot of every float64 feature against the target.
# The target column is excluded first (as the preprocessing cell does), so a
# float64 target is not plotted against itself.
numerical_features = df.drop(columns=["target"]).select_dtypes(include="float64").columns
for feature in numerical_features:
    sns.jointplot(x=df[feature], y=df["target"], kind="hex")
    plt.show()

In [None]:
# For every object-typed column: category counts on the left and the
# distribution of "target" per category on the right.
categorical_features = df.select_dtypes(include="object").columns
for column in categorical_features:
    fig, (count_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))
    sns.histplot(data=df, x=column, ax=count_ax)
    sns.boxplot(data=df, x=column, y="target", ax=box_ax)
    plt.show()

## Feature engineering

## Prepare for models

### Pipeline template

In [None]:
# Float64 feature columns (target excluded) are standardised.
numerical_features = df.drop(columns=["target"]).select_dtypes(include="float64").columns
numerical_transformer = Pipeline([("scaler", StandardScaler())])

# Object-typed columns are one-hot encoded. drop="first" omits the first
# category of each column; whether that is needed is an open question carried
# over from the original note.
categorical_features = df.select_dtypes(include="object").columns
categorial_transformer = OneHotEncoder(drop="first")

# Column-wise preprocessing used by the model pipelines in the training cells.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
column_steps = [
    ("num", numerical_transformer, numerical_features),
    ("cat", categorial_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Hold out 30% of the rows for evaluation; the seed is fixed at 42.
features = df.drop(columns="target")
labels = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42
)

## Training

### Linear regression

In [None]:
# Baseline model: the shared preprocessor followed by a plain linear regression.
regression = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regression", LinearRegression()),
    ]
)

regression.fit(X_train, y_train)

In [None]:
# Root-mean-squared error of the linear model on the training and hold-out splits.
y_train_pred = regression.predict(X_train)
y_test_pred = regression.predict(X_test)

train_rmse = math.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = math.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Training error: {train_rmse}")
print(f"Test error: {test_rmse}")

### Ridge

In [None]:
# Ridge regression; alpha is selected by 3-fold cross-validation scored on
# negative RMSE.
ridge = Pipeline([("preprocessor", preprocessor), ("regression", Ridge())])

param_grid = {"regression__alpha": [0.1, 1.0, 10, 100, 500, 1000]}

grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score in the search above.
grid_search.best_params_

In [None]:
# RMSE of the searched model on both splits; predict() goes through
# grid_search, which delegates to its refitted best estimator.
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

rmse_train = math.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = math.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Training error: {rmse_train}")
print(f"Test error: {rmse_test}")

### Lasso

In [None]:
# Lasso regression; alpha is selected by 3-fold cross-validation scored on
# negative RMSE.
lasso = Pipeline(
    steps=[("preprocessor", preprocessor), ("regression", Lasso())]
)

# The original grid listed alpha=0.1 twice, which only repeats identical
# cross-validation fits; each value now appears once.
# NOTE(review): the duplicate may have been meant as 0.01 - confirm and add it
# if a smaller alpha should be searched.
param_grid = {
    'regression__alpha': [0.1, 0.2, 0.3, 0.6, 1.0, 10, 100],
}

grid_search = GridSearchCV(lasso, param_grid, scoring="neg_root_mean_squared_error", cv=3)

grid_search.fit(X_train, y_train)

In [None]:
# Per-candidate cross-validation results (scores, timings, parameters) of the search.
grid_search.cv_results_

In [None]:
# Parameter combination with the best cross-validated score in the search above.
grid_search.best_params_

In [None]:
# RMSE of the searched model on the training and hold-out splits.
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

training_error = math.sqrt(mean_squared_error(y_train, y_train_pred))
test_error = math.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Training error: {training_error}")
print(f"Test error: {test_error}")

### Random Forest Regressor

In [None]:
# Random forest with default hyper-parameters on top of the shared preprocessor.
forest = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("forest", RandomForestRegressor()),
    ]
)

forest.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the default random forest. Training-set evaluation is
# disabled in this cell.
y_test_pred = forest.predict(X_test)

test_rmse = math.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test error: {test_rmse}")

In [None]:
# Random forest tuned over tree count and depth with 2-fold cross-validation,
# scored on negative RMSE and run on all available cores.
forest = Pipeline([("preprocessor", preprocessor), ("forest", RandomForestRegressor())])

param_grid = {
    "forest__n_estimators": [10, 50, 100, 1000, 5000],
    "forest__max_depth": [5, 10, 15, 20],
}

grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Full parameter set of the best pipeline selected by the grid search.
grid_search.best_estimator_.get_params()

In [None]:
# RMSE of the searched model on the training and hold-out splits.
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

error_on_train = math.sqrt(mean_squared_error(y_train, y_train_pred))
error_on_test = math.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Training error: {error_on_train}")
print(f"Test error: {error_on_test}")

## Submission

In [None]:
# Keep the ids for the submission file, then remove them from the model input.
submission_id = df_test["id"]
df_test = df_test.drop(labels="id", axis="columns")

In [None]:
# Predict with the tuned model. GridSearchCV fits clones of `forest`, so the
# `forest` pipeline rebuilt for the search is itself never fitted and
# forest.predict() would raise NotFittedError. With the default refit=True,
# grid_search.predict() uses the best pipeline refitted on the training split.
submission_y = grid_search.predict(df_test)

In [None]:
# Two-column submission frame: the test ids and their predicted targets.
submission = pd.DataFrame({"id": submission_id, "target": submission_y})

In [None]:
# Display the assembled submission frame.
submission

In [None]:
# Write the submission without the index column; the timestamp in the file
# name keeps successive runs from overwriting each other.
submission_path = f"../submissions/submission_{timestamp()}.csv"
submission.to_csv(submission_path, index=False)