# Modelling

In [29]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.metrics import mean_squared_error

import params as p

In [30]:
# Load the modelling dataset from the configured data directory.
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
data_file = p.DATA_PATH + '02_data.pkl'
data = joblib.load(data_file)

### Split Data

In [31]:
# Features and response, selected from the loaded data with the keys configured in params.
X = data[p.CORE_MODEL_FEATURES]
y = data[p.MODEL_RESPONSE]

In [32]:
# Two-stage split: hold out 20% as the test set, then take 25% of the
# remaining 80% as validation, i.e. roughly 60/20/20 train/val/test.
# The same seed is used for both stages so the split is reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=p.RANDOM_STATE
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=p.RANDOM_STATE
)

In [33]:
# Shapes of the train, test and validation feature sets, in that order.
tuple(part.shape for part in (X_train, X_test, X_val))

((4920, 15), (1641, 15), (1640, 15))

### Functions

In [34]:
def evaluate_model(model, y_test=y_test, X_test=X_test):
    """Print and return the RMSE of ``model`` on an evaluation set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    y_test
        True response values. Defaults to the notebook's ``y_test`` as it
        existed when this cell was run (default arguments are bound at
        definition time), so re-run this cell after re-splitting the data.
    X_test
        Features to predict on; same default-binding caveat as ``y_test``.

    Returns
    -------
    float
        Root mean squared error between ``y_test`` and the predictions, so
        scores can be collected and compared instead of only read from stdout.
    """
    y_pred = model.predict(X_test)

    # Square root of the MSE puts the error back in the units of the response.
    rmse = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5

    print(f'RMSE: {rmse}')

    return rmse

In [35]:
def save_model(model, model_name: str):
    """Persist ``model`` with joblib as ``model_<model_name>.pkl``.

    The file name is appended directly to ``p.OUTPUTS_PATH`` (plain string
    concatenation), so that setting must already end with a path separator.
    """
    target_file = p.OUTPUTS_PATH + f'model_{model_name}.pkl'
    joblib.dump(model, target_file)

In [36]:
def save_preds(model, model_name: str, X_test=X_test):
    """Write the predictions of ``model`` on ``X_test`` to ``preds_<model_name>.csv``.

    The predictions are wrapped in a DataFrame and written with ``to_csv``'s
    default settings. The file name is appended directly to ``p.OUTPUTS_PATH``.
    ``X_test`` defaults to the notebook's ``X_test`` as bound when this cell
    was run.
    """
    predictions = model.predict(X_test)
    target_file = p.OUTPUTS_PATH + f'preds_{model_name}.csv'

    pd.DataFrame(predictions).to_csv(target_file)

In [37]:
def save_model_and_preds(model, model_name: str, X_test=X_test):
    """Save both the fitted ``model`` and its predictions on ``X_test``.

    Thin convenience wrapper: calls ``save_model`` first, then ``save_preds``,
    using the same ``model_name`` for both output files.
    """
    save_model(model=model, model_name=model_name)
    save_preds(model=model, model_name=model_name, X_test=X_test)

### Dummy - baseline

In [38]:
# Baseline: a DummyRegressor with default settings, fitted on the training split.
# Its score is the reference the other models are compared against.
dummy = DummyRegressor().fit(X=X_train, y=y_train)

In [39]:
# Score the baseline and persist both the model and its predictions.
# NOTE(review): evaluate_model scores on the test split by default; X_val / y_val
# are not used in the visible cells — confirm the test set is not used for model selection.
evaluate_model(model=dummy)
save_model_and_preds(model=dummy, model_name='dummy')

RMSE: 642.9596807391599


### Random Forest

In [40]:
# Random forest with default hyper-parameters. The seed is fixed with the same
# RANDOM_STATE used for the data splits, so the fitted model and its RMSE are
# reproducible under Restart & Run All.
rf = RandomForestRegressor(random_state=p.RANDOM_STATE)
rf.fit(X_train, y_train)

RandomForestRegressor()

In [41]:
# Score the random forest on the default (test) split, then persist model and predictions.
evaluate_model(model=rf)
save_model_and_preds(model=rf, model_name='rf')

RMSE: 3.5323416508155834


### Linear Regression

In [42]:
# Ordinary least-squares linear regression with default settings, fitted on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [43]:
# Score the linear regression on the default (test) split, then persist model and predictions.
# NOTE(review): the recorded RMSE is ~2e-13, i.e. effectively zero. That suggests the
# response may be an exact linear combination of the features (possible target
# leakage) — verify the contents of CORE_MODEL_FEATURES before trusting this score.
evaluate_model(model=lr)
save_model_and_preds(model=lr, model_name='lr')

RMSE: 2.2973822675384153e-13


### Naive Bayes

In [44]:
# Gaussian naive Bayes with default settings, fitted on the training split.
# NOTE(review): GaussianNB is a classifier, whereas the other models here are
# regressors scored with RMSE; it treats each distinct response value as a class
# label — confirm this is intended for this response.
nb = GaussianNB()
nb.fit(X=X_train, y=y_train)

GaussianNB()

In [45]:
# Score the naive Bayes model on the default (test) split, then persist model and predictions.
evaluate_model(model=nb)
save_model_and_preds(model=nb, model_name='nb')

RMSE: 5.752883917644819
