# Modelling

In [21]:
import joblib
import pandas as pd

from params import RANDOM_STATE, MODEL_RESPONSE, MODEL_FEATURES

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.metrics import mean_squared_error

In [2]:
# Load the prepared dataset from the local pickle file (path is relative to
# the notebook's working directory).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load files produced by a trusted step of this project.
data = joblib.load('./data/02_data.pkl')

### Split Data

In [3]:
# Feature matrix and response, selected with the keys configured in params.
X = data[MODEL_FEATURES]
y = data[MODEL_RESPONSE]

In [4]:
# Two-stage split giving 60% train / 20% validation / 20% test:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining 80% (= 20% of all rows) as the validation set.
# Both calls reuse RANDOM_STATE so the partitions are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=RANDOM_STATE)

In [5]:
# Sanity check of the split sizes, displayed in the order train, test, validation.
X_train.shape, X_test.shape, X_val.shape

((5256, 14), (1752, 14), (1752, 14))

### Functions

In [23]:
def evaluate_model(model, y_test=y_test, X_test=X_test):
    """Print the root-mean-squared error (RMSE) of ``model`` on an evaluation set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    y_test
        True target values. Defaults to the notebook's ``y_test``.
    X_test
        Feature rows passed to ``model.predict``. Defaults to the notebook's
        ``X_test``.

    Returns
    -------
    None
        The score is printed, not returned.

    Notes
    -----
    The default arguments are bound when this cell runs, so re-run this cell
    after re-creating the split; otherwise the old ``X_test``/``y_test`` are
    still used. To score on the validation split instead, call
    ``evaluate_model(model, y_test=y_val, X_test=X_val)``.
    """
    y_pred = model.predict(X_test)

    # mean_squared_error yields the MSE; the square root converts it to RMSE.
    print(f'RMSE: {(mean_squared_error(y_true=y_test, y_pred=y_pred))**0.5}')

### Dummy - baseline

In [25]:
# Baseline model: DummyRegressor with default settings (scikit-learn documents
# the default strategy as always predicting the training-target mean).
dummy = DummyRegressor().fit(X_train, y_train)

In [26]:
# Baseline RMSE on evaluate_model's default evaluation data; the other models
# are compared against this number.
evaluate_model(dummy)

RMSE: 642.1243706041097


### Random Forest

In [27]:
# Seed the forest so its randomised training is repeatable under
# Restart & Run All; without random_state the reported RMSE changes every run.
rf = RandomForestRegressor(random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

RandomForestRegressor()

In [28]:
# RMSE of the random forest on evaluate_model's default evaluation data.
evaluate_model(rf)

RMSE: 236.44520474442365


### Linear Regression

In [8]:
# Ordinary least-squares fit on the training split; fit() returns the estimator
# itself, so the chained call binds the fitted model and the bare name shows it.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [29]:
# RMSE of the linear regression on evaluate_model's default evaluation data.
evaluate_model(lr)

RMSE: 436.190017318342


### Naive Bayes

In [10]:
# NOTE(review): GaussianNB is a classifier, not a regressor. Fitting it on y
# treats every distinct target value as a separate class, and predict() can
# only return values seen in y_train. Confirm this is intended for a target
# that is scored with RMSE; a regression estimator may be more appropriate.
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB()

In [30]:
# RMSE computed on the class labels predicted by the naive Bayes classifier.
evaluate_model(nb)

RMSE: 442.00778934850103


### Perceptron

In [12]:
# NOTE(review): Perceptron is a linear classifier, not a regressor. It treats
# each distinct value of y as a class label, so its predictions are limited to
# values present in y_train. Confirm this is intended for a target scored with
# RMSE; a regression estimator may be more appropriate.
pcp = Perceptron()
pcp.fit(X_train, y_train)

Perceptron()

In [31]:
# RMSE computed on the class labels predicted by the perceptron classifier.
evaluate_model(pcp)

RMSE: 914.2338003928874
