# Modelling

In [1]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.metrics import mean_squared_error

import os

# Work from the parent of the kernel's starting directory (presumably the
# project root, where `params` and the `./data/` folder live -- confirm).
# The target is resolved to an absolute path once and remembered in
# `_PROJECT_ROOT`, so re-running this cell returns to the same directory
# instead of climbing one level higher on every execution.
_PROJECT_ROOT = globals().get('_PROJECT_ROOT') or os.path.abspath('..')
os.chdir(_PROJECT_ROOT)

import params as p

In [2]:
# Load the serialized dataset from the data folder (path is relative to the
# current working directory set in the first cell).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts produced by this project / trusted sources.
data = joblib.load('./data/02_data.pkl')

### Split Data

In [3]:
# Select the model inputs (X) and the response (y) using the keys configured
# in `params`.
X = data[p.MODEL_FEATURES_WB]
y = data[p.MODEL_RESPONSE]

In [4]:
# Two-stage split: hold out 20% of all rows for testing, then carve 25% of the
# remaining 80% off for validation -- a 60/20/20 train/validation/test
# partition overall. Both stages are seeded with p.RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=p.RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=p.RANDOM_STATE,
)

In [5]:
# Shapes of the three feature partitions, in train, test, validation order.
tuple(split.shape for split in (X_train, X_test, X_val))

((5256, 14), (1752, 14), (1752, 14))

### Functions

In [6]:
def evaluate_model(model, y_test=y_test, X_test=X_test):
    """Print the root-mean-squared error of ``model`` on an evaluation set.

    Parameters
    ----------
    model
        Fitted estimator; only its ``predict`` method is used.
    y_test
        True response values to score against. Defaults to the global
        ``y_test`` as it existed when this cell was executed.
    X_test
        Inputs passed to ``model.predict``. Defaults to the global ``X_test``
        as it existed when this cell was executed.

    Returns
    -------
    None
        The RMSE is printed, not returned.

    Notes
    -----
    The defaults are bound at definition time: if the split cell is re-run,
    re-run this cell too, otherwise the old test split is still used.
    """
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_true=y_test, y_pred=predictions)

    # RMSE is the square root of the mean squared error.
    print(f'RMSE: {mse ** 0.5}')

### Dummy - baseline

In [7]:
# Baseline: always predicts the mean of the training response ('mean' is
# DummyRegressor's default strategy, spelled out here for the reader).
dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)

In [8]:
# Baseline RMSE on evaluate_model's default evaluation set (the test split).
evaluate_model(dummy)

RMSE: 642.1243706041097


### Random Forest

In [9]:
# Seed the forest with the same project-wide seed used for the data splits.
# Without it, bootstrap sampling and feature selection differ on every run,
# so the fitted model and its reported RMSE are not reproducible.
rf = RandomForestRegressor(random_state=p.RANDOM_STATE)
rf.fit(X_train, y_train)

RandomForestRegressor()

In [10]:
# Random forest RMSE on evaluate_model's default evaluation set (the test split).
evaluate_model(rf)

RMSE: 239.9266691761536


### Linear Regression

In [11]:
# Fit a linear regression on the training split; `fit` returns the estimator
# itself, and the bare name on the last line echoes it as the cell output.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [12]:
# Linear regression RMSE on evaluate_model's default evaluation set (the test split).
evaluate_model(lr)

RMSE: 436.190017318342


### Naive Bayes

In [13]:
# NOTE(review): GaussianNB is a classifier, so every distinct value of the
# response is treated as its own class label (presumably y is integer-valued,
# otherwise the fit would be rejected -- confirm). Kept as-is for comparison;
# consider whether a regressor is what is actually wanted here.
nb = GaussianNB().fit(X_train, y_train)
nb

GaussianNB()

In [14]:
# RMSE of GaussianNB's predicted values against the test response
# (evaluate_model's default evaluation set).
evaluate_model(nb)

RMSE: 442.00778934850103


### Perceptron

In [15]:
# NOTE(review): Perceptron is a linear classifier, so the response values are
# treated as class labels rather than as a continuous target -- confirm this
# is intended before comparing its RMSE with the regressors.
pcp = Perceptron().fit(X_train, y_train)
pcp

Perceptron()

In [16]:
# RMSE of the Perceptron's predicted values against the test response
# (evaluate_model's default evaluation set).
evaluate_model(pcp)

RMSE: 914.2338003928874
