# Modeling

In [1]:
# Set code path
# Make the sibling ``code`` directory importable. The relative path is
# resolved against the kernel's current working directory.
import sys
import os

code_dir = os.path.abspath(os.path.join("..", "code"))
# Guard against piling up duplicate entries when this cell is re-executed.
if code_dir not in sys.path:
    sys.path.append(code_dir)

In [2]:
from impute_and_drop import impute_missing, drop_data
from one_hot_encoding import one_hot_encoding
from models import RFC_train_and_evaluate, dummy_model
#from preprocess import get_preprocessed_data
import json
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
# Load the dataset through the project's loader.
# Disabled alternative loading path, kept for reference:
#data = get_preprocessed_data(data_file_name="cis2080.csv", labels_file_name="descriptive_var_names.json")
# Import the loader by name rather than with a wildcard, so it is clear
# where ``load_my_data`` comes from and no other names leak into the notebook.
from utils import load_my_data
data = load_my_data()

## Subset data
Select only those attributes whose metadata is considered complete.

In [4]:
# Read the variable metadata; each entry exposes a "name" and a "description".
with open("../metadata/descriptive_var_names.json") as metadata_file:
    var_names = json.load(metadata_file)

# Keep the names of the variables whose description is not "incomplete".
selected_vars = [
    meta["name"]
    for meta in var_names.values()
    if meta["description"] != "incomplete"
]

# Restrict the data to those variables, leaving out the date components.
subdata = data[selected_vars].drop(columns=["day", "month", "year"])

## Missing values imputation
### Exploration

In [5]:
# Impute the missing values of the subset, then pass the imputed frame through
# drop_data. Note: this rebinds `data`, replacing the frame loaded earlier.
data = drop_data(impute_missing(subdata))

## RandomForestRegressor

### One hot encoding

In [6]:
# Build the inputs for the regressor: the helper returns a pair that is used
# below as the feature matrix X and the target y.
# NOTE(review): "UBE" is presumably the target column that gets separated from
# the one-hot encoded features -- confirm against one_hot_encoding.
X, y = one_hot_encoding(data, "UBE")

### Split dataset

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Model

In [8]:
# Small forest: 5 trees, each at most 4 levels deep, with a fixed seed.
RFR = RandomForestRegressor(
    n_estimators=5,
    max_depth=4,
    random_state=1,
)


In [9]:
# Fit the regressor on the training split.
RFR.fit(X=X_train, y=y_train)

In [10]:
# Predict on the held-out rows with the dummy baseline and with the forest.
dummy_pred = dummy_model(y_train, X_test)
RFR_pred = RFR.predict(X_test)

# Score both sets of predictions by root mean squared error on the test split.
RFR_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=RFR_pred))
dummy_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=dummy_pred))

print("RFR RMSE: {:.2f}".format(RFR_rmse))
print("Dummy RMSE: {:.2f}".format(dummy_rmse))

RMSE: 7.14


In [11]:
# 10th and 90th percentiles of UBE, to put the RMSE into perspective.
data["UBE"].quantile(q=[0.1, 0.9])

0.1     0.0
0.9    16.0
Name: UBE, dtype: float64

Given these quantiles (the central 80% of the UBE values lie between 0 and 16), an RMSE of about 7 indicates that the predictions are very poor. Nevertheless, there are plenty of aspects to improve now that a basic model is running.

## RandomForestClassifier

### One hot encoding

In [12]:
# Build the classifier inputs for "drinking_pattern"; UBE, UBEplus and the
# two drink_loc columns are removed from the frame before encoding.
X, y = one_hot_encoding(
    data.drop(columns=["UBE", "UBEplus", "drink_loc1", "drink_loc2"]),
    "drinking_pattern",
)

In [13]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [14]:
# Random forest classifier with library defaults and a fixed seed.
RFC = RandomForestClassifier(random_state=0)

In [15]:
# Fit the classifier on the training split.
RFC.fit(X=X_train, y=y_train)

In [16]:
# Predict on the held-out rows with the dummy baseline and with the forest.
dummy_pred = dummy_model(y_train, X_test)
RFC_pred = RFC.predict(X_test)

In [17]:
# Compare test-set accuracy of the forest against the dummy baseline.
RFC_accuracy = accuracy_score(y_true=y_test, y_pred=RFC_pred)
dummy_accuracy = accuracy_score(y_true=y_test, y_pred=dummy_pred)

print("RFC_Accuracy: {:.2f}%".format(RFC_accuracy * 100))
print("Dummy_Accuracy: {:.2f}%".format(dummy_accuracy * 100))

RFC_Accuracy: 64.87%
Dummy_Accuracy: 60.07%


## Model explainability

In [18]:
# Run with UBE, UBEplus and both drink_loc columns passed as columns_to_drop.
RFC_train_and_evaluate(
    data,
    target="drinking_pattern",
    columns_to_drop=["UBE", "UBEplus", "drink_loc1", "drink_loc2"],
)

RFC_Accuracy: 65.52%
Dummy_Accuracy: 59.44%


In [19]:
# Run with only UBE and UBEplus passed as columns_to_drop (drink_loc columns stay in the frame).
RFC_train_and_evaluate(
    data,
    target="drinking_pattern",
    columns_to_drop=["UBE", "UBEplus"],
)

RFC_Accuracy: 81.50%
Dummy_Accuracy: 59.87%


In [20]:
# Run with only the drink_loc columns passed as columns_to_drop (UBE and UBEplus stay in the frame).
RFC_train_and_evaluate(
    data,
    target="drinking_pattern",
    columns_to_drop=["drink_loc1", "drink_loc2"],
)

RFC_Accuracy: 92.03%
Dummy_Accuracy: 59.81%


### Those who drink

In [21]:
# Keep only the rows with a positive UBE and evaluate the classifier on them.
# A dedicated name is used instead of rebinding `subdata`: that name holds the
# column subset consumed by the imputation cell, and overwriting it here would
# silently change the result of re-running that cell.
drinkers = data.loc[data["UBE"] > 0].copy()
RFC_train_and_evaluate(drinkers, target="drinking_pattern", columns_to_drop=["UBE", "UBEplus"])

RFC_Accuracy: 95.47%
Dummy_Accuracy: 95.47%
