In [1]:
# path setup
import sys
import os
module_path = os.path.abspath(os.path.join('../../'))
sys.path.insert(1, module_path + "/utils")

## db setup
# pip install sqlalchemy
from sqlalchemy import create_engine
from getpass import getpass 

# pandas setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from modeling import lag_columns, extract_date_features
from data_from_db import get_table_from_shelter
from eda import print_correlation_matrix, cramers_v
from model_helper import build_regression_models, preprocess_data


In [2]:
df = get_table_from_shelter('shelter_climate')
df = df[df['location_city'] == 'Toronto']
df['capacity_units'] = df['taken_units'] + df['free_units']
df_model = df.copy()

In [3]:
agg_functions = {
    'taken_units': 'sum',
    'free_units': 'sum',
    'capacity_units': 'sum',
    'min_temperature': 'mean',
    'total_precipitation': 'mean',
    'mean_temperature': 'mean',
    'max_temperature': 'mean',
    'snow_on_ground': 'mean'
}

df_model = df_model.groupby('date').agg(agg_functions).reset_index()
df_model.columns = ['date', 'total_taken_units', 'total_free_units', 'total_capacity_units',
                    'avg_min_temperature', 'avg_total_precipitation', 'avg_mean_temperature',
                    'avg_max_temperature', 'avg_snow_on_ground']

In [4]:
df_model['date'] = pd.to_datetime(df_model['date'])
df_model.dtypes

  df_model['date'] = pd.to_datetime(df_model['date'])


date                       datetime64[ns]
total_taken_units                 float64
total_free_units                  float64
total_capacity_units              float64
avg_min_temperature               float64
avg_total_precipitation           float64
avg_mean_temperature              float64
avg_max_temperature               float64
avg_snow_on_ground                float64
dtype: object

In [5]:
df_model

Unnamed: 0,date,total_taken_units,total_free_units,total_capacity_units,avg_min_temperature,avg_total_precipitation,avg_mean_temperature,avg_max_temperature,avg_snow_on_ground
0,2021-01-01,2988.0,118.0,3106.0,-1.0,6.8,0.7,2.5,0.0
1,2022-01-01,3717.0,79.0,3796.0,-2.1,2.4,1.5,5.1,0.0
2,2023-01-01,4596.0,17.0,4613.0,2.7,1.5,3.9,5.0,0.0
3,2024-01-01,5601.0,55.0,5656.0,-3.1,0.1,-1.9,-0.7,2.0
4,2021-01-02,3072.0,110.0,3182.0,-9.6,0.0,-5.8,-2.0,4.0
...,...,...,...,...,...,...,...,...,...
1181,2022-10-31,4170.0,26.0,4196.0,8.2,3.6,10.2,12.1,0.0
1182,2023-10-31,4996.0,24.0,5020.0,0.4,0.0,3.5,6.6,0.0
1183,2021-12-31,3712.0,105.0,3817.0,2.7,0.0,4.9,7.1,1.0
1184,2022-12-31,4599.0,18.0,4617.0,4.1,22.5,7.4,10.7,0.0


In [6]:
df_model = df_model.sort_values(by='date')
df_model.reset_index(drop=True, inplace=True)
df_model

Unnamed: 0,date,total_taken_units,total_free_units,total_capacity_units,avg_min_temperature,avg_total_precipitation,avg_mean_temperature,avg_max_temperature,avg_snow_on_ground
0,2021-01-01,2988.0,118.0,3106.0,-1.0,6.8,0.7,2.5,0.0
1,2021-01-02,3072.0,110.0,3182.0,-9.6,0.0,-5.8,-2.0,4.0
2,2021-01-03,3104.0,115.0,3219.0,-9.6,0.0,-1.5,6.5,1.0
3,2021-01-04,2945.0,99.0,3044.0,-1.7,0.0,0.7,3.1,0.0
4,2021-01-05,2974.0,154.0,3128.0,2.0,0.0,7.4,12.7,0.0
...,...,...,...,...,...,...,...,...,...
1181,2024-11-02,5670.0,26.0,5696.0,1.9,0.0,2.9,3.9,0.0
1182,2024-11-03,5392.0,40.0,5432.0,-0.8,0.0,3.8,8.4,0.0
1183,2024-12-01,5674.0,21.0,5695.0,-0.1,11.8,1.9,3.9,0.0
1184,2024-12-02,5660.0,34.0,5694.0,-0.3,0.0,1.8,3.9,0.0
