In [29]:
# path setup
import sys
import os
module_path = os.path.abspath(os.path.join('../../'))
sys.path.insert(1, module_path + "/utils")

## db setup
# pip install sqlalchemy
from sqlalchemy import create_engine
from getpass import getpass 

# pandas setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from modeling import lag_columns, extract_date_features
from data_from_db import get_table_from_shelter

In [None]:
df = get_table_from_shelter('shelter_climate')

In [None]:
df

### add time series data, like day, month, year, weekday

In [None]:
df.dtypes

In [None]:
df['date'] = pd.to_datetime(df['date'])
df_sorted = df.sort_values(by='date')

# Extract year, month, day, day of week, etc.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek + 1 # monday = 1

df

In [None]:
df.columns

In [None]:
df

### add time lag for weather and occupancy

In [None]:
# Define the list of weather-related columns for which you want to create lagged features
weather_columns = ['min_temperature', 'total_precipitation', 'mean_temperature', 
                   'max_temperature', 'snow_on_ground']

# Define the number of lagged days you want to consider
lagged_days = [1, 2, 3]  # You can adjust this as needed

# Generate lagged features for each weather-related column
for column in weather_columns:
    for lag in lagged_days:
        # Create a new column with the lagged values
        df[f'{column}_lag{lag}'] = df[column].shift(lag)

# Drop rows with NaN values resulting from the shift
df.dropna(inplace=True)

In [None]:
df

In [None]:
# Specify the lagged days
lagged_days = [1, 2, 3]  # Example lagged days

# Apply lagged features for each capacity-related column
capacity_columns = ['taken_units', 'free_units', 'capacity_rate', 'availability']
for lag in lagged_days:
    for column in capacity_columns:
        df[f'{column}_lag{lag}'] = df_sorted.groupby('location_city')[column].shift(lag)

# Drop rows with NaN values resulting from the shift
df.dropna(inplace=True)


In [None]:
df

### check function

In [30]:
df1 = get_table_from_shelter('shelter_climate')

In [31]:
lagged_columns = ['taken_units', 'free_units', 'capacity_rate', 'availability', 'min_temperature', 'total_precipitation', 'mean_temperature', 'max_temperature', 'snow_on_ground']
lagged_days = [1]#, 2, 3]
lag_columns_result = lag_columns(df1, lagged_columns, lagged_days)
lag_columns_result

Unnamed: 0,date,location_city,sector,overnight_service_type,capacity_type,taken_units,free_units,capacity_rate,availability,min_temperature,...,snow_on_ground,taken_units_lag_1,free_units_lag_1,capacity_rate_lag_1,availability_lag_1,min_temperature_lag_1,total_precipitation_lag_1,mean_temperature_lag_1,max_temperature_lag_1,snow_on_ground_lag_1
1,01.01.24,Toronto,Mixed Adult,Motel/Hotel Shelter,Room,57.0,0.0,1.000000,0.000000,-3.1,...,2.0,149.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
2,01.01.24,Toronto,Mixed Adult,Shelter,Bed,8.0,0.0,1.000000,0.000000,-3.1,...,2.0,57.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
3,01.01.24,Toronto,Families,Motel/Hotel Shelter,Room,67.0,0.0,1.000000,0.000000,-3.1,...,2.0,8.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
4,01.01.24,Toronto,Families,Motel/Hotel Shelter,Room,161.0,0.0,1.000000,0.000000,-3.1,...,2.0,67.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
5,01.01.24,Etobicoke,Mixed Adult,Motel/Hotel Shelter,Room,142.0,0.0,1.000000,0.000000,-3.1,...,2.0,161.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128344,31.12.23,Toronto,Youth,Shelter,Bed,31.0,0.0,1.000000,0.000000,-1.0,...,0.0,30.0,0.0,1.000000,0.000000,-1.0,2.3,0.3,1.6,0.0
128345,31.12.23,Toronto,Women,Shelter,Bed,27.0,1.0,0.964286,0.035714,-1.0,...,0.0,31.0,0.0,1.000000,0.000000,-1.0,2.3,0.3,1.6,0.0
128346,31.12.23,Toronto,Youth,Shelter,Bed,27.0,0.0,1.000000,0.000000,-1.0,...,0.0,27.0,1.0,0.964286,0.035714,-1.0,2.3,0.3,1.6,0.0
128347,31.12.23,Etobicoke,Youth,Shelter,Bed,33.0,0.0,1.000000,0.000000,-1.0,...,0.0,27.0,0.0,1.000000,0.000000,-1.0,2.3,0.3,1.6,0.0


In [32]:
df1

Unnamed: 0,date,location_city,sector,overnight_service_type,capacity_type,taken_units,free_units,capacity_rate,availability,min_temperature,...,snow_on_ground,taken_units_lag_1,free_units_lag_1,capacity_rate_lag_1,availability_lag_1,min_temperature_lag_1,total_precipitation_lag_1,mean_temperature_lag_1,max_temperature_lag_1,snow_on_ground_lag_1
1,01.01.24,Toronto,Mixed Adult,Motel/Hotel Shelter,Room,57.0,0.0,1.000000,0.000000,-3.1,...,2.0,149.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
2,01.01.24,Toronto,Mixed Adult,Shelter,Bed,8.0,0.0,1.000000,0.000000,-3.1,...,2.0,57.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
3,01.01.24,Toronto,Families,Motel/Hotel Shelter,Room,67.0,0.0,1.000000,0.000000,-3.1,...,2.0,8.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
4,01.01.24,Toronto,Families,Motel/Hotel Shelter,Room,161.0,0.0,1.000000,0.000000,-3.1,...,2.0,67.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
5,01.01.24,Etobicoke,Mixed Adult,Motel/Hotel Shelter,Room,142.0,0.0,1.000000,0.000000,-3.1,...,2.0,161.0,0.0,1.000000,0.000000,-3.1,0.1,-1.9,-0.7,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128344,31.12.23,Toronto,Youth,Shelter,Bed,31.0,0.0,1.000000,0.000000,-1.0,...,0.0,30.0,0.0,1.000000,0.000000,-1.0,2.3,0.3,1.6,0.0
128345,31.12.23,Toronto,Women,Shelter,Bed,27.0,1.0,0.964286,0.035714,-1.0,...,0.0,31.0,0.0,1.000000,0.000000,-1.0,2.3,0.3,1.6,0.0
128346,31.12.23,Toronto,Youth,Shelter,Bed,27.0,0.0,1.000000,0.000000,-1.0,...,0.0,27.0,1.0,0.964286,0.035714,-1.0,2.3,0.3,1.6,0.0
128347,31.12.23,Etobicoke,Youth,Shelter,Bed,33.0,0.0,1.000000,0.000000,-1.0,...,0.0,27.0,0.0,1.000000,0.000000,-1.0,2.3,0.3,1.6,0.0


In [33]:
df_date_test = df[['date']].copy()
df_date_test.reset_index(drop=True, inplace=True)
df_date_test

Unnamed: 0,date
0,2024-01-01
1,2024-01-01
2,2024-01-01
3,2024-01-01
4,2024-01-01
...,...
128326,2023-12-31
128327,2023-12-31
128328,2023-12-31
128329,2023-12-31


In [34]:
df_date_test = extract_date_features(df_date_test, 'date')
df_date_test

Unnamed: 0,date,year,month,day,day_of_week
0,2024-01-01,2024,1,1,1
1,2024-01-01,2024,1,1,1
2,2024-01-01,2024,1,1,1
3,2024-01-01,2024,1,1,1
4,2024-01-01,2024,1,1,1
...,...,...,...,...,...
128326,2023-12-31,2023,12,31,7
128327,2023-12-31,2023,12,31,7
128328,2023-12-31,2023,12,31,7
128329,2023-12-31,2023,12,31,7


In [35]:
ddf_date_testf = df_date_test.sort_values(by='date', inplace=True)
df_date_test

Unnamed: 0,date,year,month,day,day_of_week
9762,2021-01-01,2021,1,1,5
9736,2021-01-01,2021,1,1,5
9737,2021-01-01,2021,1,1,5
9738,2021-01-01,2021,1,1,5
9739,2021-01-01,2021,1,1,5
...,...,...,...,...,...
7621,2024-12-03,2024,12,3,2
7620,2024-12-03,2024,12,3,2
7619,2024-12-03,2024,12,3,2
7617,2024-12-03,2024,12,3,2
