In [1]:
# path setup
import sys
import os
module_path = os.path.abspath(os.path.join('../../'))
sys.path.insert(1, module_path + "/utils")

## db setup
# pip install sqlalchemy
from sqlalchemy import create_engine
from getpass import getpass 

# pandas setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from modeling import lag_columns, extract_date_features
from data_from_db import get_table_from_shelter
from eda import print_correlation_matrix, cramers_v


In [2]:
# Load the 'shelter_climate' table through the project helper; the cells
# below filter and aggregate this frame.
df = get_table_from_shelter('shelter_climate')

#### focus on Toronto

In [None]:
# Restrict the analysis to Toronto. `.copy()` turns the filtered result into an
# independent frame, so the column assignment below writes to it directly
# rather than to a slice of the loaded table (avoids SettingWithCopyWarning).
df = df[df['location_city'] == 'Toronto'].copy()
# Total capacity = occupied units + free units.
df['capacity_units'] = df['taken_units'] + df['free_units']
# Independent working copy used by the modelling section further down.
df_model = df.copy()

### Question/Use-Case: Give predictions per overnight service type on a monthly basis

#### start with warming shelter

In [None]:
# Keep only the rows whose overnight service type is 'Warming Centre'.
is_warming_centre = df['overnight_service_type'].eq('Warming Centre')
warming_shelter = df.loc[is_warming_centre]
warming_shelter

In [None]:
# Per-day aggregation rules: unit counts are summed over all rows of a day,
# weather readings are averaged.
unit_columns = ['taken_units', 'free_units', 'capacity_units']
weather_columns = [
    'min_temperature',
    'total_precipitation',
    'mean_temperature',
    'max_temperature',
    'snow_on_ground',
]
agg_functions = {
    **dict.fromkeys(unit_columns, 'sum'),
    **dict.fromkeys(weather_columns, 'mean'),
}

# One row per date, with 'date' restored as a regular column.
grouped_by_date = warming_shelter.groupby('date')
warming_daily = grouped_by_date.agg(agg_functions).reset_index()


In [None]:
# Derive calendar features from 'date' with the project helper.
# NOTE(review): the helper is defined outside this notebook; a later cell uses
# `.dt` on 'date', so it presumably returns 'date' as datetime — confirm.
warming_daily = extract_date_features(warming_daily, 'date')

In [None]:
warming_daily

In [None]:
# Collapse the daily rows into one row per calendar month:
# weather readings are averaged, unit counts are summed.
month_of_row = warming_daily['date'].dt.month
monthly_aggregations = {
    'min_temperature': 'mean',
    'total_precipitation': 'mean',
    'mean_temperature': 'mean',
    'max_temperature': 'mean',
    'snow_on_ground': 'mean',
    'taken_units': 'sum',
    'free_units': 'sum',
    'capacity_units': 'sum',
}
monthly_averages = (
    warming_daily
    .groupby([month_of_row])
    .agg(monthly_aggregations)
    .reset_index()
    # The group key inherits the name 'date'; it actually holds the month number.
    .rename(columns={'date': 'month'})
)
monthly_averages

In [None]:
monthly_averages.describe().T

In [None]:
# Pairwise correlations between all columns of the monthly table
# (DataFrame.corr defaults to the Pearson coefficient).
correlation_matrix = monthly_averages.corr()

In [None]:
# Annotated heatmap of the correlation matrix.
# seaborn (sns) and matplotlib (plt) are already imported in the first cell.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
display(correlation_matrix)

In [None]:
correlation_matrix['capacity_units'].abs().sort_values(ascending=False)

In [None]:
# Monthly minimum temperature against monthly capacity units.
temp = monthly_averages['min_temperature']
capacity_unit = monthly_averages['capacity_units']

fig, ax = plt.subplots()
ax.scatter(temp, capacity_unit, alpha=0.5)
ax.set_title('Scatter Plot of Min_Temp and Capacity Unit')
ax.set_xlabel('temp')
ax.set_ylabel('Capacity Unit')
plt.show()

In [None]:
# Keep the month, three weather features and the target.
selected_columns = [
    'month',
    'min_temperature',
    'total_precipitation',
    'snow_on_ground',
    'capacity_units',
]
monthly_averages = monthly_averages.loc[:, selected_columns]

In [None]:
# Recompute the correlations on the reduced table and rank the columns by
# absolute correlation with capacity_units.
correlation_matrix = monthly_averages.corr()
capacity_correlations = correlation_matrix['capacity_units']
capacity_correlations.abs().sort_values(ascending=False)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Baseline: linear regression of monthly capacity on three weather features.
weather_features = ['min_temperature', 'total_precipitation', 'snow_on_ground']
X = monthly_averages[weather_features]
y = monthly_averages['capacity_units']

# 80/20 split with a fixed seed so the result is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out rows.
mae = mean_absolute_error(y_test, y_pred)
score = model.score(X_test, y_test)

print("Mean Absolute Error:", mae)
print("R^2 Score:", score)

In [None]:
# Predicted against actual capacity on the held-out rows; points on the dashed
# diagonal are perfect predictions. matplotlib (plt) is imported in the first cell.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.5)
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'k--', lw=2)
ax.set_title('Actual vs Predicted Capacity Units')
ax.set_xlabel('Actual Capacity Units')
ax.set_ylabel('Predicted Capacity Units')
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Same baseline as before, with the month number added as a feature.
features_with_month = ['month', 'min_temperature', 'total_precipitation', 'snow_on_ground']
X = monthly_averages[features_with_month]
y = monthly_averages['capacity_units']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out rows.
mae = mean_absolute_error(y_test, y_pred)
score = model.score(X_test, y_test)

print("Mean Absolute Error:", mae)
print("R^2 Score:", score)

In [None]:
df.describe().T

> too little data in the set

#### no filtering by overnight_service_type

In [None]:
# Date-feature extraction on the full Toronto frame (no service-type filter).
# NOTE(review): whether the helper modifies `df` in place is not visible from
# this notebook — confirm before relying on `df` being unchanged.
by_month = extract_date_features(df, 'date')
by_month

In [None]:
# Correlations over the numeric columns only.
numeric_data = by_month.select_dtypes(include=['number'])
correlation_matrix = numeric_data.corr(method='pearson')

In [None]:
correlation_matrix['capacity_units'].abs().sort_values(ascending=False)

> **Realised that aggregating by calendar month always yields at most 12 rows, which is too few to train a model on.**

#### instead: try utilizing extracted months 
- build a model on the rows of a single extracted month -- keep the time window so short that the relationship is approximately linear

In [None]:
df_model

In [None]:
# df_model was copied from the Toronto-only frame, so the city column is
# constant and carries no information.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second run no longer raises KeyError because the
# column is already gone.
df_model = df_model.drop(columns='location_city', errors='ignore')
df_model

#### filter for specific month

- extract dates

In [None]:
# Add calendar features to the modelling frame with the project helper.
# NOTE(review): later cells read df_model['month'] and df_model['date'], so the
# helper presumably adds 'month' and keeps 'date' — confirm against its source.
df_model = extract_date_features(df_model, 'date')
df_model

#### add lagged data

In [None]:
# Numeric columns of the modelling frame, plus 'date' so each row can later be
# matched to its lagged values.
numerical = (
    df_model
    .select_dtypes(include=['number'])
    .assign(date=df_model['date'])
)
numerical

In [None]:
def add_lagged_columns(df, date_column, columns_to_lag, lag_column_name):
    """Build one-step lagged daily means for the given columns.

    Rows are grouped by calendar date and every column in ``columns_to_lag``
    is averaged per date. Each daily mean is then shifted down by one row, so
    a date carries the value of the previous date present in the data. The
    first date has no predecessor and is filled with its own daily mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows containing ``date_column`` and every column in
        ``columns_to_lag``. The frame is not modified.
    date_column : str
        Name of the column holding the dates (anything ``pd.to_datetime``
        can parse).
    columns_to_lag : list of str
        Columns to average per day and lag.
    lag_column_name : str
        Suffix for the new columns, which are named
        ``f"{column}_{lag_column_name}"`` (a suffix of ``'_1'`` therefore
        yields ``'<column>__1'``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct date in ascending date order: the lagged columns
        followed by ``'date_lag'``, which holds the ``datetime.date`` the row
        belongs to. Empty input yields an empty frame with the same columns.
    """
    columns_to_lag = list(columns_to_lag)

    # Parse the dates into a local Series so the caller's frame stays untouched.
    dates = pd.to_datetime(df[date_column])

    # groupby sorts its keys, so the daily means come out in date order.
    daily_means = df[columns_to_lag].groupby(dates.dt.date).mean()

    lagged = daily_means.shift(1)
    if not daily_means.empty:
        # Fill the gaps (notably the first row) with each column's first daily
        # mean. The result is assigned rather than filled in place: an in-place
        # fillna on a column selection acts on a temporary object and is a
        # no-op under pandas copy-on-write.
        lagged = lagged.fillna(daily_means.iloc[0])

    lagged_columns_df = lagged.reset_index(drop=True)
    lagged_columns_df.columns = [
        f"{column}_{lag_column_name}" for column in columns_to_lag
    ]
    lagged_columns_df['date_lag'] = daily_means.index.to_numpy()

    return lagged_columns_df

In [None]:
# Lag the daily means of these columns by one step. The '_1' suffix produces
# column names such as 'capacity_units__1'.
columns_to_lag = ['mean_temperature', 'snow_on_ground', 'capacity_units']
lagged_data = add_lagged_columns(
    df=numerical,
    date_column='date',
    columns_to_lag=columns_to_lag,
    lag_column_name='_1',
)

# Total number of missing values across the lagged frame.
missing_total = lagged_data.isna().sum().sum()
display(missing_total)
lagged_data

In [None]:
# Renumber the rows 0..n-1, discarding the labels inherited from the source frame.
numerical = numerical.reset_index(drop=True)
numerical

In [None]:
# Bring both join keys to datetime so they compare equal, then attach the
# lagged daily values to every row of the same date.
numerical['date'] = pd.to_datetime(numerical['date'])
lagged_data['date_lag'] = pd.to_datetime(lagged_data['date_lag'])

merged_df = numerical.merge(
    lagged_data,
    how='left',
    left_on='date',
    right_on='date_lag',
)
# The right-hand key duplicates 'date' after the join.
numerical = merged_df.drop(columns=['date_lag']).copy()

In [None]:
numerical

In [None]:
# Add calendar features to the numeric frame with the project helper.
# NOTE(review): the next cell filters on numerical['month'], so the helper
# presumably adds a 'month' column — confirm against its source.
numerical = extract_date_features(numerical, 'date')
numerical

In [None]:
# Restrict to the rows with month == 2 and renumber them from 0.
in_selected_month = numerical['month'] == 2
numerical = numerical.loc[in_selected_month].reset_index(drop=True)

In [None]:
numerical

- check the correlation matrix of the numerical features

In [None]:
print_correlation_matrix(numerical)

In [None]:
# Rank the columns by absolute correlation with the target.
# numeric_only=True restricts the computation to numeric columns: `numerical`
# may still carry the 'date' column added earlier, and since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns silently.
correlation_matrix = numerical.corr(numeric_only=True)
correlation_matrix['capacity_units'].abs().sort_values(ascending=False)

-- remove columns with low correlation

In [None]:
# Keep the target plus the current and lagged value of each selected feature.
model_columns = [
    'capacity_units',
    'capacity_units__1',
    'snow_on_ground',
    'snow_on_ground__1',
    'mean_temperature',
    'mean_temperature__1',
]
numerical = numerical[model_columns]
numerical.describe().transpose()

- check correlation categorical and numerical

In [None]:
# Object-dtype columns of the modelling frame, used as categorical features.
categorical = df_model.select_dtypes(object)
categorical

In [None]:
# Restrict the categorical columns to the rows with month == 2. `.copy()`
# turns the filtered result into an independent frame, so adding 'date' below
# writes to it directly rather than to a slice (avoids SettingWithCopyWarning).
categorical = categorical[df_model['month'] == 2].copy()
# Assignment aligns on the row labels, which still match df_model here.
categorical['date'] = df_model['date']
# Renumber from 0 only after the label-based assignment above.
categorical = categorical.reset_index(drop=True)
categorical

In [None]:
# Merge the three adult sectors into a single 'Adult' category.
adult_sectors = ['Women', 'Men', 'Mixed Adult']
categorical['sector'] = categorical['sector'].replace(to_replace=adult_sectors, value='Adult')

In [None]:
# Calculate Cramér's V for each categorical variable against the target variable
# NOTE(review): `cramers_v` is a project helper and capacity_units is numeric —
# confirm the helper is meaningful for a numeric second argument.
for col in categorical.columns[:-1]:  # Skip the last column (presumably 'date', assigned to this frame earlier); the target is not in this frame
    cramers_v_score = cramers_v(categorical[col], numerical['capacity_units'])
    print(f"Cramér's V for {col}: {cramers_v_score:.4f}")

- train-test-split

In [None]:
# Place the numeric and categorical columns side by side (rows matched by
# index label), then round the lagged capacity to one decimal.
X = pd.concat([numerical, categorical], axis='columns')
X['capacity_units__1'] = X['capacity_units__1'].round(decimals=1)
X

In [None]:
X.isna().sum().sum()


In [None]:
# Separate the target from the features, then hold out 20% of the rows.
target_column = 'capacity_units'
y = X[target_column]
X = X.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
## split each partition into its numeric and categorical columns
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X.select_dtypes(object).columns

X_train_numerical, X_test_numerical = X_train[numerical_columns], X_test[numerical_columns]
X_train_categorical, X_test_categorical = X_train[categorical_columns], X_test[categorical_columns]

- normalize categorical - one hot encode

In [None]:
### one hot encode
from sklearn.preprocessing import OneHotEncoder

# Make sure every column label is a string.
X_train_categorical.columns = [str(col) for col in X_train_categorical.columns]
X_test_categorical.columns = [str(col) for col in X_test_categorical.columns]

# drop='first' removes one level per feature to avoid perfectly collinear
# dummies. handle_unknown='ignore' lets transform() encode a category that
# occurs only in the test split as all zeros instead of raising, which a
# random split of rarely occurring categories can otherwise trigger.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_data = encoder.fit_transform(X_train_categorical)

encoded_feature_names = encoder.get_feature_names_out(X_train_categorical.columns)
X_train_cat_hot = pd.DataFrame(encoded_data, columns=encoded_feature_names)
display(X_train_cat_hot.head())

# encode test using the encoder fitted on train, so both share the same columns
encoded_test_data = encoder.transform(X_test_categorical)
X_test_cat_hot = pd.DataFrame(encoded_test_data, columns=encoded_feature_names)

display(X_test_cat_hot.head())

- scale numerical

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Make sure every column label is a string; each frame uses its own labels.
X_train_numerical.columns = [str(col) for col in X_train_numerical.columns]
X_test_numerical.columns = [str(col) for col in X_test_numerical.columns]

# Initialize MinMaxScaler
minmax_scaler = MinMaxScaler()

# Fit the scaler on the training rows only and scale them
X_train_numerical_scaled = minmax_scaler.fit_transform(X_train_numerical)

# Convert the scaled array back to a DataFrame (row labels become 0..n-1)
X_train_numerical_scaled_df = pd.DataFrame(X_train_numerical_scaled, columns=X_train_numerical.columns)

# Combine the scaled numerical data with the original categorical data.
# X_train_categorical still carries the shuffled row labels produced by
# train_test_split, while the scaled frame is labelled 0..n-1. concat(axis=1)
# aligns on labels, so the labels are reset first to pair the rows by position;
# without this the result has mismatched rows and NaN padding.
X_train_scaled = pd.concat(
    [X_train_numerical_scaled_df, X_train_categorical.reset_index(drop=True)],
    axis=1,
)


- train model