In [1]:
import sys
import os
module_path = os.path.abspath(os.path.join('../../'))
sys.path.insert(1, module_path + "/utils")

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re 
pd.options.mode.chained_assignment = None 
from data_processing import check_concatenation, preprocess_climate_data, preprocess_climate_data, df_equality

pd.set_option('display.max_columns', None)

#### data retrieval

In [3]:
# save all url's
url_2021 = '../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-2021.csv'
url_2022 = '../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-2022.csv'
url_2023 = '../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-2023.csv'
url_2024 = '../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-Q1:2024.csv'

# read into df
df_2021 = pd.read_csv(url_2021, index_col=False)
df_2022 = pd.read_csv(url_2022, index_col=False)
df_2023 = pd.read_csv(url_2023, index_col=False)
df_2024 = pd.read_csv(url_2024, index_col=False)

# Concatenate all DataFrames into one
shelter_df = pd.concat([df_2021, df_2022, df_2023, df_2024], ignore_index=True)
shelter_df.rename(columns=lambda x: x.strip().replace(" ", "_").lower(), inplace=True)
shelter_df.rename(columns={'occupancy_date': 'date'}, inplace=True)
display(shelter_df['date'].dtype)

shelter_df.columns

dtype('O')

Index(['_id', 'date', 'organization_id', 'organization_name', 'shelter_id',
       'shelter_group', 'location_id', 'location_name', 'location_address',
       'location_postal_code', 'location_city', 'location_province',
       'program_id', 'program_name', 'sector', 'program_model',
       'overnight_service_type', 'program_area', 'service_user_count',
       'capacity_type', 'capacity_actual_bed', 'capacity_funding_bed',
       'occupied_beds', 'unoccupied_beds', 'unavailable_beds',
       'capacity_actual_room', 'capacity_funding_room', 'occupied_rooms',
       'unoccupied_rooms', 'unavailable_rooms', 'occupancy_rate_beds',
       'occupancy_rate_rooms'],
      dtype='object')

In [4]:
if check_concatenation([df_2021, df_2022, df_2023, df_2024], shelter_df):
    print("Concatenation successful!")
else:
    print("Concatenation failed!")

Concatenation successful!


In [5]:
shelter_df.duplicated().sum()

0

#### fix dates, same format as climate.csv

> - we have date like: 21-01-10 and for 2023 like: 2023-01-01T00:00:00

In [6]:
pattern1 = '(\d{2})-(\d{2})-(\d{2})'
pattern2 = '(\d{4})-(\d{2})-(\d{2})'
shelter_df['date'] = shelter_df['date'].astype(str)
# Extract year, month, and day components using regex
shelter_df[['year', 'month', 'day']] = shelter_df['date'].str.extract(pattern1).fillna(shelter_df['date'].str.extract(pattern2))
# Convert extracted components to integers
shelter_df['date'] = shelter_df['day'].astype(str) + '.' + shelter_df['month'].astype(str) + '.' + shelter_df['year'].astype(str)
shelter_df.drop(columns=['year', 'month', 'day'], inplace=True)
shelter_df

Unnamed: 0,_id,date,organization_id,organization_name,shelter_id,shelter_group,location_id,location_name,location_address,location_postal_code,location_city,location_province,program_id,program_name,sector,program_model,overnight_service_type,program_area,service_user_count,capacity_type,capacity_actual_bed,capacity_funding_bed,occupied_beds,unoccupied_beds,unavailable_beds,capacity_actual_room,capacity_funding_room,occupied_rooms,unoccupied_rooms,unavailable_rooms,occupancy_rate_beds,occupancy_rate_rooms
0,1,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,15371,COSTI North York West Hotel - Family Program,Families,Emergency,Motel/Hotel Shelter,COVID-19 Response,74,Room Based Capacity,,,,,,29.0,58.0,26.0,3.0,29.0,,89.66
1,2,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16211,COSTI North York West Hotel - Seniors Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Room Based Capacity,,,,,,3.0,0.0,3.0,0.0,0.0,,100.00
2,3,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16192,COSTI North York West Hotel Program - Men,Men,Emergency,Motel/Hotel Shelter,COVID-19 Response,24,Room Based Capacity,,,,,,28.0,0.0,23.0,5.0,0.0,,82.14
3,4,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16191,COSTI North York West Hotel Program - Mixed Adult,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,25,Room Based Capacity,,,,,,17.0,0.0,17.0,0.0,0.0,,100.00
4,5,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16193,COSTI North York West Hotel Program - Women,Women,Emergency,Motel/Hotel Shelter,COVID-19 Response,13,Room Based Capacity,,,,,,14.0,0.0,13.0,1.0,0.0,,92.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161038,12272,31.03.24,17,YWCA Toronto,78,YWCA-348 Davenport,1129.0,YWCA Davenport Shelter,348 Davenport Road,M5R 1K6,Toronto,ON,14671,YWCA Davenport - Youth,Youth,Emergency,Shelter,Base Shelter and Overnight Services System,27,Bed Based Capacity,27.0,27.0,27.0,0.0,0.0,,,,,,100.0,
161039,12273,31.03.24,31,Youth Without Shelter,52,Youth Without Shelter,1064.0,Youth Without Shelter,6 Warrendale Ct,M9V 1P9,Etobicoke,ON,12292,Youth without Shelter Emergency Shelter Program,Youth,Emergency,Shelter,Base Shelter and Overnight Services System,33,Bed Based Capacity,33.0,33.0,33.0,0.0,0.0,,,,,,100.0,
161040,12274,31.03.24,31,Youth Without Shelter,52,Youth Without Shelter,1064.0,Youth Without Shelter,6 Warrendale Ct,M9V 1P9,Etobicoke,ON,12291,Youth without Shelter Stay In School Program,Youth,Transitional,Shelter,Base Shelter and Overnight Services System,20,Bed Based Capacity,20.0,20.0,20.0,0.0,0.0,,,,,,100.0,
161041,12275,31.03.24,38,YouthLink,81,YouthLink Shelter,1147.0,YouthLink,747 Warden Ave,M1L 4A1,Scarborough,ON,14891,YouthLink Emergency Program,Youth,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,


#### filtering data before removing columns

##### --> program_type
> there is emergency and transitional shelters, we're looking at emergency (the one's that fill up daily)

In [7]:
display(shelter_df['program_model'].unique())
display(shelter_df['program_model'].value_counts())

array(['Emergency', 'Transitional', nan], dtype=object)

program_model
Emergency       131341
Transitional     29700
Name: count, dtype: int64

In [8]:
# retrieve only emergency shelters
shelter_df = shelter_df[shelter_df['program_model'] == 'Emergency']

In [9]:
display(shelter_df['overnight_service_type'].unique())
display(shelter_df['overnight_service_type'].value_counts())

array(['Motel/Hotel Shelter', 'Shelter', 'Interim Housing',
       'Isolation/Recovery Site', '24-Hour Respite Site',
       'Warming Centre', "24-Hour Women's Drop-in",
       'Alternative Space Protocol', 'Top Bunk Contingency Space'],
      dtype=object)

overnight_service_type
Shelter                       73109
Motel/Hotel Shelter           42250
24-Hour Respite Site           9864
24-Hour Women's Drop-in        2070
Isolation/Recovery Site        1790
Warming Centre                 1056
Interim Housing                 720
Alternative Space Protocol      388
Top Bunk Contingency Space       94
Name: count, dtype: int64

> remove rows were value in drop_values --> these are not explicitly emergency shelter, daily basis

In [10]:
drop_values = ['Interim Housing', 'Alternative Space Protocol', 'Top Bunk Contingency Space', 'Isolation/Recovery Site']
shelter_df = shelter_df[~shelter_df['overnight_service_type'].isin(drop_values)]

In [11]:
display(shelter_df['overnight_service_type'].unique())
display(shelter_df['overnight_service_type'].value_counts())

array(['Motel/Hotel Shelter', 'Shelter', '24-Hour Respite Site',
       'Warming Centre', "24-Hour Women's Drop-in"], dtype=object)

overnight_service_type
Shelter                    73109
Motel/Hotel Shelter        42250
24-Hour Respite Site        9864
24-Hour Women's Drop-in     2070
Warming Centre              1056
Name: count, dtype: int64

##### check columns that are empty/nan + basic structure
> - cross checked with the meaning of the columns in [Data Processing](../../data/Data_Processing.md)  

In [12]:
shelter_df.isna().sum()

_id                           0
date                          0
organization_id               0
organization_name             0
shelter_id                    0
shelter_group               216
location_id                 360
location_name              1129
location_address           3012
location_postal_code       3012
location_city              3046
location_province          3046
program_id                    0
program_name                 33
sector                        0
program_model                 0
overnight_service_type        0
program_area                  0
service_user_count            0
capacity_type                 0
capacity_actual_bed       49135
capacity_funding_bed      49135
occupied_beds             49135
unoccupied_beds           49135
unavailable_beds          49135
capacity_actual_room      79214
capacity_funding_room     79253
occupied_rooms            79214
unoccupied_rooms          79214
unavailable_rooms         79253
occupancy_rate_beds       49135
occupanc

#### look into location_id -- we need that value: missing values location# 4836

In [13]:
shelter_df[shelter_df['location_city'].isna()].head(10)

Unnamed: 0,_id,date,organization_id,organization_name,shelter_id,shelter_group,location_id,location_name,location_address,location_postal_code,location_city,location_province,program_id,program_name,sector,program_model,overnight_service_type,program_area,service_user_count,capacity_type,capacity_actual_bed,capacity_funding_bed,occupied_beds,unoccupied_beds,unavailable_beds,capacity_actual_room,capacity_funding_room,occupied_rooms,unoccupied_rooms,unavailable_rooms,occupancy_rate_beds,occupancy_rate_rooms
43,44,01.01.21,1,City of Toronto,3,Seaton House,,,,,,,15751,Seaton House Hostel Response Program,Men,Emergency,Shelter,COVID-19 Response,55,Bed Based Capacity,55.0,55.0,55.0,0.0,0.0,,,,,,100.0,
69,70,01.01.21,26,Fred Victor Centre,42,Fred Victor Women's Hostel,,,,,,,12256,Fred Victor Women's Hostel Program,Women,Emergency,Shelter,Base Shelter and Overnight Services System,25,Bed Based Capacity,25.0,25.0,25.0,0.0,0.0,,,,,,100.0,
128,129,01.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
184,185,02.01.21,1,City of Toronto,3,Seaton House,,,,,,,15751,Seaton House Hostel Response Program,Men,Emergency,Shelter,COVID-19 Response,54,Bed Based Capacity,55.0,55.0,54.0,1.0,0.0,,,,,,98.18,
210,211,02.01.21,26,Fred Victor Centre,42,Fred Victor Women's Hostel,,,,,,,12256,Fred Victor Women's Hostel Program,Women,Emergency,Shelter,Base Shelter and Overnight Services System,25,Bed Based Capacity,25.0,25.0,25.0,0.0,0.0,,,,,,100.0,
269,270,02.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
325,326,03.01.21,1,City of Toronto,3,Seaton House,,,,,,,15751,Seaton House Hostel Response Program,Men,Emergency,Shelter,COVID-19 Response,54,Bed Based Capacity,55.0,55.0,54.0,1.0,0.0,,,,,,98.18,
351,352,03.01.21,26,Fred Victor Centre,42,Fred Victor Women's Hostel,,,,,,,12256,Fred Victor Women's Hostel Program,Women,Emergency,Shelter,Base Shelter and Overnight Services System,25,Bed Based Capacity,25.0,25.0,25.0,0.0,0.0,,,,,,100.0,
410,411,03.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
466,467,04.01.21,1,City of Toronto,3,Seaton House,,,,,,,15751,Seaton House Hostel Response Program,Men,Emergency,Shelter,COVID-19 Response,52,Bed Based Capacity,55.0,55.0,52.0,3.0,0.0,,,,,,94.55,


In [14]:
# check if we can derive the information from shelter_id
shelter_df[shelter_df['shelter_id'] == 82]

Unnamed: 0,_id,date,organization_id,organization_name,shelter_id,shelter_group,location_id,location_name,location_address,location_postal_code,location_city,location_province,program_id,program_name,sector,program_model,overnight_service_type,program_area,service_user_count,capacity_type,capacity_actual_bed,capacity_funding_bed,occupied_beds,unoccupied_beds,unavailable_beds,capacity_actual_room,capacity_funding_room,occupied_rooms,unoccupied_rooms,unavailable_rooms,occupancy_rate_beds,occupancy_rate_rooms
26910,26911,09.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,2,Bed Based Capacity,88.0,88.0,2.0,86.0,0.0,,,,,,2.27,
27051,27052,10.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,2,Bed Based Capacity,88.0,88.0,2.0,86.0,0.0,,,,,,2.27,
27191,27192,11.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Bed Based Capacity,88.0,88.0,3.0,85.0,0.0,,,,,,3.41,
27330,27331,12.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Bed Based Capacity,88.0,88.0,3.0,85.0,0.0,,,,,,3.41,
27470,27471,13.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Bed Based Capacity,88.0,88.0,3.0,85.0,0.0,,,,,,3.41,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160663,11897,29.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,169,Room Based Capacity,,,,,,158.0,134.0,157.0,1.0,0.0,,99.37
160798,12032,30.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1561.0,885 Scarborough Golf Club Road,885 Scarborough Golf Club Road,M1G 1J6,Toronto,ON,17931,SSHA - Warming Centre - Scarborough Golf Club ...,Mixed Adult,Emergency,Warming Centre,Winter Programs,45,Bed Based Capacity,48.0,48.0,45.0,3.0,0.0,,,,,,93.75,
160799,12033,30.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,170,Room Based Capacity,,,,,,158.0,134.0,158.0,0.0,0.0,,100.00
160934,12168,31.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1561.0,885 Scarborough Golf Club Road,885 Scarborough Golf Club Road,M1G 1J6,Toronto,ON,17931,SSHA - Warming Centre - Scarborough Golf Club ...,Mixed Adult,Emergency,Warming Centre,Winter Programs,43,Bed Based Capacity,48.0,48.0,43.0,5.0,0.0,,,,,,89.58,


##### derive data from other entries that have everything filled

In [15]:
# Group by 'shelter_id' and get the first non-null value of 'location_city'
shelter_id_mapping = shelter_df.groupby('shelter_id')['location_city'].first()

# Fill missing values in 'location_city' based on 'shelter_id'
shelter_df['location_city'] = shelter_df.apply(lambda row: shelter_id_mapping.get(row['shelter_id'], row['location_city']), axis=1)

# Display the updated DataFrame
shelter_df[shelter_df['shelter_id'] == 82]

Unnamed: 0,_id,date,organization_id,organization_name,shelter_id,shelter_group,location_id,location_name,location_address,location_postal_code,location_city,location_province,program_id,program_name,sector,program_model,overnight_service_type,program_area,service_user_count,capacity_type,capacity_actual_bed,capacity_funding_bed,occupied_beds,unoccupied_beds,unavailable_beds,capacity_actual_room,capacity_funding_room,occupied_rooms,unoccupied_rooms,unavailable_rooms,occupancy_rate_beds,occupancy_rate_rooms
26910,26911,09.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,Toronto,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,2,Bed Based Capacity,88.0,88.0,2.0,86.0,0.0,,,,,,2.27,
27051,27052,10.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,Toronto,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,2,Bed Based Capacity,88.0,88.0,2.0,86.0,0.0,,,,,,2.27,
27191,27192,11.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,Toronto,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Bed Based Capacity,88.0,88.0,3.0,85.0,0.0,,,,,,3.41,
27330,27331,12.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,Toronto,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Bed Based Capacity,88.0,88.0,3.0,85.0,0.0,,,,,,3.41,
27470,27471,13.07.21,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,Toronto,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Bed Based Capacity,88.0,88.0,3.0,85.0,0.0,,,,,,3.41,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160663,11897,29.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,Toronto,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,169,Room Based Capacity,,,,,,158.0,134.0,157.0,1.0,0.0,,99.37
160798,12032,30.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1561.0,885 Scarborough Golf Club Road,885 Scarborough Golf Club Road,M1G 1J6,Toronto,ON,17931,SSHA - Warming Centre - Scarborough Golf Club ...,Mixed Adult,Emergency,Warming Centre,Winter Programs,45,Bed Based Capacity,48.0,48.0,45.0,3.0,0.0,,,,,,93.75,
160799,12033,30.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1163.0,SSHA Etobicoke Hotel Program,,,Toronto,,16551,SSHA Etobicoke Hotel Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,170,Room Based Capacity,,,,,,158.0,134.0,158.0,0.0,0.0,,100.00
160934,12168,31.03.24,1,City of Toronto,82,SSHA Etobicoke Hotel Program,1561.0,885 Scarborough Golf Club Road,885 Scarborough Golf Club Road,M1G 1J6,Toronto,ON,17931,SSHA - Warming Centre - Scarborough Golf Club ...,Mixed Adult,Emergency,Warming Centre,Winter Programs,43,Bed Based Capacity,48.0,48.0,43.0,5.0,0.0,,,,,,89.58,


In [16]:
shelter_df['location_city'].isna().sum()

1185

##### still 1185 

In [17]:
shelter_df[shelter_df['location_city'].isna()].head(10)

Unnamed: 0,_id,date,organization_id,organization_name,shelter_id,shelter_group,location_id,location_name,location_address,location_postal_code,location_city,location_province,program_id,program_name,sector,program_model,overnight_service_type,program_area,service_user_count,capacity_type,capacity_actual_bed,capacity_funding_bed,occupied_beds,unoccupied_beds,unavailable_beds,capacity_actual_room,capacity_funding_room,occupied_rooms,unoccupied_rooms,unavailable_rooms,occupancy_rate_beds,occupancy_rate_rooms
128,129,01.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
269,270,02.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
410,411,03.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
551,552,04.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
693,694,05.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
835,836,06.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
977,978,07.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
1118,1119,08.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
1259,1260,09.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,
1400,1401,10.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,


In [18]:
display(shelter_df[shelter_df['shelter_id'] == 27].shape)

(1185, 32)

In [19]:
shelter_df[(shelter_df['shelter_id'] == 27) & 
            ((shelter_df['location_address'].notnull()) | 
             (shelter_df['location_postal_code'].notnull()) | 
             (shelter_df['location_city'].notnull()) | 
             (shelter_df['location_province'].notnull()))].sum().sum()

0.0

> they are all from shelter_id # 27 --> same place  
> so I looked up the shelter - it's in Toronto

In [20]:
display(shelter_df[shelter_df['shelter_id'] == 27].head(1))

Unnamed: 0,_id,date,organization_id,organization_name,shelter_id,shelter_group,location_id,location_name,location_address,location_postal_code,location_city,location_province,program_id,program_name,sector,program_model,overnight_service_type,program_area,service_user_count,capacity_type,capacity_actual_bed,capacity_funding_bed,occupied_beds,unoccupied_beds,unavailable_beds,capacity_actual_room,capacity_funding_room,occupied_rooms,unoccupied_rooms,unavailable_rooms,occupancy_rate_beds,occupancy_rate_rooms
128,129,01.01.21,16,Women's Hostels Inc.,27,Nellie's,1035.0,Nellie's Women's Shelter,,,,,12051,Nellie's Women's Shelter,Women,Emergency,Shelter,Base Shelter and Overnight Services System,10,Bed Based Capacity,10.0,10.0,10.0,0.0,0.0,,,,,,100.0,


In [21]:
shelter_df.loc[shelter_df['shelter_id'] == 27, 'location_city'] = 'Toronto'

In [22]:
shelter_df['location_city'].isna().sum()

0

##### drop columns that are irrelevant 
> - cross checked with the meaning of the columns in [Data Processing](../../data/Data_Processing.md)   

In [23]:
shelter_df.head(10)

Unnamed: 0,_id,date,organization_id,organization_name,shelter_id,shelter_group,location_id,location_name,location_address,location_postal_code,location_city,location_province,program_id,program_name,sector,program_model,overnight_service_type,program_area,service_user_count,capacity_type,capacity_actual_bed,capacity_funding_bed,occupied_beds,unoccupied_beds,unavailable_beds,capacity_actual_room,capacity_funding_room,occupied_rooms,unoccupied_rooms,unavailable_rooms,occupancy_rate_beds,occupancy_rate_rooms
0,1,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,15371,COSTI North York West Hotel - Family Program,Families,Emergency,Motel/Hotel Shelter,COVID-19 Response,74,Room Based Capacity,,,,,,29.0,58.0,26.0,3.0,29.0,,89.66
1,2,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16211,COSTI North York West Hotel - Seniors Program,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,Room Based Capacity,,,,,,3.0,0.0,3.0,0.0,0.0,,100.0
2,3,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16192,COSTI North York West Hotel Program - Men,Men,Emergency,Motel/Hotel Shelter,COVID-19 Response,24,Room Based Capacity,,,,,,28.0,0.0,23.0,5.0,0.0,,82.14
3,4,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16191,COSTI North York West Hotel Program - Mixed Adult,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,25,Room Based Capacity,,,,,,17.0,0.0,17.0,0.0,0.0,,100.0
4,5,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1103.0,COSTI/City North York West Hotel Program,1677 Wilson Ave,M3L 1A5,North York,ON,16193,COSTI North York West Hotel Program - Women,Women,Emergency,Motel/Hotel Shelter,COVID-19 Response,13,Room Based Capacity,,,,,,14.0,0.0,13.0,1.0,0.0,,92.86
5,6,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1051.0,COSTI Reception Centre,100 Lippincott St,M5S 2P1,North York,ON,12251,COSTI Reception Centre CITY Program,Mixed Adult,Emergency,Shelter,Base Shelter and Overnight Services System,6,Bed Based Capacity,8.0,8.0,6.0,2.0,0.0,,,,,,75.0,
6,7,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1114.0,COSTI Uptown Hotel Program,55 Hallcrown Pl,M2J 4R1,North York,ON,15372,COSTI Uptown Hotel COVID-19 - Family Program,Families,Emergency,Motel/Hotel Shelter,COVID-19 Response,120,Room Based Capacity,,,,,,46.0,25.0,44.0,2.0,0.0,,95.65
7,8,01.01.21,24,COSTI Immigrant Services,40,COSTI Reception Centre,1114.0,COSTI Uptown Hotel Program,55 Hallcrown Pl,M2J 4R1,North York,ON,13751,COSTI Uptown Hotel Family Program,Families,Emergency,Motel/Hotel Shelter,Temporary Refugee Response,130,Room Based Capacity,,,,,,46.0,100.0,44.0,2.0,54.0,,95.65
8,9,01.01.21,14,Christie Ossington Neighbourhood Centre,22,Christie Ossington Men's Hostel,1160.0,CONC Etobicoke Hotel Program,445 Rexdale Blvd,M9W 6P8,Etobicoke,ON,16111,CONC Etobicoke Hotel Program - Mixed Adult,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,153,Room Based Capacity,,,,,,145.0,0.0,145.0,0.0,0.0,,100.0
9,10,01.01.21,14,Christie Ossington Neighbourhood Centre,22,Christie Ossington Men's Hostel,1172.0,CONC West End Hotel Program,14 Roncesvalles Ave,M6R 2K3,Etobicoke,ON,15711,CONC Men's Hotel Program,Men,Emergency,Motel/Hotel Shelter,COVID-19 Response,40,Room Based Capacity,,,,,,41.0,0.0,40.0,1.0,0.0,,97.56


In [24]:
drop_columns = ['_id', 'organization_id', 'organization_name', 'shelter_id',
       'shelter_group', 'location_id', 'location_name', 'location_address',
       'location_postal_code', 'location_province',
       'program_id', 'program_name', 'program_model',
       'program_area', 'service_user_count', 'capacity_actual_bed', 'capacity_funding_bed',
       'capacity_actual_room', 'capacity_funding_room','unavailable_beds', 'unavailable_rooms']

In [25]:
shelter_df.drop(columns=drop_columns, inplace=True)

In [26]:
shelter_df

Unnamed: 0,date,location_city,sector,overnight_service_type,capacity_type,occupied_beds,unoccupied_beds,occupied_rooms,unoccupied_rooms,occupancy_rate_beds,occupancy_rate_rooms
0,01.01.21,North York,Families,Motel/Hotel Shelter,Room Based Capacity,,,26.0,3.0,,89.66
1,01.01.21,North York,Mixed Adult,Motel/Hotel Shelter,Room Based Capacity,,,3.0,0.0,,100.00
2,01.01.21,North York,Men,Motel/Hotel Shelter,Room Based Capacity,,,23.0,5.0,,82.14
3,01.01.21,North York,Mixed Adult,Motel/Hotel Shelter,Room Based Capacity,,,17.0,0.0,,100.00
4,01.01.21,North York,Women,Motel/Hotel Shelter,Room Based Capacity,,,13.0,1.0,,92.86
...,...,...,...,...,...,...,...,...,...,...,...
161036,31.03.24,Toronto,Youth,Shelter,Bed Based Capacity,31.0,0.0,,,100.0,
161037,31.03.24,Toronto,Women,Shelter,Bed Based Capacity,28.0,0.0,,,100.0,
161038,31.03.24,Toronto,Youth,Shelter,Bed Based Capacity,27.0,0.0,,,100.0,
161039,31.03.24,Etobicoke,Youth,Shelter,Bed Based Capacity,33.0,0.0,,,100.0,


> dropping occupancy_rate_beds and occupancy_rate_rooms as well, as we can recalculate them and I need to group by other factors frist

In [27]:
shelter_df.drop(columns=['occupancy_rate_beds', 'occupancy_rate_rooms'], inplace=True)

#### clean capacity_type

In [28]:
shelter_df['capacity_type'] = shelter_df['capacity_type'].map(lambda x: x.split(' ')[0] if isinstance(x, str) else x)

#### turn room and beds into general unit

In [29]:
shelter_df['taken_units'] = shelter_df.apply(lambda row: row['occupied_rooms'] if row['capacity_type'] == 'Room' else row['occupied_beds'], axis=1)
shelter_df['free_units'] = shelter_df.apply(lambda row: row['unoccupied_rooms'] if row['capacity_type'] == 'Room' else row['unoccupied_beds'], axis=1)
shelter_df.drop(columns=['occupied_beds', 'unoccupied_beds', 'occupied_rooms', 'unoccupied_rooms'], inplace=True)

#### get the capacity rate (this is my target for the model)

In [30]:
shelter_df['capacity_rate'] = shelter_df['taken_units'] / (shelter_df['taken_units'] + shelter_df['free_units'])

#### get the availability rate, not for model but for eda

In [31]:
shelter_df['availability'] = shelter_df['free_units'] / (shelter_df['taken_units'] + shelter_df['free_units'])

#### check

In [32]:
display(shelter_df.head(10))
display(shelter_df.shape)

Unnamed: 0,date,location_city,sector,overnight_service_type,capacity_type,taken_units,free_units,capacity_rate,availability
0,01.01.21,North York,Families,Motel/Hotel Shelter,Room,26.0,3.0,0.896552,0.103448
1,01.01.21,North York,Mixed Adult,Motel/Hotel Shelter,Room,3.0,0.0,1.0,0.0
2,01.01.21,North York,Men,Motel/Hotel Shelter,Room,23.0,5.0,0.821429,0.178571
3,01.01.21,North York,Mixed Adult,Motel/Hotel Shelter,Room,17.0,0.0,1.0,0.0
4,01.01.21,North York,Women,Motel/Hotel Shelter,Room,13.0,1.0,0.928571,0.071429
5,01.01.21,North York,Mixed Adult,Shelter,Bed,6.0,2.0,0.75,0.25
6,01.01.21,North York,Families,Motel/Hotel Shelter,Room,44.0,2.0,0.956522,0.043478
7,01.01.21,North York,Families,Motel/Hotel Shelter,Room,44.0,2.0,0.956522,0.043478
8,01.01.21,Etobicoke,Mixed Adult,Motel/Hotel Shelter,Room,145.0,0.0,1.0,0.0
9,01.01.21,Etobicoke,Men,Motel/Hotel Shelter,Room,40.0,1.0,0.97561,0.02439


(128349, 9)

In [33]:
shelter_df.isna().sum().sum()

0

#### validate function

In [34]:
test_df = preprocess_climate_data(['../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-2021.csv',
        '../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-2022.csv',
        '../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-2023.csv',
        '../../data/raw/shelter/daily-shelter-overnight-service-occupancy-capacity-Q1:2024.csv'])
df_equality(test_df, shelter_df)

True

#### write to csv

In [36]:
test_df.to_csv('../../data/processed/shelter.csv', index=False)