# explore-3.ipynb

### CSc-59866 - Senior Design - Prof. Etemadpour

* Purpose: exploratory data analysis of the shelter demographics dataset from NYC Open Data and the NYS Department of Labor
* Date: 2020-12-18
* Authors: Xin Chen, Ian S. McBride, Lifu Tao

In [None]:
import json
import numpy as np
import os
import pandas as pd
import pickle
from sodapy import Socrata
from urllib.request import urlopen

### Access dataset

In [None]:
# Access dataset via sodapy with an app token.
# The token can be supplied through the SOCRATA_APP_TOKEN environment variable
# so it does not have to live in the notebook; the original literal is kept as
# the fallback so existing runs behave the same.
client = Socrata(
    'data.cityofnewyork.us',
    os.environ.get('SOCRATA_APP_TOKEN', 'o37N4aJqM70C9bwiqcfTNFIRB'),
)

# Monthly shelter demographics data (city-wide)
# From: https://data.cityofnewyork.us/Social-Services/DHS-Data-Dashboard/5e9h-x6ak
# Rows are requested in ascending report_date order: later cells slice this
# index by label and then re-label the rows positionally with a date range,
# so the row order must be deterministic.
results = client.get('5e9h-x6ak', limit=2000, order='report_date')
df_original = pd.DataFrame.from_records(results, index='report_date')

### Display original data

In [None]:
# Check row count (should be 66)
# info() prints its summary itself and returns None, so it is called directly
# rather than wrapped in display(), which would render a stray "None".
df_original.info()
display(df_original)

### Clean shelter data

In [None]:
# Flat list of readable column names for the shelter data.
# The names are applied positionally when the cleaned DataFrame is built
# (passed as `columns=` together with the raw values), so the order of this
# list must match the order of the fields in the downloaded data.
columns = [
    # Average daily counts
    'avg-daily-family-with-children-adults',
    'avg-daily-family-with-children-children',
    'avg-daily-family-with-children',
    'avg-daily-adult-family',
    'avg-daily-single-adult-sex-male',
    'avg-daily-single-adult-sex-female',
    'avg-daily-single-adult',
    # Individuals by age bracket, families with children
    'individuals-family-with-children-age-0-to-5',
    'individuals-family-with-children-age-6-to-13',
    'individuals-family-with-children-age-14-to-17',
    'individuals-family-with-children-age-18-to-20',
    'individuals-family-with-children-age-21-to-29',
    'individuals-family-with-children-age-30-to-44',
    'individuals-family-with-children-age-45-to-64',
    'individuals-family-with-children-age-65-and-above',
    'individuals-family-with-children',
    # Individuals by age bracket, adult families
    'individuals-adult-family-age-18-to-20',
    'individuals-adult-family-age-21-to-29',
    'individuals-adult-family-age-30-to-44',
    'individuals-adult-family-age-45-to-64',
    'individuals-adult-family-age-65-and-above',
    'individuals-adult-family',
    # Individuals by age bracket, single adults
    'individuals-single-adult-age-18-to-29',
    'individuals-single-adult-age-30-to-44',
    'individuals-single-adult-age-45-to-64',
    'individuals-single-adult-age-65-and-above',
    'individuals-single-adult',
    # Head of household by race, families with children
    'head-of-household-family-with-children-race-asian-pacific-islander',
    'head-of-household-family-with-children-race-black-non-hispanic',
    'head-of-household-family-with-children-race-hispanic',
    'head-of-household-family-with-children-race-native-american',
    'head-of-household-family-with-children-race-white-non-hispanic',
    'head-of-household-family-with-children-race-unknown',
    'head-of-household-family-with-children',
    # Head of household by race, adult families
    'head-of-household-adult-family-race-asian-pacific-islander',
    'head-of-household-adult-family-race-black-non-hispanic',
    'head-of-household-adult-family-race-hispanic',
    'head-of-household-adult-family-race-native-american',
    'head-of-household-adult-family-race-white-non-hispanic',
    'head-of-household-adult-family-race-unknown',
    'head-of-household-adult-family',
    # Head of household by race, single adults
    'head-of-household-single-adult-race-asian-pacific-islander',
    'head-of-household-single-adult-race-black-non-hispanic',
    'head-of-household-single-adult-race-hispanic',
    'head-of-household-single-adult-race-native-american',
    'head-of-household-single-adult-race-white-non-hispanic',
    'head-of-household-single-adult-race-unknown',
    'head-of-household-single-adult',
    # School, outreach and placement figures
    'individuals-family-with-children-age-school-age',
    'school-attendance-percent',
    'percent-families-with-school-placement',
    'home-stat-clients-placed-into-housing',
    'homebase-enrollments',
    'placed-outside-shelter-case-family-with-children',
    'placed-outside-shelter-case-adult-family',
    'placed-outside-shelter-case-single-adult',
    'placed-in-supportive-housing-single-adult',
]
# The same column names as in `columns`, grouped under a category key.
# Each value lists the member columns of that category; single-column
# categories map to a one-element list. The groups appear in the same order
# as the flat list, and the check that follows compares the two counts.
columns_nested = {
    'avg-daily-family-with-children': [
        'avg-daily-family-with-children-adults',
        'avg-daily-family-with-children-children',
        'avg-daily-family-with-children',
    ],
    'avg-daily-adult-family': [
        'avg-daily-adult-family',
    ],
    'avg-daily-single-adult': [
        'avg-daily-single-adult-sex-male',
        'avg-daily-single-adult-sex-female',
        'avg-daily-single-adult',
    ],
    'individuals-family-with-children-age': [
        'individuals-family-with-children-age-0-to-5',
        'individuals-family-with-children-age-6-to-13',
        'individuals-family-with-children-age-14-to-17',
        'individuals-family-with-children-age-18-to-20',
        'individuals-family-with-children-age-21-to-29',
        'individuals-family-with-children-age-30-to-44',
        'individuals-family-with-children-age-45-to-64',
        'individuals-family-with-children-age-65-and-above',
        'individuals-family-with-children',
    ],
    'individuals-adult-family-age': [
        'individuals-adult-family-age-18-to-20',
        'individuals-adult-family-age-21-to-29',
        'individuals-adult-family-age-30-to-44',
        'individuals-adult-family-age-45-to-64',
        'individuals-adult-family-age-65-and-above',
        'individuals-adult-family',
    ],
    'individuals-single-adult-age': [
        'individuals-single-adult-age-18-to-29',
        'individuals-single-adult-age-30-to-44',
        'individuals-single-adult-age-45-to-64',
        'individuals-single-adult-age-65-and-above',
        'individuals-single-adult',
    ],
    'head-of-household-family-with-children-race' : [
        'head-of-household-family-with-children-race-asian-pacific-islander',
        'head-of-household-family-with-children-race-black-non-hispanic',
        'head-of-household-family-with-children-race-hispanic',
        'head-of-household-family-with-children-race-native-american',
        'head-of-household-family-with-children-race-white-non-hispanic',
        'head-of-household-family-with-children-race-unknown',
        'head-of-household-family-with-children',
    ],
    'head-of-household-adult-family-race': [
        'head-of-household-adult-family-race-asian-pacific-islander',
        'head-of-household-adult-family-race-black-non-hispanic',
        'head-of-household-adult-family-race-hispanic',
        'head-of-household-adult-family-race-native-american',
        'head-of-household-adult-family-race-white-non-hispanic',
        'head-of-household-adult-family-race-unknown',
        'head-of-household-adult-family',
    ],
    'head-of-household-single-adult-race': [
        'head-of-household-single-adult-race-asian-pacific-islander',
        'head-of-household-single-adult-race-black-non-hispanic',
        'head-of-household-single-adult-race-hispanic',
        'head-of-household-single-adult-race-native-american',
        'head-of-household-single-adult-race-white-non-hispanic',
        'head-of-household-single-adult-race-unknown',
        'head-of-household-single-adult',
    ],
    'individuals-family-with-children-age-school-age': [
        'individuals-family-with-children-age-school-age',
    ],
    'school-attendance-percent': [
        'school-attendance-percent',
    ],
    'percent-families-with-school-placement': [
        'percent-families-with-school-placement',
    ],
    'home-stat-clients-placed-into-housing': [
        'home-stat-clients-placed-into-housing',
    ],
    'homebase-enrollments': [
        'homebase-enrollments',
    ],
    'placed-outside-shelter-case': [
        'placed-outside-shelter-case-family-with-children',
        'placed-outside-shelter-case-adult-family',
        'placed-outside-shelter-case-single-adult',
    ],
    'placed-in-supportive-housing-single-adult': [
        'placed-in-supportive-housing-single-adult',
    ],
}
# Sanity check: the flat list and the grouped mapping should contain the same
# number of column names, so the two numbers shown below should be equal.
print('Column count for columns and column_nested should match')
len(columns), sum(len(group) for group in columns_nested.values())

In [None]:
# Select rows from January 2019 onward (originally: years 2019 and 2020)
data_selection = df_original.loc['2019-01-01':].values

# Fix index format, column names
# The rows are re-labelled positionally with month-end dates. The number of
# periods follows the selection instead of a hard-coded 24, so the constructor
# does not fail on a length mismatch once the dataset holds more months. An
# explicit MonthEnd offset replaces the deprecated 'm' frequency alias.
df = pd.DataFrame(
    data_selection,
    index=pd.date_range(
        start='2019-01-31',
        periods=len(data_selection),
        freq=pd.offsets.MonthEnd(),
    ),
    columns=columns,
)

# Helpers for cleaning
def clean_school_attendance(v):
    """Normalize one raw school-attendance value.

    Args:
        v: Raw cell value, normally a string such as '85.5%', '0.855' or a
            marker containing 'NA'. Missing values (None/NaN/pd.NA) and
            plain numbers are accepted as well.

    Returns:
        pd.NA when the value is missing or contains 'NA'; the number divided
        by 100 when the value contains '%'; otherwise the value converted to
        float unchanged.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    if not isinstance(v, str):
        # Non-text input has no string methods; treat missing as NA and
        # pass real numbers through as floats.
        return pd.NA if pd.isna(v) else float(v)
    text = v.strip()
    if 'NA' in text:
        return pd.NA
    if '%' in text:
        # Remove the percent sign wherever it sits instead of assuming it is
        # the final character.
        return float(text.replace('%', '')) / 100
    # NOTE(review): values without '%' are not rescaled - confirm they are
    # already expressed as fractions.
    return float(text)


def clean_home_stat(v):
    """Remove thousands separators (commas) from a raw count string.

    Non-string values are returned unchanged.
    """
    return v.replace(',', '') if isinstance(v, str) else v

# Apply helpers
df['school-attendance-percent'] = df['school-attendance-percent'].apply(clean_school_attendance)
df['home-stat-clients-placed-into-housing'] = df['home-stat-clients-placed-into-housing'].apply(clean_home_stat)

# Convert datatype
# Every column defaults to int; the two percentage columns are overridden.
# Attendance stays object because the cleaner can return pd.NA next to floats.
column_dtype = dict.fromkeys(columns, 'int')
column_dtype['school-attendance-percent'] = object
column_dtype['percent-families-with-school-placement'] = float
df = df.astype(column_dtype)
display(df)
# info() prints its own report and returns None, so it is not wrapped in
# display(), which would render a stray "None".
df.info()

### Store datasets locally

In [None]:
# Write the cleaned frame to ./data, creating the directory when it is missing.
pickle_path = './data/shelter_demographics.pickle'
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
df.to_pickle(pickle_path)