# explore-2.ipynb

### CSc-59866 - Senior Design - Prof. Etemadpour

* Purpose: exploratory data analysis of the shelter, COVID-19, and unemployment datasets from NYC Open Data and the NYS Department of Labor
* Date: 2020-12-18
* Authors: Xin Chen, Ian S. McBride, Lifu Tao

In [None]:
import json
import os
from urllib.request import urlopen

import numpy as np
import pandas as pd
from sodapy import Socrata

In [None]:
# Access dataset via API call (unauthenticated, rate-limited)
# with urlopen('https://data.cityofnewyork.us/api/geospatial/yfnk-k7r4?method=export&format=GeoJSON') as response:
#     geojson = json.load(response)

# Access dataset via manually downloaded file
# with open('./data/Community Districts.geojson', 'r') as f:
#     geojson = json.load(f)

# Access dataset via sodapy with a token.
# The app token is read from the SOCRATA_APP_TOKEN environment variable so the
# secret is never stored in the notebook. If the variable is unset, None is
# passed as the token (sodapy then makes unauthenticated, throttled requests).
# NOTE(review): a token was previously committed in this cell and should be
# rotated.
client = Socrata(
    'data.cityofnewyork.us',
    os.environ.get('SOCRATA_APP_TOKEN'),
)

# Geodata for boro-cds
# From: https://data.cityofnewyork.us/City-Government/Community-Districts/yfnk-k7r4
# NOTE(review): the id requested below ('jp9i-3b7y') differs from the id in the
# URL above ('yfnk-k7r4') -- confirm both refer to the same dataset.
geojson = client.get('jp9i-3b7y', limit=2000, content_type='geojson')


# Monthly shelter data by boro-cd
# From: https://data.cityofnewyork.us/Social-Services/Individual-Census-by-Borough-Community-District-an/veav-vj3r
# At most 2000 rows are requested; the records are indexed by their report date.
results = client.get('veav-vj3r', limit=2000)
shelter_df = pd.DataFrame.from_records(results, index='report_date')

## Geo data

In [None]:
# Pull the boro_cd property out of every GeoJSON feature, keeping feature order
boro_cds = [
    feature['properties']['boro_cd']
    for feature in geojson['features']
]

In [None]:
# Create dataframe for external plot with fake values for each boro-cd.
# A seeded generator is created inside the cell so the placeholder values are
# identical on every run (and on every re-run of this cell).
rng = np.random.default_rng(0)

# One uniform value in [0, 1) per boro-cd, rounded to one decimal place
vals = np.round(rng.random(len(boro_cds)), decimals=1)
temp_df = pd.DataFrame({
    'boro_cd': boro_cds,
    'val': vals,
})
display(temp_df)

## Shelter data

In [None]:
# Check row count (should be >1.4K)
# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
shelter_df.info()
display(shelter_df)
display(shelter_df.columns)

In [None]:
# Clean the raw shelter records: give the columns display names, convert the
# index to monthly periods, replace NaNs, cast the count columns to int, and
# add a per-row total.

# Field names as they appear in the raw records; 'report_date' becomes the index
columns_orig = [
    'report_date',
    'borough',
    'community_districts',
    'census_type',
    'adult_family_shelter',
    'adult_shelter',
    'family_cluster',
    'family_with_children_comm',
    'family_with_chidren_shelter',
    'adult_shelter_comm_hotel',
    'adult_family_comm_hotel',
]
# Display names for the non-index fields, in the same order as columns_orig[1:]
columns_new = [
    'Borough',
    'Community Districts',
    'Census Type',
    'Adult Family Shelter',
    'Adult Shelter',
    'Family Cluster',
    'Family With Children Comm',
    'Family With Chidren Shelter',
    'Adult Shelter Comm Hotel',
    'Adult Family Comm Hotel',
]
# Count columns: everything after the three descriptive columns
count_columns = columns_new[3:]

# Rebuild the frame from the raw records so that re-running this cell always
# starts from the same untouched input
shelter_df = pd.DataFrame.from_records(results, index='report_date')

# Fix columns: rename by field name rather than by position, so a label can
# never be attached to the wrong column if the frame's column order differs
# from the order of columns_new.
# NOTE(review): assumes columns_orig matches the field names returned by the
# API -- confirm against the columns displayed for the raw frame.
rename_map = dict(zip(columns_orig[1:], columns_new))
unexpected = [col for col in shelter_df.columns if col not in rename_map]
if unexpected:
    raise ValueError(f'Unexpected columns in shelter data: {unexpected}')
# reindex puts the columns in the expected order and adds any that are absent
# from the records as NaN (replaced with 0 below)
shelter_df = shelter_df.rename(columns=rename_map).reindex(columns=columns_new)

# Fix index: report dates become monthly periods
shelter_df.index = pd.to_datetime(shelter_df.index).to_period('M')

# Replace NaNs
shelter_df = shelter_df.fillna(0)

# Fix column types
shelter_df = shelter_df.astype({col: int for col in count_columns})

# Add total column: sum of all count columns per row
shelter_df['Total'] = shelter_df[count_columns].sum(axis=1)

display(shelter_df.head())
# info() prints its own summary and returns None, so it is not wrapped in display()
shelter_df.info()

In [None]:
# Combine borough, community districts columns into boro_cd
# TODO finish this
# df[['Borough', 'Community Districts']].apply(lambda b: b, axis=1)