# explore-1.ipynb

### CSc-59866 - Senior Design - Prof. Etemadpour

* Purpose: exploratory data analysis of shelter, COVID-19, and unemployment datasets from NYC Open Data and the NYS Department of Labor
* Date: 2020-11-27
* Authors: Xin Chen, Ian S. McBride, Lifu Tao

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import scipy.stats as stats

In [None]:
# NYC Open Data CSV endpoints.
# DHS Daily Report (shelter census):
# https://data.cityofnewyork.us/Social-Services/DHS-Daily-Report/k46n-sa2m
api_shelter = 'https://data.cityofnewyork.us/resource/k46n-sa2m.csv'
# COVID-19 daily counts of cases, hospitalizations, and deaths:
# https://data.cityofnewyork.us/Health/COVID-19-Daily-Counts-of-Cases-Hospitalizations-an/rc75-m7u3
api_covid = 'https://data.cityofnewyork.us/resource/rc75-m7u3.csv'

# Unfiltered downloads of both feeds, with every column they provide.
# NOTE(review): these endpoints may cap the number of rows returned when no
# `$limit` is given -- confirm the full date range arrives.
df_shelter = pd.read_csv(api_shelter)
df_covid = pd.read_csv(api_covid)

# NYS Department of Labor unemployment statistics:
# https://labor.ny.gov/stats/LSLAUS.shtm
# Direct download, disabled in favor of the local copy read below:
# api_labor = 'https://www.labor.ny.gov/stats/lausCSV.asp?PASS=1&geog=21093561'
# data_labor = requests.get(api_labor).text

# Read the previously downloaded labor file as raw text.
# NOTE(review): if the file is missing only a message is printed and
# `data_labor` stays undefined, so any later use of it will fail.
try:
    with open('./data/lausCSV.csv', 'r') as f:
        data_labor = f.read()
except IOError:
    print('Error: Labor data file not found')

### Section-1 - Shelter data

In [None]:
# List the column names available in the raw shelter download
display('shelter', df_shelter.columns)

In [None]:
# Source columns to load; the first one becomes the date index
columns_orig = [
    'date_of_census',
    'total_adults_in_shelter',
    'total_children_in_shelter',
    'total_individuals_in_shelter',
    'total_individuals_in_families_with_children_in_shelter_',
    'individuals_in_adult_families_in_shelter'
]
# Display names for the data columns, in the same order as columns_orig[1:]
columns_renamed = [
    'Shelter Adults',
    'Shelter Children',
    'Shelter Total',
    'Shelter Individuals In Families With Children',
    'Shelter Individuals In Adult Families',
]

# Create df_shelter
df_shelter = pd.read_csv(
    api_shelter,
    index_col='date_of_census',
    usecols=columns_orig,
)
display(df_shelter.columns)

# read_csv ignores the order of `usecols` and keeps the file's own column
# order, so rename by name (not by position) and then fix the column order.
df_shelter = df_shelter.rename(
    columns=dict(zip(columns_orig[1:], columns_renamed))
)
df_shelter = df_shelter[columns_renamed]

# Parse the dates first so that duplicates are detected on the date itself,
# then keep only the first row for any date that appears more than once.
df_shelter.index = pd.to_datetime(df_shelter.index)
df_shelter = df_shelter[~df_shelter.index.duplicated(keep='first')]
df_shelter.sort_index(inplace=True)
display(df_shelter)
# info() prints its report and returns None, so it is not wrapped in
# display() (which would render an extra "None")
df_shelter.info()

### Section-2 - COVID data

In [None]:
# List the column names available in the raw COVID download
display('covid', df_covid.columns)

In [None]:
# Create df_covid
# Source columns to load; the first one becomes the date index
columns_orig = [
    'date_of_interest',
    'case_count',
    'hospitalized_count',
    'death_count'
]
# Display names for the data columns, in the same order as columns_orig[1:]
columns_renamed = [
    'COVID Cases',
    'COVID Hospitalizations',
    'COVID Deaths',
]
df_covid = pd.read_csv(
    api_covid,
    index_col='date_of_interest',
    usecols=columns_orig
)
# read_csv ignores the order of `usecols` and keeps the file's own column
# order, so rename by name (not by position) and then fix the column order.
df_covid = df_covid.rename(
    columns=dict(zip(columns_orig[1:], columns_renamed))
)
df_covid = df_covid[columns_renamed]
df_covid.index = pd.to_datetime(df_covid.index)
df_covid.sort_index(inplace=True)
display(df_covid)
# info() prints its report and returns None, so it is not wrapped in
# display() (which would render an extra "None")
df_covid.info()

In [None]:
# Join the shelter and COVID dataframes side by side on their date indexes
df_shelter_covid = pd.concat([df_shelter, df_covid], axis=1, sort=False)

# Drop every date that has a missing value in any column, i.e. dates that
# are not covered by both datasets
df_shelter_covid.dropna(inplace=True)

# info() prints its report and returns None, so it is not wrapped in
# display() (which would render an extra "None")
df_shelter_covid.info()

### Section-3 - Labor data

In [None]:
def select_file_rows(lines):
    """Collect the group of lines around every line that starts with '2020'.

    For each matching line at position ``idx`` the slice
    ``lines[idx-2:idx+2]`` is taken: the two lines before it, the line
    itself, and the line after it. Per the original note these are the
    dataset title, the column names, and two year rows.

    Args:
        lines: List of text lines.

    Returns:
        List with one group (a list of lines) per matching line.
    """
    groups = []
    for idx, line in enumerate(lines):
        if line.startswith('2020'):
            groups.append(lines[idx - 2:idx + 2])
    return groups

def split_rows_into_list_by_year(rows):
    """Turn each group of lines into ``{title: {year: [values]}}``.

    The first line of a group, stripped and title-cased, is the variable
    name. The lines from the third one onward are visited in reverse order
    (old to new, per the original note) and split on commas. The first field
    is used as the key (the year) and the last two fields are discarded
    (annual average and trailing empty column, per the original note).

    Args:
        rows: List of line groups, each a list of strings.

    Returns:
        Dict mapping variable name to a dict of year -> list of value strings.
    """
    variables = {}
    for variable in rows:
        variable_name = variable[0].strip().title()
        values_by_year = {}
        # Skip the title and column-name lines; walk the rest back to front
        for raw_row in reversed(variable[2:]):
            fields = raw_row.strip().split(',')
            values_by_year[fields[0]] = fields[1:-2]
        variables[variable_name] = values_by_year
    return variables

def concat_years_into_one_list(variables):
    """Flatten each variable's per-year lists into one list.

    The years are concatenated in the iteration order of the inner dict.

    Args:
        variables: Dict mapping variable name to a dict of year -> list.

    Returns:
        Dict mapping variable name to a single flat list of values.
    """
    flattened = {}
    for variable_name, values_by_year in variables.items():
        all_values = []
        for year_values in values_by_year.values():
            all_values.extend(year_values)
        flattened[variable_name] = all_values
    return flattened

# Create dataframe from dict of variables
def _percent_to_fraction(value):
    """Convert a percent string such as '4.3%' to a fraction (0.043).

    Anything that is not a non-empty string becomes NaN.
    """
    if isinstance(value, str) and value:
        return float(value.strip('%')) / 100
    return np.nan


def create_df(variables, start='2019-01-01', end='2020-12-31'):
    """Build the monthly labor dataframe from per-variable value lists.

    Args:
        variables: Dict mapping column name to a list holding one value per
            month between ``start`` and ``end``. It must contain the columns
            'Unemployment Rate' (percent strings such as '4.3%'),
            'Labor Force', 'Employed' and 'Unemployed'.
        start: First date of the monthly index.
        end: Last date of the monthly index.

    Returns:
        DataFrame indexed by month-end dates. Months with an empty value in
        any column are removed, 'Unemployment Rate' is a float fraction, and
        the three count columns are ints.
    """
    df = pd.DataFrame(
        variables,
        # Month-end offset object rather than the lowercase 'm' alias,
        # which newer pandas releases deprecate
        index=pd.date_range(start=start, end=end, freq=pd.offsets.MonthEnd()),
    )
    # Treat empty strings as missing and drop every month that has one
    df = df.replace('', np.nan).dropna()

    # Fix one column's datatype: '4.3%' -> 0.043
    df['Unemployment Rate'] = df['Unemployment Rate'].map(_percent_to_fraction)

    # Fix type of remaining columns
    types = {
        'Unemployment Rate': float,
        'Labor Force': int,
        'Employed': int,
        'Unemployed': int,
    }
    df = df.astype(types)

    return df

# Parse the raw labor text from the inside out: split it into lines, pick
# each variable's rows, split them by year, flatten the years into one list
# per variable, and build the dataframe
df_labor = create_df(
    concat_years_into_one_list(
        split_rows_into_list_by_year(
            select_file_rows(
                data_labor.split('\n')))))

# create_df has already cast the columns to numeric dtypes; info() prints
# its report and returns None, so it is not wrapped in display() (which
# would render an extra "None")
df_labor.info()
display(df_labor)

### Section-4 - Raw data plots

In [None]:
fig = plt.figure(figsize=(16, 16))

# One entry per panel of the 2x2 grid:
# (grid position, title, source dataframe, columns to draw)
raw_panels = [
    # Shelter
    (1, 'Shelter population', df_shelter_covid,
     ['Shelter Adults', 'Shelter Children', 'Shelter Total']),
    # COVID
    (2, 'COVID-19 incidents', df_shelter_covid,
     ['COVID Cases', 'COVID Hospitalizations', 'COVID Deaths']),
    # Labor, restricted to rows from 2020-02-29 onward
    (3, 'Labor statistics', df_labor.loc['2020-02-29':],
     ['Unemployment Rate']),
]
for position, title, frame, columns in raw_panels:
    ax = fig.add_subplot(2, 2, position)
    ax.set_title(title)
    _ = frame.plot(ax=ax, y=columns)

# Make sure the output directory exists before saving the figure
os.makedirs('./figures', exist_ok=True)
fig.savefig('./figures/raw_plots.png')

### Section-5 - Correlation plots

In [None]:
# Reference scale for reading a Pearson correlation coefficient, from:
# https://www.researchgate.net/figure/Meaning-of-Pearson-correlation-coefficient-value-r_tbl1_299402589
correlation_values = [-1, -0.7, -0.5, -0.3, 0, 0.3, 0.5, 0.7, 1]
meaning = [
    'Perfect Negative Correlation',
    'Strong Negative Correlation',
    'Moderate Negative Correlation',
    'Weak Negative Correlation',
    'No Linear Relationship',
    'Weak Positive Correlation',
    'Moderate Positive Correlation',
    'Strong Positive Correlation',
    'Perfect Positive Correlation',
]
# One row per coefficient value, paired with its description
correlation_values_df = pd.DataFrame({
    'Correlation': correlation_values,
    'Meaning': meaning,
})

In [None]:
def evaluate_linear_relationship(a, b):
    """Fit the line ``b = slope * a + intercept`` and evaluate it on ``a``.

    The slope is the covariance of ``a`` and ``b`` (computed with
    ``bias=True``) divided by the variance of ``a``; the intercept makes the
    line pass through the point of means.

    Args:
        a: Values of the explanatory variable.
        b: Values of the response variable.

    Returns:
        Tuple ``(slope, intercept, predictions, residuals)`` where
        ``predictions`` is the fitted line evaluated at ``a`` and
        ``residuals`` is ``b - predictions``.
    """
    covariance_ab = np.cov(a, b, bias=True)[0][1]
    slope = covariance_ab / np.var(a)
    intercept = np.mean(b) - (slope * np.mean(a))
    predictions = (slope * a) + intercept
    residuals = b - predictions
    return slope, intercept, predictions, residuals

In [None]:
fig = plt.figure(figsize=(16,16))

# One scatter panel per COVID measure, each against the total shelter
# population, with the Pearson correlation printed and the fitted line drawn
shelter_total = df_shelter_covid['Shelter Total']
covid_columns = ['COVID Cases', 'COVID Hospitalizations', 'COVID Deaths']
for position, covid_column in enumerate(covid_columns, start=1):
    covid_series = df_shelter_covid[covid_column]

    pearson_r = stats.pearsonr(shelter_total, covid_series)[0]
    print('Shelter Population vs. ' + covid_column + ' Correlation:', np.round(pearson_r, 4))

    slope, intercept, predictions, residuals = evaluate_linear_relationship(shelter_total, covid_series)

    ax = fig.add_subplot(2, 2, position)
    _ = ax.scatter(shelter_total, covid_series)
    _ = ax.plot(shelter_total, predictions)
    _ = ax.set_xlabel('Shelter Total')
    _ = ax.set_ylabel(covid_column)
    _ = ax.set_title('Shelter Population vs. ' + covid_column)

# Make sure the output directory exists before saving the figure
os.makedirs('./figures', exist_ok=True)
fig.savefig('./figures/correlation_plots.png')