# main.ipynb

### CSc-59867 - Senior Design - Prof. Etemadpour

* Purpose: exploratory data analysis on the NYC census demographics dataset from NYC Open Data
* Date: 2021-02-26
* Authors: Xin Chen, Ian S. McBride, Lifu Tao

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import time

In [None]:
# Load the NYC census demographics CSV into a DataFrame
# * Source of the data:
#     * https://data.cityofnewyork.us/City-Government/Census-Demographics-at-the-NYC-Community-District-/5unr-w4sc
# * The CSV is a reorganized copy because the original spreadsheet wasn't formatted well
# * Discussion of variable meanings
#     * https://www.2020census.gov/en/conducting-the-count/gq.html
df = (
    pd.read_csv(
        'data/demographics/nyc_demographics_2010_by_cd.csv',
        index_col='variable_edit',
    )
    .drop(columns=['variable_orig'])  # only the 'variable_edit' labels (the index) are kept
    .transpose()                      # the CSV's columns become rows, its variables become columns
    .astype(float)
    .sort_index()
)
display('Transposed, sorted dataframe')
display(df)

In [None]:
# Overview of the frame's columns; df.info() prints its summary rather than returning it
display('Columns')
df.info()

In [None]:
display('Correlation matrix of features')
# Pairwise correlation between the columns of df; left as the last expression
# so the notebook renders the resulting matrix as the cell output
df.corr()

In [None]:
# Flat list of every column name in df (not referenced by the cells visible in this notebook section)
columns = df.columns.to_list()
# Column names organized by topic: group name -> list of df column names.
# get_x() uses this mapping to drop a whole group of columns via `exclude_group`.
# NOTE(review): the strings below must match the CSV's 'variable_edit' labels exactly,
# otherwise df.drop(columns=...) will raise — confirm against the data file.
columns_nested = {
    'total': ['total-population'],
    'race': [
         'race-white-nonhispanic',
         'race-black-nonhispanic',
         'race-asian-and-pacific-islander-nonhispanic',
         'race-other-nonhispanic',
         'race-two-or-more-races-nonhispanic',
         'race-hispanic-origin',
    ],
    'sex': [
         'sex-female',
         'sex-male',
    ],
    'age': [
         'age-under-5-years',
         'age-5-to-9-years',
         'age-10-to-14-years',
         'age-15-to-19-years',
         'age-20-to-24-years',
         'age-25-to-44-years',
         'age-45-to-64-years',
         'age-65-years-and-over',
         'age-nonminors',
    ],
    'persons-living-in': [
         'persons-living-in-households',
         'persons-living-in-group-quarters',
    ],
    'persons-living-in-family-households': [
         'persons-living-in-family-households',
         'persons-living-in-family-households-householder',
         'persons-living-in-family-households-spouse',
         'persons-living-in-family-households-own-child-under-18-years',
         'persons-living-in-family-households-other-relatives',
         'persons-living-in-family-households-nonrelatives',
    ],
    # NOTE(review): the group key is plural ('-households') while its members use the
    # singular '-household' — presumably mirrors the CSV labels; TODO confirm.
    'persons-living-in-nonfamily-households': [
         'persons-living-in-nonfamily-household',
         'persons-living-in-nonfamily-household-householder',
         'persons-living-in-nonfamily-household-senior-alone',
         'persons-living-in-nonfamily-household-nonrelatives',
    ],
    'households': [
         'households',
         'households-family-households',
         'households-nonfamily-households',
         'households-with-seniors',
    ],
    'family-households': [
         'family-households-married-couple-family',
         'family-households-married-couple-with-minor-child',
         'family-households-female-householder-no-husband-present',
         'family-households-female-householder-with-minor-child',
         'family-households-male-householder-no-wife-present',
         'family-households-male-householder-with-minor-child',
    ],
    'persons-per': [
         'persons-per-family',
         'persons-per-household',
    ],
    # NOTE(review): this group mixes the prefixes 'housing-units-' and 'housing-unit-size-';
    # presumably intentional and matching the CSV labels — TODO confirm.
    'housing-units': [
         'housing-units',
         'housing-units-occupied',
         'housing-units-occupied-renter',
         'housing-units-occupied-owner',
         'housing-unit-size-1-person',
         'housing-unit-size-2-person',
         'housing-unit-size-3-person',
         'housing-unit-size-4-person',
         'housing-unit-size-5-persons-and-over',
         'housing-units-by-age-of-householder-15-to-24-years',
         'housing-units-by-age-of-householder-25-to-44-years',
         'housing-units-by-age-of-householder-45-to-64-years',
         'housing-units-by-age-of-householder-65-years-and-over',
    ],
}

In [None]:
# Helper for displaying CD numbers
# Maps the first digit of a three-character CD code to its borough name.
cd_boro_dict = {
    1: 'Manhattan',
    2: 'Bronx',
    3: 'Brooklyn',
    4: 'Queens',
    5: 'Staten Island',
}


def cd_to_name(cd):
    """Turn a community-district code into a readable label.

    The code is converted with ``str()`` and must be exactly three characters
    long: the first character selects the borough through ``cd_boro_dict`` and
    the remaining two are appended unchanged, e.g. ``101`` -> ``'Manhattan CD-01'``.

    Args:
        cd: The district code, as an int or a string (anything ``str()`` accepts).

    Returns:
        A string of the form ``'<borough> CD-<last two characters>'``.

    Raises:
        ValueError: If the code is not three characters long, or if its first
            character is not a digit listed in ``cd_boro_dict``.
    """
    cd_str = str(cd)
    if len(cd_str) != 3:
        raise ValueError('incorrect length of CD')

    prefix, cd_num = cd_str[0], cd_str[1:]
    try:
        boro = cd_boro_dict.get(int(prefix))
    except ValueError:
        # A non-numeric prefix (e.g. 'x01', or the '-' of -12) is reported as a
        # bad borough prefix instead of leaking int()'s unrelated parse error.
        boro = None
    if not boro:
        raise ValueError('incorrect borough prefix in CD')
    return f'{boro} CD-{cd_num}'

In [None]:
def get_x(df, exclude_group=None):
    """Return the standardized value matrix of ``df``.

    Args:
        df: DataFrame whose values are scaled.
        exclude_group: Optional key of ``columns_nested``. When it names a
            non-empty group, that group's columns are dropped before scaling;
            a missing, falsy or unknown key leaves every column in place.

    Returns:
        The array produced by ``StandardScaler().fit_transform`` on the
        selected values.
    """
    # Columns to leave out, or None when nothing should be excluded
    group_columns = columns_nested.get(exclude_group) if exclude_group else None
    selected = df.drop(columns=group_columns) if group_columns else df

    return StandardScaler().fit_transform(selected.values)

print('Scale data')
# No exclude_group is passed, so every column of df is scaled and kept in X
X = get_x(df)
print('Shape:', X.shape)
# Bare last expression: the notebook renders the scaled array as the cell output
X

In [None]:
# Fit a PCA on the scaled matrix X and report how much variance it captures.
# perf_counter() is used for the timing because it is a monotonic clock intended
# for measuring intervals; time.time() follows the wall clock and can jump.
time_start = time.perf_counter()

n_components = 5  # number of principal components kept; also read by the plotting cells
pca = PCA(n_components=n_components)
pca_result = pca.fit_transform(X)  # X projected onto the fitted components

print('PCA done! Time elapsed: {} seconds'.format(time.perf_counter() - time_start))

print('Cumulative explained variance ratio')
display(np.cumsum(pca.explained_variance_ratio_))

print('Shape of components:', pca.components_.shape)

In [None]:
# Line plot of the explained variance ratio of each principal component
exp_var_pca = pca.explained_variance_ratio_
x = range(1, len(exp_var_pca) + 1)  # 1-based component numbers for the x axis

fig, ax = plt.subplots(figsize=(8, 8))
_ = ax.plot(x, exp_var_pca, marker='o', alpha=0.5)
_ = ax.set(
    xlabel='PC',
    ylabel='Explained variance ratio',
    xticks=x,
    title='PCA Results',
)

In [None]:
# Inspired by the tutorial:
# https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21

# Heat map of pca.components_: one row per principal component, one column per feature.
# The pyplot calls below act on the figure created by matshow, so their order matters.
ticks = list(range(n_components))
labels = [f'PC-{i}' for i in range(1, n_components + 1)]
_ = plt.matshow(pca.components_, cmap='coolwarm')
_ = plt.yticks(ticks, labels, fontsize=10)
_ = plt.colorbar()
# Labelling the x axis with df.columns assumes the PCA was fitted on all columns of df
# (i.e. X came from get_x(df) without an exclude_group)
_ = plt.xticks(range(len(df.columns)), df.columns, rotation=65, ha='left')
_ = plt.title('Mixture of features in each PC')

In [None]:
# component_count = 5
# combined_exp_var = round(sum(pca.explained_variance_ratio_[:component_count]), ndigits=3)
# print(f'Combined explained variances for first {component_count} components: {combined_exp_var}')

In [None]:
# Scatter matrix of the PCA projection, one axis per retained principal component
component_indicies = range(n_components)  # (sic) spelling kept so other references keep working
component_names = [f'PC-{i + 1}' for i in component_indicies]
# Build the frame in one call: column i of pca_result becomes 'PC-{i + 1}'
pca_df = pd.DataFrame(pca_result[:, :n_components], columns=component_names)

_ = pd.plotting.scatter_matrix(pca_df, figsize=(12, 12))
_ = plt.suptitle(f'Scatter matrix with {n_components} principal components')