# general_demographics-pca.ipynb

### CSc-59867 - Senior Design - Prof. Etemadpour

* Purpose: exploratory data analysis on the NYC census demographics dataset from NYC Open Data
* Date started: 2021-02-26
* Authors: Xin Chen, Ian S. McBride, Lifu Tao

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import time

### Access dataset

In [None]:
# Load the reorganized census demographics CSV into a DataFrame,
# using the 'variable_edit' column as the index.
# * Source of the data:
#     * https://data.cityofnewyork.us/City-Government/Census-Demographics-at-the-NYC-Community-District-/5unr-w4sc
# * The CSV is a reorganized copy, because the original spreadsheet wasn't formatted well
# * Background on what the variables mean:
#     * https://www.2020census.gov/en/conducting-the-count/gq.html
df = pd.read_csv(
    'data/demographics/nyc_demographics_2010_by_cd.csv',
    index_col='variable_edit',
)

### Display data

In [None]:
# Reshape the raw table in one expression: drop the 'variable_orig' column,
# transpose so the former columns become rows, coerce every value to float,
# and order the rows by index. Building a new frame (instead of dropping
# in place) means a failure part-way through leaves the loaded df untouched.
df = (
    df.drop(columns=['variable_orig'])
    .transpose()
    .astype(float)
    .sort_index()
)
display('Transposed, sorted dataframe')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
df.info()
display(df.head())

### Create helpers

In [None]:
# Flat list of every column label currently in df
columns = list(df.columns)
# Feature groups: maps a group name to the list of column labels that belong
# to it, so a whole group can be addressed (e.g. excluded) by one key.
columns_nested = {
    # Overall head count
    'total': ['total-population'],
    # Race / ethnicity breakdown
    'race': [
        'race-white-nonhispanic',
        'race-black-nonhispanic',
        'race-asian-and-pacific-islander-nonhispanic',
        'race-other-nonhispanic',
        'race-two-or-more-races-nonhispanic',
        'race-hispanic-origin',
    ],
    'sex': ['sex-female', 'sex-male'],
    # Age brackets
    'age': [
        'age-under-5-years',
        'age-5-to-9-years',
        'age-10-to-14-years',
        'age-15-to-19-years',
        'age-20-to-24-years',
        'age-25-to-44-years',
        'age-45-to-64-years',
        'age-65-years-and-over',
        'age-nonminors',
    ],
    # Households vs. group quarters
    'persons-living-in': [
        'persons-living-in-households',
        'persons-living-in-group-quarters',
    ],
    # Persons in family households, by role
    'persons-living-in-family-households': [
        'persons-living-in-family-households',
        'persons-living-in-family-households-householder',
        'persons-living-in-family-households-spouse',
        'persons-living-in-family-households-own-child-under-18-years',
        'persons-living-in-family-households-other-relatives',
        'persons-living-in-family-households-nonrelatives',
    ],
    # Persons in nonfamily households, by role
    'persons-living-in-nonfamily-households': [
        'persons-living-in-nonfamily-household',
        'persons-living-in-nonfamily-household-householder',
        'persons-living-in-nonfamily-household-senior-alone',
        'persons-living-in-nonfamily-household-nonrelatives',
    ],
    # Household counts
    'households': [
        'households',
        'households-family-households',
        'households-nonfamily-households',
        'households-with-seniors',
    ],
    # Family households, by type of householder
    'family-households': [
        'family-households-married-couple-family',
        'family-households-married-couple-with-minor-child',
        'family-households-female-householder-no-husband-present',
        'family-households-female-householder-with-minor-child',
        'family-households-male-householder-no-wife-present',
        'family-households-male-householder-with-minor-child',
    ],
    # Average sizes
    'persons-per': [
        'persons-per-family',
        'persons-per-household',
    ],
    # Housing units: totals, tenure, size, and age of householder
    'housing-units': [
        'housing-units',
        'housing-units-occupied',
        'housing-units-occupied-renter',
        'housing-units-occupied-owner',
        'housing-unit-size-1-person',
        'housing-unit-size-2-person',
        'housing-unit-size-3-person',
        'housing-unit-size-4-person',
        'housing-unit-size-5-persons-and-over',
        'housing-units-by-age-of-householder-15-to-24-years',
        'housing-units-by-age-of-householder-25-to-44-years',
        'housing-units-by-age-of-householder-45-to-64-years',
        'housing-units-by-age-of-householder-65-years-and-over',
    ],
}

In [None]:
# Helper for displaying CD numbers
# Maps the leading digit of a community district (CD) number to its borough.
cd_boro_dict = {
    1: 'Manhattan',
    2: 'Bronx',
    3: 'Brooklyn',
    4: 'Queens',
    5: 'Staten Island',
}


def cd_to_name(cd):
    """Return a display name such as ``'Manhattan CD-01'`` for a CD number.

    Args:
        cd: Community district identifier (int or str) whose string form is
            exactly three characters. The first character selects the borough
            via ``cd_boro_dict``; the remaining two are used verbatim as the
            district number.

    Returns:
        A string formatted as ``'<borough> CD-<number>'``.

    Raises:
        ValueError: If ``str(cd)`` is not three characters long, or its first
            character is not a key of ``cd_boro_dict``.
    """
    cd_str = str(cd)
    if len(cd_str) != 3:
        raise ValueError('incorrect length of CD')

    try:
        boro = cd_boro_dict.get(int(cd_str[0]))
    except ValueError:
        # Non-numeric prefix (e.g. 'A01' or -11): report it as a bad borough
        # prefix instead of leaking int()'s unrelated error message.
        boro = None
    if boro is None:
        raise ValueError('incorrect borough prefix in CD')

    return f'{boro} CD-{cd_str[1:]}'

### Plot correlation

In [None]:
# Pairwise correlations between all columns of df, as a bare array
corr_matrix = df.corr().values

# Draw the matrix as a heatmap on a single square axes
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111)
_ = sn.heatmap(corr_matrix, ax=ax)
_ = ax.set_title('Correlation matrix')

### Select features, scale data, run PCA

In [None]:
# Helper functions
def select_feat(df, features_nested, exclude_group=None):
    """Return a copy of ``df``, optionally without one group of columns.

    Args:
        df: DataFrame to select from; it is never modified.
        features_nested: Mapping of group name -> list of column labels.
        exclude_group: Name of the group whose columns are dropped. When it is
            ``None``, not a key of ``features_nested``, or maps to an empty
            list, nothing is dropped.

    Returns:
        A new DataFrame holding the remaining columns.

    Raises:
        KeyError: If a column listed for ``exclude_group`` is not in ``df``.
    """
    # Look the group up in the mapping that was passed in, not in a global.
    excluded = features_nested.get(exclude_group) if exclude_group else None
    if not excluded:
        return df.copy()
    # drop() already returns a new frame, so no separate copy is needed.
    return df.drop(columns=excluded)

def scale_data(df):
    """Standardize the values of ``df`` and return the scaled array.

    A fresh ``StandardScaler`` is fitted on ``df.values`` and applied to the
    same data in one step; the fitted scaler itself is not kept.
    """
    return StandardScaler().fit_transform(df.values)

def run_pca(X, feature_names, n_components=5):
    """Fit a PCA on ``X``, print a summary, and draw three diagnostic plots.

    The plots are: the explained variance ratio per component, a heatmap of
    how each feature is mixed into each component, and a scatter matrix of
    the projected samples.

    Args:
        X: Data matrix handed to ``PCA.fit_transform`` (e.g. the output of
            ``scale_data``).
        feature_names: Labels for the features of ``X``; used as the y tick
            labels of the heatmap.
        n_components: Forwarded unchanged to ``sklearn.decomposition.PCA``.
            All labels and plots are sized from the fitted model, so any
            value PCA itself accepts works here.

    Returns:
        None. Results are only printed and plotted.
    """
    # Run and time the PCA
    time_start = time.perf_counter()
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(X)
    elapsed_time = time.perf_counter() - time_start
    print(f'PCA done! Time elapsed: {elapsed_time:0.2} seconds')
    explained_var = pca.explained_variance_ratio_
    print('Cumulative explained variance ratio', np.cumsum(explained_var))
    print('Shape of components:', pca.components_.shape)

    # Number of components actually fitted; the argument may not be an int.
    n_fitted = pca.components_.shape[0]
    component_names = [f'PC-{i}' for i in range(1, n_fitted + 1)]

    # Plot explained variance ratio on the axes created here (not via the
    # implicit pyplot "current axes")
    indices = range(1, n_fitted + 1)
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    _ = ax.plot(
        indices,
        explained_var,
        alpha=0.5,
        marker='o',
    )
    _ = ax.set_xlabel('PC')
    _ = ax.set_ylabel('Explained variance ratio')
    _ = ax.set_xticks(indices)
    _ = ax.set_title('PCA Results')

    # Plot principal components as mixtures of features in matrix heatmap
    # Inspired by the tutorial:
    # https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21
    # components_ is transposed so rows are features and columns are PCs.
    _ = plt.matshow(pca.components_.T, cmap='coolwarm')
    _ = plt.yticks(range(len(feature_names)), feature_names, ha='right')
    _ = plt.xticks(range(n_fitted), component_names, fontsize=10, rotation=65)
    _ = plt.colorbar()
    _ = plt.title('Mixture of features in each PC')

    # Plot pca results as scatter matrix, one column per component
    pca_df = pd.DataFrame(pca_result, columns=component_names)
    _ = pd.plotting.scatter_matrix(pca_df, figsize=(12, 12))
    _ = plt.suptitle(f'Scatter matrix with {n_fitted} principal components')

In [None]:
# Use helpers
# NOTE(review): 'school-attendance-percent' is not one of the group keys in the
# columns_nested literal defined in this notebook; if that is still the case at
# run time, select_feat drops nothing and returns a plain copy of df — confirm
# that this is intended.
selected_df = select_feat(df, columns_nested, exclude_group='school-attendance-percent')
# Scale the selected columns before running PCA on them
X = scale_data(selected_df)
# Fit a 5-component PCA; prints a summary and draws the diagnostic plots
run_pca(X, selected_df.columns, n_components=5)