# main.ipynb

### CSc-59867 - Senior Design - Prof. Etemadpour

* Purpose: exploratory data analysis on the NYC census demographics dataset from NYC Open Data
* Date: 2021-02-26
* Authors: Xin Chen, Ian S. McBride, Lifu Tao

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.decomposition import PCA

import time

In [None]:
# Load the community-district demographics table into a DataFrame.
# Source:
# https://data.cityofnewyork.us/City-Government/Census-Demographics-at-the-NYC-Community-District-/5unr-w4sc
# The CSV is a reorganized copy because the original spreadsheet wasn't formatted well.
df = pd.read_csv(
    'data/demographics/nyc_demographics_2010_by_cd.csv',
    index_col='Variable',
)
display('Original dataframe', df)

# Swap rows and columns so the 'Variable' index labels become the columns
df = df.T
display('Transposed dataframe', df)

# Order the rows by their index label
df = df.sort_index()
display('Sorted dataframe', df)

In [None]:
# Helper for displaying CD numbers
# Maps the leading digit of a 3-character CD code to its borough name
cd_boro_dict = {
    1: 'Manhattan',
    2: 'Bronx',
    3: 'Brooklyn',
    4: 'Queens',
    5: 'Staten Island',
}


def cd_to_name(cd):
    """Convert a 3-character community district (CD) code to a display name.

    The first character selects the borough through ``cd_boro_dict``; the
    remaining two characters are kept verbatim as the district number,
    e.g. ``101`` -> ``'Manhattan CD-01'``.

    Args:
        cd: The CD code, as an int or any object whose ``str()`` is the code.

    Returns:
        A string of the form ``'<borough> CD-<nn>'``.

    Raises:
        ValueError: If the code is not exactly 3 characters long, or its
            first character is not one of the borough digits in
            ``cd_boro_dict``.
    """
    cd_str = str(cd)
    if len(cd_str) != 3:
        raise ValueError('incorrect length of CD')

    prefix, cd_num = cd_str[0], cd_str[1:]
    # Guard the int() conversion so a non-numeric prefix reports the same
    # "incorrect borough prefix" error instead of an unrelated int() failure.
    boro = cd_boro_dict.get(int(prefix)) if prefix.isdecimal() else None
    if not boro:
        raise ValueError('incorrect borough prefix in CD')
    return f'{boro} CD-{cd_num}'

In [None]:
# Inspect the dimensions of the array that backs df
df.values.shape

In [None]:
# Inspect the type of the object returned by df.values
type(df.values)

In [None]:
# Fit PCA on the values of df and report the wall-clock time it took.
# NOTE(review): the values are passed to PCA without any scaling step in this
# cell — confirm that is intended.
# NOTE(review): n_components=50 presumably must not exceed the smaller
# dimension of df.values — verify against the shape shown above.
time_start = time.time()

pca = PCA(n_components=50)
pca_result = pca.fit_transform(df.values)

pca_elapsed_seconds = time.time() - time_start
print('PCA done! Time elapsed: {} seconds'.format(pca_elapsed_seconds))

In [None]:
# Print the fitted model's explained_variance_ratio_ array (one entry per component)
print('Variance explained per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Number of leading principal components to summarize
component_count = 5

# Add up the variance ratios of the leading components, rounded to 3 decimals
leading_ratios = pca.explained_variance_ratio_[:component_count]
combined_exp_var = round(sum(leading_ratios), ndigits=3)
print(f'Combined explained variances for first {component_count} components: {combined_exp_var}')

In [None]:
# Build a DataFrame of the leading principal components and plot them
# pairwise against each other.

# 1-based labels: PC-1 .. PC-<component_count>
component_indicies = range(1, component_count + 1)
component_names = [f'PC-{i}' for i in component_indicies]

# The columns of pca_result are 0-based, so PC-i lives in column i - 1.
# Taking the first component_count columns keeps PC-1 aligned with column 0;
# indexing by the 1-based label would skip the first component and shift
# every label by one.
pca_df = pd.DataFrame(pca_result[:, :component_count], columns=component_names)

# Assign to _ so the returned axes / Text objects are not echoed as cell output
_ = pd.plotting.scatter_matrix(pca_df, figsize=(12, 12))
_ = plt.suptitle(f'Scatter matrix with {component_count} principal components')