In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
from collections import defaultdict
import sys; sys.path.append('../')
from src.data_analysis.tools import ac_pca

## Get Preprocessed data

In [2]:
# Read the preprocessed table and key its rows by the 'FIPS' column.
combined_data = pd.read_csv('../data/intermediates/pre_acpca1.csv').set_index('FIPS')
print(combined_data.shape)
# Every remaining column label, in file order (FIPS is now the index, so it
# is not part of this list).
ind_vars = combined_data.columns.tolist()
total_vars = len(ind_vars)
combined_data.head()

(489, 25)


Unnamed: 0_level_0,inter_movement,out_movement,baseline m50 mobility,Density per square mile of land area - Population,Rural-urban_Continuum Code_2013,Percent of adults with less than a high school diploma 2014-18,Percent of adults with a bachelor's degree or higher 2014-18,Unemployment_rate_2018,Med_HH_Income_Percent_of_State_Total_2018,Mar Temp AVG / F,...,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school,HospCt,Hispanic Population,Black Population,Deaths,normalized_deaths
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003,0.519349,0.244188,0.610171,0.001558,0.4,0.229851,0.312698,0.104938,0.373373,0.743689,...,0.417568,0.129585,0.627997,0.462848,0.038455,0.170836,0.038549,0.116681,0.000789,0.010689
1051,0.452205,0.745075,0.840246,0.001753,0.2,0.346269,0.204762,0.092593,0.405917,0.666019,...,0.378824,0.130406,0.487643,0.645915,0.113909,0.338853,0.021248,0.290098,0.000789,0.032762
1055,0.677227,0.378689,0.449395,0.002718,0.4,0.402985,0.096825,0.135802,0.222485,0.598058,...,0.294336,0.358494,0.56754,0.360599,0.075155,0.261462,0.031866,0.210358,0.000986,0.028791
1073,0.775948,0.190299,0.483877,0.008438,0.0,0.247761,0.330159,0.111111,0.342604,0.617476,...,0.520694,0.386817,0.576102,0.4169,0.172229,0.169115,0.031499,0.594907,0.005521,0.013511
1081,0.572121,0.464994,0.307942,0.003231,0.4,0.223881,0.357143,0.104938,0.270414,0.656311,...,0.586319,0.144048,0.57165,0.442769,0.609583,0.050141,0.029159,0.312716,0.003943,0.046459


In [3]:
# Display every column label of combined_data. This is the full column list,
# so it is not restricted to predictor columns.
ind_vars

['inter_movement',
 'out_movement',
 'baseline m50 mobility',
 'Density per square mile of land area - Population',
 'Rural-urban_Continuum Code_2013',
 'Percent of adults with less than a high school diploma 2014-18',
 "Percent of adults with a bachelor's degree or higher 2014-18",
 'Unemployment_rate_2018',
 'Med_HH_Income_Percent_of_State_Total_2018',
 'Mar Temp AVG / F',
 'Apr Temp AVG / F',
 'May Temp AVG / F',
 'Jun Temp AVG / F',
 'Total_age65plus',
 'Total households!!Average household size',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school',
 'HospCt',
 'Hisp

## Run AC-PCA

In [4]:
def run_acpca(combined_data, confounders):
    """Run ``ac_pca`` on the feature columns, adjusting for ``confounders``.

    Parameters
    ----------
    combined_data : pandas.DataFrame
        Must contain the outcome columns 'Deaths' and 'normalized_deaths' and
        every label listed in ``confounders``.
    confounders : iterable of str
        Column labels that are removed from the feature matrix and handed to
        ``ac_pca`` as the second argument. A dict is accepted; only its keys
        are used.

    Returns
    -------
    acpca_data, PCs, eigens : numpy.ndarray
        The three results of ``ac_pca(X, Y)``, each converted via ``to_numpy()``.
    column_names : list of str
        'PC1' ... 'PCk', where k is the number of feature columns.
    ind_vars : list of str
        Labels of the feature columns, in the same order as the columns of the
        matrix passed to ``ac_pca``.
    """
    confounder_cols = list(confounders)
    features = (
        combined_data
        .drop(columns=['Deaths', 'normalized_deaths'])
        .drop(columns=confounder_cols)
    )
    # Take the names from the very frame that becomes X, so that name i always
    # labels column i of X regardless of where the outcome columns sit.
    ind_vars = list(features.columns)
    X = features.to_numpy()
    X_rows, X_cols = X.shape
    column_names = ['PC' + str(i) for i in range(1, X_cols + 1)]
    if confounder_cols:
        Y = combined_data.filter(items=confounder_cols).to_numpy()
    else:
        # No confounders selected: pass a single all-zero column.
        Y = np.zeros((X_rows, 1))
    acpca_data, PCs, eigens = ac_pca(X, Y)
    return (
        acpca_data.to_numpy(),
        PCs.to_numpy(),
        eigens.to_numpy(),
        column_names,
        ind_vars,
    )

## Run linear regression on all PCs and get coefficients

In [5]:
def linreg_coeffs(combined_data, acpca_data, eigens):
    """Regress 'normalized_deaths' on the components whose ``eigens`` entry is positive.

    Parameters
    ----------
    combined_data : pandas.DataFrame
        Supplies the regression target column 'normalized_deaths'.
    acpca_data : numpy.ndarray
        2-D array; its columns are selected by component index.
    eigens : numpy.ndarray
        2-D array; only row 0 is read, one value per component
        (presumably the eigenvalues returned by ac_pca -- confirm).

    Returns
    -------
    final_score
        Value of ``LinearRegression.score`` on the fitted data, or 0 when no
        component qualifies.
    coeffs
        Dict mapping component index -> fitted coefficient, or 0 when no
        component qualifies.
    pos_pcs : list of int
        Indices of the components with a strictly positive ``eigens[0, idx]``.
    """
    _n_rows, n_components = eigens.shape
    pos_pcs = [idx for idx in range(n_components) if eigens[0, idx] > 0]
    if not pos_pcs:
        # Nothing to regress on; callers detect this through the empty list.
        return 0, 0, pos_pcs

    design = acpca_data[:, pos_pcs]
    target = combined_data['normalized_deaths']
    model = lm.LinearRegression().fit(design, target)
    final_score = model.score(design, target)
    # Key each coefficient by the component index it belongs to.
    coeffs = dict(zip(pos_pcs, model.coef_))
    return final_score, coeffs, pos_pcs

## Calculate individual weight of each variable

In [6]:
def calc_weights(coeffs, pos_pcs, ind_vars, loadings=None):
    """Combine regression coefficients and component loadings into per-variable weights.

    For each variable i the weight is the sum, over every component ``pc`` in
    ``pos_pcs``, of ``coeffs[pc] * loadings[i, pc]``.

    Parameters
    ----------
    coeffs : dict
        Maps component index -> regression coefficient.
    pos_pcs : iterable of int
        Component (column) indices to include.
    ind_vars : sequence of str
        Variable names; entry i labels row i of ``loadings``. Entries beyond
        the number of rows are ignored.
    loadings : numpy.ndarray, optional
        2-D array indexed as ``[variable, component]``. When omitted, the
        notebook-level ``PCs`` array is used so existing three-argument calls
        keep working; pass it explicitly to avoid depending on hidden state.

    Returns
    -------
    dict
        Variable name -> accumulated weight. Empty when ``pos_pcs`` is empty.
    """
    if loadings is None:
        # Backward-compatible fallback to the notebook global.
        loadings = PCs
    n_vars, _n_components = loadings.shape
    weights = {}
    for pc in pos_pcs:
        coef = coeffs[pc]  # coefficient of this component in the regression
        for i in range(n_vars):
            variable = ind_vars[i]
            weights[variable] = weights.get(variable, 0) + coef * loadings[i, pc]
    return weights

## Print results and identify strongest variable

In [7]:
def results(weights, final_score, mobility_features):
    """Print the per-variable weights and pick out the strongest ones.

    Parameters
    ----------
    weights : dict
        Variable name -> weight. Must not be empty.
    final_score
        Printed as-is on the first output line.
    mobility_features : container of str
        Names that count as mobility variables.

    Returns
    -------
    max_var
        Name of the variable with the largest absolute weight (the last one
        on ties).
    max_weight
        That largest absolute weight (a magnitude, never negative).
    max_mobility_var
        Name of the mobility variable with the largest absolute weight, or
        None when no mobility variable has a non-zero weight.
    max_mobility_weight
        Signed weight of ``max_mobility_var`` (0 when it is None).

    Raises
    ------
    ValueError
        If ``weights`` is empty.
    """
    if not weights:
        raise ValueError('weights is empty; there is no variable to rank')
    print('score:', final_score)
    max_weight = np.max(np.abs(np.array(list(weights.values()))))
    max_var = None
    max_mobility_weight = 0
    max_mobility_var = None
    for var, weight in weights.items():
        # Rank mobility variables by magnitude, the same way max_weight is
        # computed, so a strongly negative weight is not overlooked.
        if var in mobility_features and abs(weight) > abs(max_mobility_weight):
            max_mobility_weight = weight
            max_mobility_var = var
        if abs(weight) == max_weight:
            max_var = var
            print('*** {0:<100} {1:>0.4f}'.format(var, weight))
        else:
            print('{0:<100} {1:>0.4f}'.format(var, weight))
    print('\n\nmax var: ', max_var)
    return max_var, max_weight, max_mobility_var, max_mobility_weight

In [9]:
mobility_features = ['inter_movement', 'out_movement', 'baseline m50 mobility']
# run_acpca always removes these outcome columns from the feature matrix, so
# they must not be counted as candidate variables when bounding the loop.
target_columns = ['Deaths', 'normalized_deaths']
n_features = sum(col not in target_columns for col in combined_data.columns)
confounders = {}  # variable name -> absolute weight at the iteration it was picked
it = 0
# Each pass moves the strongest remaining variable into the confounder set;
# stop once only one feature column would be left.
while len(confounders) < n_features - 1:
    print('Iteration', it)
    print('confounders:', *confounders.keys())
    it += 1
    # Keep the name `PCs`: calc_weights looks it up in the notebook namespace
    # when it is called with three arguments.
    acpca_data, PCs, eigens, column_names, ind_vars = run_acpca(combined_data, confounders)
    final_score, coeffs, pos_pcs = linreg_coeffs(combined_data, acpca_data, eigens)
    if len(pos_pcs) == 0:
        # No usable component left; nothing further to rank.
        print('\n\n\n')
        break
    weights = calc_weights(coeffs, pos_pcs, ind_vars)
    max_var, max_weight, max_mobility_var, max_mobility_weight = results(weights, final_score, mobility_features)
    confounders[max_var] = max_weight
    print('\n\n\n')
for var, weight in confounders.items():
    # One LaTeX table row per selected variable, terminated by `\\ \hline`.
    print('{0:<100} & {1:>0.4f} \\\\ \\hline'.format(var, weight))

Iteration 0
confounders:
score: 0.6344357748050589
inter_movement                                                                                       0.0411
out_movement                                                                                         0.0921
baseline m50 mobility                                                                                0.0178
*** Density per square mile of land area - Population                                                    0.9435
Rural-urban_Continuum Code_2013                                                                      0.0311
Percent of adults with less than a high school diploma 2014-18                                       0.0444
Percent of adults with a bachelor's degree or higher 2014-18                                         0.0369
Unemployment_rate_2018                                                                               0.0520
Med_HH_Income_Percent_of_State_Total_2018                                        