In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
from collections import defaultdict
import sys; sys.path.append('../')
from src.data_analysis.tools import ac_pca

## Get Preprocessed data

In [2]:
combined_data = pd.read_csv('../data/intermediates/pre_acpca1.csv')
combined_data.set_index('FIPS', inplace=True)
print(combined_data.shape)
ind_vars = list(combined_data.columns)
total_vars = len(ind_vars)
combined_data.head()

(397, 21)


Unnamed: 0_level_0,baseline,Density per square mile of land area - Population,Rural-urban_Continuum Code_2013,Percent of adults with less than a high school diploma 2014-18,Percent of adults with a bachelor's degree or higher 2014-18,Unemployment_rate_2018,Med_HH_Income_Percent_of_State_Total_2018,Mar Temp AVG / F,Apr Temp AVG / F,May Temp AVG / F,...,Total_age65plus,Total households!!Average household size,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school,HospCt,Deaths,normalized_deaths
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1055.0,0.449395,0.002718,0.4,0.402985,0.12249,0.135802,0.222485,0.697723,0.736148,0.781775,...,0.203494,0.325792,0.294336,0.358494,0.56754,0.342774,0.075155,0.261462,0.000986,0.028791
1073.0,0.483877,0.008438,0.0,0.247761,0.417671,0.111111,0.342604,0.718427,0.751979,0.803357,...,0.138366,0.276018,0.520694,0.386817,0.576102,0.400644,0.172229,0.169115,0.005521,0.013511
1081.0,0.307942,0.003231,0.4,0.223881,0.451807,0.104938,0.270414,0.757764,0.757256,0.805755,...,0.056305,0.343891,0.586319,0.144048,0.57165,0.427235,0.609583,0.050141,0.003943,0.046459
1095.0,0.516342,0.002275,0.6,0.510448,0.15261,0.092593,0.238462,0.672878,0.712401,0.772182,...,0.162744,0.334842,0.241579,0.364819,0.680389,0.62336,0.08621,0.281908,0.000591,0.024185
1097.0,0.456792,0.004744,0.2,0.352239,0.226908,0.17284,0.207101,0.846791,0.823219,0.868106,...,0.145504,0.303167,0.505194,0.328976,0.607318,0.491922,0.119985,0.106354,0.006901,0.028506


In [3]:
ind_vars

['baseline',
 'Density per square mile of land area - Population',
 'Rural-urban_Continuum Code_2013',
 'Percent of adults with less than a high school diploma 2014-18',
 "Percent of adults with a bachelor's degree or higher 2014-18",
 'Unemployment_rate_2018',
 'Med_HH_Income_Percent_of_State_Total_2018',
 'Mar Temp AVG / F',
 'Apr Temp AVG / F',
 'May Temp AVG / F',
 'Jun Temp AVG / F',
 'Total_age65plus',
 'Total households!!Average household size',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school',
 'HospCt',
 'Deaths',
 'normalized_deaths']

## Run AC-PCA

In [4]:
def run_acpca(combined_data, confounders):
    ind_vars = list(combined_data.drop(columns=confounders).columns)
    X = combined_data.drop(columns=['Deaths', 'normalized_deaths'], inplace=False)
    X = X.drop(columns=confounders, inplace=False).to_numpy() # 'Density per square mile of land area - Population',
#     print(X.shape)
    X_rows, X_cols = X.shape
    column_names = ['PC' + str(i) for i in range(1, X_cols + 1)]
    
    Y = combined_data.filter(confounders).to_numpy()
    if len(Y.shape) < 2: 
        Y = Y.reshape(len(Y), 1)
    acpca_data, PCs, eigens = ac_pca(X, Y)
    acpca_data = acpca_data.to_numpy()
    PCs = PCs.to_numpy()
    eigens = eigens.to_numpy()
    return acpca_data, PCs, eigens, column_names, ind_vars

## Run linear regression on all PCs and get coefficients

In [5]:
def linreg_coeffs(combined_data, acpca_data, eigens):   
    _, total_pcs = eigens.shape
    pos_pcs = []
    for pc in range(total_pcs): 
        eig = eigens[0,pc]
        if eig > 0: 
            pos_pcs.append(pc)
    if len(pos_pcs) == 0: 
        return 0, 0, pos_pcs
    X = acpca_data[:,pos_pcs]
    Y = combined_data['normalized_deaths']
    reg = lm.LinearRegression().fit(X,Y)
    final_score = reg.score(X,Y)
    coeffs = {}
    for i in range(len(pos_pcs)): 
        coeffs[pos_pcs[i]] = reg.coef_[i]
    return final_score, coeffs, pos_pcs

## Calculate individual weight of each variable

In [6]:
def calc_weights(coeffs, pos_pcs, ind_vars):
    weights = {}
    r,c = PCs.shape
    for pc in pos_pcs: 
        eig = eigens[0, pc]
        PC_dict = {}
        # PC9 = [-0.190896056547888, 0.468236937317933,-0.228567222005448,0.408002822993487,0.0535105125245226,0.407238079363182,0.181820228133276,0.366604933989350,0.435246581102968]
        for i in range(r): 
        #     print('{0:<80} \t {1:>0.3f}'.format(combined_data.columns.values[i+2], PC9[i]))
            loading = PCs[i,pc] # weight of variable in component
            variable = ind_vars[i] # variable name in column headers 
            PC_dict[variable] = loading
            coef = coeffs[pc] # coefficient corresponding to PC
            if variable not in weights:
                weights[variable] = 0
            weights[variable] += coef * loading
    return weights 

## Print results and identify strongest variable

In [7]:
def results(weights, final_score):
    print('score:', final_score)
    max_weight = np.max(np.abs(np.array(list(weights.values()))))
    max_var = None
    for var, weight in weights.items():
    #     print('{0:<100} & {1:>0.4f} \\\\ \hline'.format(var.replace('_', '\\_'), weight))
        if abs(weight) == max_weight: 
            max_var = var
            print('*** {0:<100} {1:>0.4f}'.format(var, weight))
        else:
            print('{0:<100} {1:>0.4f}'.format(var, weight))
    print('\n\nmax var: ', max_var)
    return max_var

In [9]:
confounders = ['baseline']
it = 0 
while len(confounders) < total_vars - 1:
    print('Iteration', it)
    print('confounders:', *confounders)
    it += 1
    acpca_data, PCs, eigens, column_names, ind_vars = run_acpca(combined_data, confounders)
    print(ind_vars)
    final_score, coeffs, pos_pcs = linreg_coeffs(combined_data, acpca_data, eigens)
    if len(pos_pcs) == 0: 
        print('\n\n\n')
        break 
    weights = calc_weights(coeffs, pos_pcs, ind_vars)
    max_var = results(weights, final_score)
    confounders.append(max_var)
    print('\n\n\n')
for c in confounders:
    print(c)

Iteration 0
confounders: baseline
['Density per square mile of land area - Population', 'Rural-urban_Continuum Code_2013', 'Percent of adults with less than a high school diploma 2014-18', "Percent of adults with a bachelor's degree or higher 2014-18", 'Unemployment_rate_2018', 'Med_HH_Income_Percent_of_State_Total_2018', 'Mar Temp AVG / F', 'Apr Temp AVG / F', 'May Temp AVG / F', 'Jun Temp AVG / F', 'Total_age65plus', 'Total households!!Average household size', 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool', 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten', 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8)', 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12)', 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school', 'HospCt', 'Deaths', 'normalized_deaths']
score: 0.57556813281