In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
from collections import defaultdict
import sys; sys.path.append('../')
from src.data_analysis.tools import ac_pca

## Get Preprocessed data

In [2]:
combined_data = pd.read_csv('../data/intermediates/pre_acpca1.csv')
combined_data.set_index('FIPS', inplace=True)
print(combined_data.shape)
ind_vars = list(combined_data.columns)
total_vars = len(ind_vars)
combined_data.head()

(444, 27)


Unnamed: 0_level_0,Unnamed: 0,2wk Prior Mobility,2wk Onset Mobility,2wk Post Mobility,inter_movement,out_movement,baseline m50 mobility,Density per square mile of land area - Population,Rural-urban_Continuum Code_2013,Percent of adults with less than a high school diploma 2014-18,...,SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12),SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school,HospCt,Hispanic Population,Black Population,Outbreak Month Temp AVG / F,Time from outbreak to intervention,Deaths,normalized_deaths
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003,0,0.459962,0.738688,0.824197,0.519349,0.327736,0.663421,0.001558,0.4,0.229851,...,0.627997,0.500158,0.02375,0.170836,0.038549,0.116048,0.754826,0.959795,0.000591,0.006585
1051,1,0.521206,0.785409,0.844992,0.452205,1.0,0.913574,0.001753,0.2,0.346269,...,0.487643,0.710296,0.1044,0.338853,0.021248,0.289589,0.928571,0.947819,0.000788,0.025222
1073,2,0.581889,0.834773,0.879697,0.775948,0.255409,0.526105,0.008438,0.0,0.247761,...,0.576102,0.447416,0.166735,0.169115,0.031499,0.594616,0.698842,0.976048,0.005518,0.012855
1081,3,0.301939,0.575735,0.703762,0.572121,0.624091,0.334816,0.003231,0.4,0.223881,...,0.57165,0.477111,0.634204,0.050141,0.029159,0.312223,0.708494,0.974337,0.003745,0.040884
1089,4,0.492726,0.741843,0.824973,0.68622,0.197095,0.553652,0.005921,0.2,0.20597,...,0.502458,0.524013,0.25127,0.082998,0.043017,0.337374,0.666023,0.969204,0.000197,0.001246


In [3]:
ind_vars

['Unnamed: 0',
 '2wk Prior Mobility',
 '2wk Onset Mobility',
 '2wk Post Mobility',
 'inter_movement',
 'out_movement',
 'baseline m50 mobility',
 'Density per square mile of land area - Population',
 'Rural-urban_Continuum Code_2013',
 'Percent of adults with less than a high school diploma 2014-18',
 "Percent of adults with a bachelor's degree or higher 2014-18",
 'Unemployment_rate_2018',
 'Med_HH_Income_Percent_of_State_Total_2018',
 'Total_age65plus',
 'Total households!!Average household size',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate school',
 'HospCt',
 'Hispa

## Run AC-PCA

In [4]:
def run_acpca(combined_data, confounders):
    ind_vars = list(combined_data.drop(columns=confounders).columns)
    X = combined_data.drop(columns=['Deaths', 'normalized_deaths'], inplace=False)
    X = X.drop(columns=confounders, inplace=False).to_numpy() # 'Density per square mile of land area - Population',
#     print(X.shape)
    X_rows, X_cols = X.shape
    column_names = ['PC' + str(i) for i in range(1, X_cols + 1)]
    if len(confounders) > 0: 
        Y = combined_data.filter(confounders).to_numpy()
    else: 
        Y = np.zeros((X_rows))
    if len(Y.shape) < 2: 
        Y = Y.reshape(len(Y), 1)
    acpca_data, PCs, eigens = ac_pca(X, Y)
    acpca_data = acpca_data.to_numpy()
    PCs = PCs.to_numpy()
    eigens = eigens.to_numpy()
    return acpca_data, PCs, eigens, column_names, ind_vars

## Run linear regression on all PCs and get coefficients

In [5]:
def linreg_coeffs(combined_data, acpca_data, eigens):   
    _, total_pcs = eigens.shape
    pos_pcs = []
    for pc in range(total_pcs): 
        eig = eigens[0,pc]
        if eig > 0: 
            pos_pcs.append(pc)
    if len(pos_pcs) == 0: 
        return 0, 0, pos_pcs
    X = acpca_data[:,pos_pcs]
    Y = combined_data['normalized_deaths']
    reg = lm.LinearRegression().fit(X,Y)
    final_score = reg.score(X,Y)
    coeffs = {}
    for i in range(len(pos_pcs)): 
        coeffs[pos_pcs[i]] = reg.coef_[i]
    return final_score, coeffs, pos_pcs

## Calculate individual weight of each variable

In [6]:
def calc_weights(coeffs, pos_pcs, ind_vars):
    weights = {}
    r,c = PCs.shape
    for pc in pos_pcs: 
        eig = eigens[0, pc]
        PC_dict = {}
        # PC9 = [-0.190896056547888, 0.468236937317933,-0.228567222005448,0.408002822993487,0.0535105125245226,0.407238079363182,0.181820228133276,0.366604933989350,0.435246581102968]
        for i in range(r): 
        #     print('{0:<80} \t {1:>0.3f}'.format(combined_data.columns.values[i+2], PC9[i]))
            loading = PCs[i,pc] # weight of variable in component
            variable = ind_vars[i] # variable name in column headers 
            PC_dict[variable] = loading
            coef = coeffs[pc] # coefficient corresponding to PC
            if variable not in weights:
                weights[variable] = 0
            weights[variable] += coef * loading
    return weights 

## Print results and identify strongest variable

In [7]:
def results(weights, final_score, mobility_features):
    print('score:', final_score)
    max_weight_mag = np.max(np.abs(np.array(list(weights.values()))))
    max_weight = 0
    max_var = None
    max_mobility_weight = 0
    max_mobility_var = None
    for var, weight in weights.items():
        if var in mobility_features and weight > max_mobility_weight:
            max_mobility_weight = weight
            max_mobility_var = var
    #     print('{0:<100} & {1:>0.4f} \\\\ \hline'.format(var.replace('_', '\\_'), weight))
        if abs(weight) == max_weight_mag: 
            max_weight = weight
            max_var = var
            print('*** {0:<100} {1:>0.4f}'.format(var, weight))
        else:
            print('{0:<100} {1:>0.4f}'.format(var, weight))
    print('\n\nmax var: ', max_var)
    return max_var, max_weight, max_mobility_var, max_mobility_weight

In [8]:
mobility_features = ['inter_movement', 'out_movement', 'baseline m50 mobility']
confounders = {} 
it = 0 
while len(confounders) < total_vars - 1:
    print('Iteration', it)
    print('confounders:', *confounders.keys())
    it += 1
    acpca_data, PCs, eigens, column_names, ind_vars = run_acpca(combined_data, confounders)
    final_score, coeffs, pos_pcs = linreg_coeffs(combined_data, acpca_data, eigens)
    if len(pos_pcs) == 0: 
        print('\n\n\n')
        break 
    weights = calc_weights(coeffs, pos_pcs, ind_vars)
    max_var, max_weight, max_mobility_var, max_mobility_weight = results(weights, final_score, mobility_features)
    confounders[max_var] = max_weight
    print('\n\n\n')
for var, weight in confounders.items():
    print('{0:<100} & {1:>0.4f} \\\\hline'.format(var, weight))

Iteration 0
confounders:
score: 0.7079535905201758
Unnamed: 0                                                                                           0.0000
2wk Prior Mobility                                                                                   0.0187
2wk Onset Mobility                                                                                   0.0200
2wk Post Mobility                                                                                    -0.0282
inter_movement                                                                                       0.0217
out_movement                                                                                         0.0653
baseline m50 mobility                                                                                0.0390
*** Density per square mile of land area - Population                                                    0.9546
Rural-urban_Continuum Code_2013                                                 