In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
from collections import defaultdict
import sys; sys.path.append('../')
from src.data_analysis.tools import ac_pca

## Get Preprocessed data

In [2]:
# Load the pre-AC-PCA table and index it by the FIPS column in one chained
# expression, so re-running the cell always starts from the file on disk.
combined_data = pd.read_csv('../data/processed/pre_acpca.csv').set_index('FIPS')
# Optional exclusions, left disabled:
# combined_data = combined_data.drop(columns='Total_age65plus')
# combined_data = combined_data.drop(columns='HospCt')
print(combined_data.shape)

# Every column name, in file order (this list includes the two death columns).
ind_vars = combined_data.columns.tolist()
total_vars = len(ind_vars)
combined_data.head()

(450, 30)


Unnamed: 0_level_0,2wk Prior Intra-Mobility,2wk Onset Intra-Mobility,2wk Post Intra-Mobility,2wk Prior Inter-Mobility,2wk Onset Inter-Mobility,2wk Post Inter-Mobility,Density per square mile of land area - Population,Rural-urban_Continuum Code_2013,Percent of adults with less than a high school diploma 2014-18,Percent of adults with a bachelor's degree or higher 2014-18,...,HospCt,NursingCt,Hispanic Population,Black Population,Outbreak Month Temp AVG / F,Time from outbreak to intervention,Time from outbreak to national intervention,Time from national intervention to outbreak,Deaths,normalized_deaths
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003,2.589641,2.633919,2.930109,1.945899,2.1706,2.617561,-0.239328,1.305179,-0.300952,-0.034295,...,0.382083,0.091096,-0.663966,-0.373652,1.010922,-0.822207,-0.886143,0.714802,-0.191269,-0.452454
1051,2.616537,2.872442,2.954639,3.883383,4.369864,4.475424,-0.235513,0.211454,0.484199,-0.666851,...,2.054335,-0.338005,-0.774031,0.602986,1.798213,-1.721703,-0.886143,2.264974,-0.187297,-0.142693
1073,1.50476,1.746496,1.841919,1.357854,1.500695,1.47728,-0.105273,-0.882272,-0.18016,0.068031,...,0.364953,-0.167607,-0.708818,2.319588,0.757239,0.45208,0.368819,-0.503191,-0.091985,-0.348247
1081,1.711319,1.790704,1.695938,2.248023,2.091171,1.828681,-0.206733,1.305179,-0.341216,0.22617,...,-0.819174,-0.698036,-0.723703,0.730364,0.800977,0.302164,0.055079,-0.503191,-0.127727,0.117622
1089,1.658056,1.603836,1.517825,1.086011,1.039857,0.987019,-0.154306,0.211454,-0.462009,0.933145,...,-0.492157,-0.190833,-0.635543,0.871903,0.608528,-0.147584,-0.886143,-0.503191,-0.199211,-0.541201


In [3]:
# combined_data = pd.read_csv('../data/intermediates/diff_acpca.csv')
# combined_data['Deaths'] = combined_data['normalized_deaths']
# combined_data.set_index('FIPS', inplace=True)
# print(combined_data.shape)
# ind_vars = list(combined_data.columns)
# total_vars = len(ind_vars)
# combined_data.head()

In [4]:
# Show every column name of combined_data captured when the data was loaded.
ind_vars

['2wk Prior Intra-Mobility',
 '2wk Onset Intra-Mobility',
 '2wk Post Intra-Mobility',
 '2wk Prior Inter-Mobility',
 '2wk Onset Inter-Mobility',
 '2wk Post Inter-Mobility',
 'Density per square mile of land area - Population',
 'Rural-urban_Continuum Code_2013',
 'Percent of adults with less than a high school diploma 2014-18',
 "Percent of adults with a bachelor's degree or higher 2014-18",
 'Unemployment_rate_2018',
 'Med_HH_Income_Percent_of_State_Total_2018',
 'Total_age65plus',
 'Total households!!Average household size',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Nursery school preschool',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Kindergarten',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!Elementary school (grades 1-8)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!High school (grades 9-12)',
 'SCHOOL ENROLLMENT!!Population 3 years and over enrolled in school!!College or graduate 

## Run AC-PCA

In [5]:
def run_acpca(combined_data, confounders, lam):
    """Run AC-PCA on the predictor columns, adjusting for the given confounders.

    Parameters
    ----------
    combined_data : pandas.DataFrame
        Table containing the predictors plus the 'Deaths' and
        'normalized_deaths' columns (both are excluded from the AC-PCA input).
    confounders : dict or iterable of str
        Column names to treat as confounders. Only the names are used; when a
        dict is passed its keys are taken. These columns are removed from the
        AC-PCA input matrix and supplied to ``ac_pca`` as the confounder matrix.
    lam : number
        Forwarded unchanged as the third argument of ``ac_pca``.

    Returns
    -------
    tuple
        ``(acpca_data, PCs, eigens, column_names, ind_vars)`` where the first
        three are the ``ac_pca`` outputs converted with ``.to_numpy()``,
        ``column_names`` is ``['PC1', ..., 'PCk']`` for the k predictor columns,
        and ``ind_vars`` lists the predictor column names in the same order as
        the columns of the matrix handed to ``ac_pca``.
    """
    # Normalise once so a dict, list or other iterable of names behaves the same.
    confounder_cols = list(confounders)

    predictors = (
        combined_data
        .drop(columns=['Deaths', 'normalized_deaths'])
        .drop(columns=confounder_cols)
    )
    # Take the names from the very frame that becomes X, so name i always
    # describes column i of X regardless of where the target columns sit in
    # combined_data (previously the target columns were left in this list).
    ind_vars = list(predictors.columns)
    X = predictors.to_numpy()
    X_rows, X_cols = X.shape
    column_names = ['PC' + str(i) for i in range(1, X_cols + 1)]

    if len(confounder_cols) > 0:
        Y = combined_data.filter(items=confounder_cols).to_numpy()
    else:
        # No confounders yet: pass a single all-zero confounder column.
        Y = np.zeros((X_rows, 1))

    # NOTE(review): ac_pca is assumed to return three objects exposing
    # .to_numpy() (presumably DataFrames) - confirm against src.data_analysis.tools.
    acpca_data, PCs, eigens = ac_pca(X, Y, lam)
    return acpca_data.to_numpy(), PCs.to_numpy(), eigens.to_numpy(), column_names, ind_vars

## Run linear regression on all PCs and get coefficients

In [6]:
def linreg_coeffs(combined_data, acpca_data, eigens):
    """Regress 'normalized_deaths' on the components whose eigenvalue is positive.

    Only the first row of ``eigens`` is inspected. Returns
    ``(score, coeffs, pos_pcs)``: the regression's ``score`` on the fitting
    data, a dict mapping each retained component index to its fitted
    coefficient, and the list of retained component indices. When no
    eigenvalue is positive the result is ``(0, 0, [])`` and nothing is fitted.
    """
    _rows, n_components = eigens.shape
    pos_pcs = [idx for idx in range(n_components) if eigens[0, idx] > 0]
    if not pos_pcs:
        return 0, 0, pos_pcs

    features = acpca_data[:, pos_pcs]
    target = combined_data['normalized_deaths']
    model = lm.LinearRegression().fit(features, target)

    # coef_ is ordered like the selected columns, i.e. like pos_pcs.
    coeffs = {pc: model.coef_[position] for position, pc in enumerate(pos_pcs)}
    return model.score(features, target), coeffs, pos_pcs

## Calculate individual weight of each variable

In [7]:
def calc_weights(coeffs, pos_pcs, ind_vars, PCs=None):
    """Combine regression coefficients and component loadings into per-variable weights.

    For every variable ``v`` (row ``i`` of ``PCs``) the weight is
    ``sum(coeffs[pc] * PCs[i, pc] for pc in pos_pcs)``.

    Parameters
    ----------
    coeffs : mapping
        Regression coefficient for each component index in ``pos_pcs``.
    pos_pcs : iterable of int
        Column indices of ``PCs`` to include.
    ind_vars : sequence of str
        Variable names; ``ind_vars[i]`` labels row ``i`` of ``PCs``. Entries
        beyond the number of rows of ``PCs`` are ignored.
    PCs : 2-D array, optional
        Loadings matrix (rows = variables, columns = components). When omitted,
        the module/notebook-level variable named ``PCs`` is used so existing
        three-argument calls keep working; passing it explicitly is preferred
        because it removes the dependency on hidden notebook state.

    Returns
    -------
    dict
        Variable name -> accumulated weight. Empty when ``pos_pcs`` is empty.

    Raises
    ------
    NameError
        If ``PCs`` is not supplied and no global ``PCs`` exists.
    """
    if PCs is None:
        try:
            PCs = globals()['PCs']
        except KeyError:
            raise NameError(
                "name 'PCs' is not defined; pass the loadings matrix via PCs=..."
            ) from None

    n_vars, _ = PCs.shape
    weights = {}
    for pc in pos_pcs:
        coef = coeffs[pc]  # coefficient of this component in the regression
        for i in range(n_vars):
            variable = ind_vars[i]
            # loading of the variable in the component, scaled by the coefficient
            weights[variable] = weights.get(variable, 0) + coef * PCs[i, pc]
    return weights

## Print results and identify strongest variable

In [8]:
def results(weights, final_score, mobility_features):
    """Print the score and every variable weight, flagging the largest in magnitude.

    Returns ``(max_var, max_weight, max_mobility_var, max_mobility_weight)``:
    the variable whose absolute weight equals the overall maximum (the last
    one printed if several tie) with its signed weight, followed by the
    variable in ``mobility_features`` with the largest strictly positive
    weight (``None`` and ``0`` if there is none).
    """
    print('score:', final_score)
    largest_magnitude = np.max(np.abs(np.array(list(weights.values()))))

    # Pass 1: strongest positive weight among the mobility variables.
    max_mobility_var, max_mobility_weight = None, 0
    for name, value in weights.items():
        if name in mobility_features and value > max_mobility_weight:
            max_mobility_var, max_mobility_weight = name, value

    # Pass 2: print each row, starring the one(s) with the largest magnitude.
    max_var, max_weight = None, 0
    for name, value in weights.items():
        if abs(value) != largest_magnitude:
            print('{0:<100} {1:>0.4f}'.format(name, value))
            continue
        max_var, max_weight = name, value
        print('*** {0:<100} {1:>0.4f}'.format(name, value))

    print('\n\nmax var: ', max_var)
    return max_var, max_weight, max_mobility_var, max_mobility_weight

In [9]:
# Regularisation strength handed to AC-PCA on every iteration.
LAM = 20
# Outcome columns; never candidates for the confounder set.
TARGET_COLS = ['Deaths', 'normalized_deaths']

# Take the mobility predictors from the loaded columns. The previous hard-coded
# names ('inter_movement', 'out_movement', 'baseline m50 mobility') are not
# among the column names displayed for this dataset, so the mobility result
# was always None.
mobility_features = [col for col in combined_data.columns if 'Mobility' in col]

# Number of real predictors. The loop must stop once all of them have become
# confounders, otherwise AC-PCA would be called on a matrix with no columns.
n_predictors = combined_data.drop(columns=TARGET_COLS).shape[1]

# Greedy selection: each round runs AC-PCA adjusting for the confounders picked
# so far, regresses normalized deaths on the positive-eigenvalue components and
# promotes the variable with the largest absolute weight to a confounder.
confounders = {}
it = 0
max_score = 0
max_score_iter = 0
while len(confounders) < n_predictors:
    print('Iteration', it)
    print('confounders:', *confounders.keys())
    # PCs and eigens stay module-level names because calc_weights may read them.
    acpca_data, PCs, eigens, column_names, ind_vars = run_acpca(combined_data, confounders, lam=LAM)
    final_score, coeffs, pos_pcs = linreg_coeffs(combined_data, acpca_data, eigens)
    if final_score > max_score:
        max_score = final_score
        max_score_iter = it
    if len(pos_pcs) == 0:
        # No component with a positive eigenvalue is left to regress on.
        print('\n\n\n')
        break
    weights = calc_weights(coeffs, pos_pcs, ind_vars)
    max_var, max_weight, max_mobility_var, max_mobility_weight = results(weights, final_score, mobility_features)
    confounders[max_var] = max_weight
    print('\n\n\n')
    it += 1

print("Max Score of ", max_score, " achieved on iteration: ", max_score_iter)

# Emit the selected variables as LaTeX table rows: escape underscores and end
# each row with `\\ \hline` (the old literal printed `\\hline`, which LaTeX
# reads as a line break followed by the text "hline").
for var, weight in confounders.items():
    print('{0:<100} & {1:>0.4f} \\\\ \\hline'.format(var.replace('_', '\\_'), weight))

# Variables that were never selected get an explicit zero weight so the
# exported table lists every predictor.
for var in combined_data.drop(columns=TARGET_COLS).columns:
    if var not in confounders:
        confounders[var] = 0

Iteration 0
confounders:
score: 0.7082845608257148
2wk Prior Intra-Mobility                                                                             0.2384
2wk Onset Intra-Mobility                                                                             -2.3453
2wk Post Intra-Mobility                                                                              1.7686
2wk Prior Inter-Mobility                                                                             -0.1431
*** 2wk Onset Inter-Mobility                                                                             2.6113
2wk Post Inter-Mobility                                                                              -2.2302
Density per square mile of land area - Population                                                    0.7508
Rural-urban_Continuum Code_2013                                                                      0.1100
Percent of adults with less than a high school diploma 2014-18                

In [10]:
# Collect the per-variable weights and append the best regression score as a
# final "Score" row, then persist the table.
coefs = np.array(list(confounders.values()))
variables = np.array(list(confounders.keys()))
score_row = pd.DataFrame({'Variable': ['Score'], 'Coefficient': [max_score]})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to add rows.
df_coef = pd.concat(
    [pd.DataFrame({'Variable': variables, 'Coefficient': coefs}), score_row],
    ignore_index=True,
)
df_coef.to_csv("../data/processed/ACPCA_Single_model_Result.csv", index=False)
df_coef.head(50)

Unnamed: 0,Variable,Coefficient
0,2wk Onset Inter-Mobility,2.611268
1,2wk Prior Inter-Mobility,0.825706
2,2wk Onset Intra-Mobility,0.937686
3,2wk Post Intra-Mobility,1.184544
4,Density per square mile of land area - Population,0.487074
5,Percent of adults with less than a high school...,0.139984
6,SCHOOL ENROLLMENT!!Population 3 years and over...,-0.11573
7,HospCt,0.103216
8,SCHOOL ENROLLMENT!!Population 3 years and over...,0.107378
9,Percent of adults with a bachelor's degree or ...,0.104229


## Scaling to unit variance