In [7]:
import pandas as pd
import numpy as np
import csv

import warnings
# NOTE: no category/module filter is given, so this hides *every* warning
# (including pandas deprecation and dtype warnings) for all later cells.
warnings.filterwarnings('ignore')

from tqdm import tqdm

from datetime import datetime

# Root folder of the project's data; the export cell writes beneath it.
data_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/'

# Timestamp taken once when this cell runs (local time, YYYYMMDD-HHMMSS);
# it is embedded in the exported CSV filename so runs do not overwrite each other.
d = datetime.now().strftime('%Y%m%d-%H%M%S')

In [8]:
def get_state_df(files):
    """Read several CSV tables and join them into one frame on ``GEOID``.

    The first file seeds the result; every following file is joined onto the
    running result with ``pd.merge`` (inner join, the pandas default). When
    both sides of a join share a non-key column name, the left copy receives
    the suffix ``_x{i}`` and the right copy ``_x{i+1}``, where ``i`` is the
    zero-based position of the file being joined.

    Parameters
    ----------
    files : iterable of str or path-like
        Paths of the CSV files to combine. Every file after the first must
        contain a ``GEOID`` column, as must the table it is joined onto.

    Returns
    -------
    pandas.DataFrame
        The joined table. For a single file this is simply that file's
        contents.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    merged = None
    for i, f in enumerate(files):
        this_df = pd.read_csv(f)
        if merged is None:
            # First table: nothing to join against yet.
            merged = this_df
        else:
            merged = pd.merge(merged, this_df, on='GEOID',
                              suffixes=(f'_x{i}', f'_x{i+1}'))

    if merged is None:
        raise ValueError('get_state_df() needs at least one CSV file to read')

    # Return the accumulator itself so that a one-file list works too; the
    # joined frame was previously only bound once a second file was merged.
    return merged

In [9]:
# Lookup table: value of the `Postal` column -> value of the `FIPS` column,
# used below to translate a state folder's name into its FIPS code.
df = pd.read_csv('../constants/state_fips.csv')
state_to_fips = {postal: fips for postal, fips in zip(df['Postal'], df['FIPS'])}
# state_to_fips

In [11]:
import glob
import os


acs_dir = '/work/users/k/4/k4thryn/Repos/EpSampling/data/acs_results/'

# One merged covariate table per state folder found directly under `acs_dir`.
all_st_dfs = []

# Only the immediate children of `acs_dir` are state folders. Listing them
# directly (rather than os.walk + skipping the first entry) avoids descending
# into nested folders, and sorting makes the row order reproducible.
for dir_name in sorted(os.listdir(acs_dir)):
    sub_dir = os.path.join(acs_dir, dir_name)

    # Ignore stray files and hidden folders such as `.ipynb_checkpoints`;
    # their names are not state abbreviations and would break the FIPS lookup.
    if dir_name.startswith('.') or not os.path.isdir(sub_dir):
        continue

    # The state abbreviation is the last two characters of the folder path.
    state = sub_dir[-2:]
    fips = state_to_fips[state]

    # glob returns matches in arbitrary order; sort so the merged column
    # order and the `_x<i>` suffixes are the same on every run.
    files = sorted(glob.glob(f'{sub_dir}/*.csv'))

    state_df = get_state_df(files)

    # Tag every row with its state, then move the two tags to the front.
    state_df['State_fips'] = fips
    state_df['State'] = state
    leading = ['State', 'State_fips']
    state_df = state_df[leading + [col for col in state_df.columns if col not in leading]]

    all_st_dfs.append(state_df)

# Stack all states (per-state row indices are kept as-is) and give the
# county key its final name.
final_df = pd.concat(all_st_dfs).rename(columns={'GEOID': 'Fips'})
final_df

Unnamed: 0,State,State_fips,Fips,POP_x2,POP_M,POP_F,POP_A0004,POP_A0509,POP_A1014,POP_A1517,...,HU_x15,HU_UIS01D,HU_UIS01A,HU_UIS02,HU_UIS0304,HU_UIS0509,HU_UIS1019,HU_UIS2049,HU_UIS50P,HU_UISOTHER
0,AK,2,2013,3409,2014,1395,122,103,151,106,...,1113,835,3,57,72,41,35,0,0,70
1,AK,2,2016,5251,2995,2256,162,215,148,110,...,1456,417,70,242,275,154,79,180,12,27
2,AK,2,2020,292545,149648,142897,20218,20836,18642,11308,...,118055,56649,16185,6372,12729,6852,4719,6892,2642,5015
3,AK,2,2050,18514,9724,8790,1902,1923,1696,1028,...,5992,4826,125,305,228,56,65,150,9,228
4,AK,2,2060,849,480,369,67,41,28,23,...,922,711,33,23,29,51,12,7,0,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,WY,56,56037,42459,21850,20609,2657,3456,3008,1824,...,19174,11446,847,478,1023,559,514,404,73,3830
19,WY,56,56039,23319,12280,11039,1068,1032,1475,575,...,13255,7788,1314,555,1098,1056,505,404,97,438
20,WY,56,56041,20514,10437,10077,1413,1712,1762,986,...,8819,5230,606,99,237,230,424,125,62,1806
21,WY,56,56043,7768,4004,3764,399,321,606,442,...,3842,2847,87,128,267,128,22,17,5,341


In [12]:
# Persist the combined table under <data_dir>/processed; the run timestamp `d`
# is part of the filename and the row index is not written.
final_df.to_csv(
    path_or_buf=f'{data_dir}/processed/all_county_acs_covs_{d}.csv',
    index=False,
)

### <font color=blue>Post-processing?</font>

In [6]:
# Keep everything from the fourth column onward, i.e. skip the three leading
# columns of `final_df` (State, State_fips, Fips as assembled above).
data_df = final_df.iloc[:, 3:]

In [7]:
# Prune redundant features: whenever two columns have |correlation| > 0.95,
# the later one (in column order) is discarded.

# Absolute pairwise correlations between all columns.
corr_matrix = data_df.corr().abs()

# Blank out the diagonal and everything below it so that each pair of
# columns is looked at exactly once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped if it is too strongly correlated with any earlier column.
to_drop = upper.columns[(upper > 0.95).any(axis=0).to_numpy()].tolist()

# Build the reduced frame without touching `data_df`.
filtered_data_df = data_df.drop(columns=to_drop)
filtered_data_df

Unnamed: 0,POP_x2,MHI,IND_AFFHM,IND_MAN_x7,IND_INF_x7,IND_PUBA_x7,IND_AFFHM_AFFH,IND_AFFHM_MQE,IND_PSMAW_M,HU_VAC,...,POP_ASIANNH,POP_NHPINH,POP_OTH2PLNH,POP_AIAN,HU_UIS01A,HU_UIS02,HU_UIS0304,HU_UIS2049,HU_UIS50P,HU_UISOTHER
0,3409,72258.0,216,1047,10,339,216,0,0,199,...,712,11,330,1505,3,57,72,0,0,70
1,5251,90708.0,47,1079,33,396,41,6,9,452,...,2404,87,344,549,70,242,275,180,12,27
2,292545,88871.0,3545,3547,2704,14462,560,2985,428,11360,...,27758,8399,29471,21150,16185,6372,12729,6892,2642,5015
3,18514,57460.0,148,101,67,1579,58,90,250,1472,...,257,42,532,15640,125,305,228,150,9,228
4,849,81563.0,11,33,17,77,11,0,0,607,...,35,13,71,282,33,23,29,7,0,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,42459,76668.0,3231,1184,372,1295,96,3135,0,3645,...,448,14,988,506,847,478,1023,404,73,3830
19,23319,94498.0,385,198,422,401,318,67,0,3724,...,369,15,617,30,1314,555,1098,404,97,438
20,20514,75106.0,884,543,90,472,136,748,0,1144,...,37,0,571,52,606,99,237,125,62,1806
21,7768,62271.0,336,379,32,287,230,106,0,472,...,31,0,239,71,87,128,267,17,5,341


In [8]:
# Show which column names remain after the correlation-based pruning.
filtered_data_df.columns

Index(['POP_x2', 'MHI', 'IND_AFFHM', 'IND_MAN_x7', 'IND_INF_x7', 'IND_PUBA_x7',
       'IND_AFFHM_AFFH', 'IND_AFFHM_MQE', 'IND_PSMAW_M', 'HU_VAC',
       'POP_BLACKNH', 'POP_AIANNH', 'POP_ASIANNH', 'POP_NHPINH',
       'POP_OTH2PLNH', 'POP_AIAN', 'HU_UIS01A', 'HU_UIS02', 'HU_UIS0304',
       'HU_UIS2049', 'HU_UIS50P', 'HU_UISOTHER'],
      dtype='object')

In [9]:
## Normalize to 0-1 range.

# from sklearn.preprocessing import MinMaxScaler

# df = data

# scaler = MinMaxScaler()
# my_scaler = scaler.fit(df.values)
# scaled = my_scaler.transform(df.values)

# scaled_df = pd.DataFrame(scaled,columns = df.columns)


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

## Variance thresholding.

# from sklearn.feature_selection import VarianceThreshold

# selector = VarianceThreshold(0.005)
# selected = selector.fit_transform(new_data)
# selected