In [1]:
# Mount Google Drive at /content/drive; every data path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd   
from math import log as ln
from functools import reduce
import statsmodels.formula.api as smf
from scipy.stats import entropy

  import pandas.util.testing as tm


In [3]:
# Folder on the mounted Drive that holds the yearly UDS workbooks.
rootPath ='/content/drive/MyDrive/ASU/CIS791_Chen_Final/uds-data/'

# Full UDS datasets: one workbook per year, 2014 through 2020.
filenames_uds = [rootPath + 'UDS-{}-Full-Dataset.xlsx'.format(uds_year)
                 for uds_year in range(2014, 2021)]

# Look-alike datasets: one workbook per year, 2016 through 2020.
filenames_lal = [rootPath + 'UDS-{}-look-alikes.xlsx'.format(lal_year)
                 for lal_year in range(2016, 2021)]

### SiteDF

In [4]:
# Load the health-center site information CSV from the mounted Drive.
site_df = pd.read_csv('/content/drive/MyDrive/ASU/Telehealth/Data/healthcentersiteinfo_fix.csv')

In [5]:
# drop HCs with 1+ sites that don't have county information
# Step 1: collect the grant numbers that own at least one row with a missing county_fips.
grants_missing_county = set(site_df.loc[site_df['county_fips'].isna(), 'grant_number'])
# Step 2: remove every row belonging to those grants (not just the incomplete rows).
site_df = site_df[~site_df['grant_number'].isin(grants_missing_county)].reset_index(drop=True)

In [6]:
# Show column names, non-null counts and dtypes of the filtered site table.
site_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38949 entries, 0 to 38948
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   bhcmis_id                       38949 non-null  object 
 1   grant_number                    38949 non-null  object 
 2   Health Center Name              38949 non-null  object 
 3   Site Name                       38949 non-null  object 
 4   Site Type                       38949 non-null  object 
 5   Site Status                     38949 non-null  object 
 6   Location Type                   38949 non-null  object 
 7   Location Setting                38949 non-null  object 
 8   Operational Schedule            32702 non-null  object 
 9   Calendar Schedule               32909 non-null  object 
 10  TotalWeekly Hours Of Operation  38792 non-null  object 
 11  Service Area Population         38932 non-null  object 
 12  Site Operated By                

### Weighted average

In [None]:
import os
import requests

#### Download files

In [None]:
# get url
def get_url(year, table):
  """Return the raw-GitHub URL of the county-level ACS 5-year CSV for `year` and `table`.

  `year` and `table` are substituted into the file name as `ACSST5Y<year>.<table>.csv`.
  """
  return f"https://raw.githubusercontent.com/kyxyxn/Telemedicine/main/Data/County/ACSST5Y{year}.{table}.csv"

# get file path
def get_file_path(url, goal_path):
  """Return the local path under `goal_path` where the file behind `url` should be stored.

  The file name is the part of `url` after its last 'County/' marker; when the
  marker is absent the whole `url` is used as the name.
  """
  _, _, csv_file_name = url.rpartition('County/')
  return os.path.join(goal_path, csv_file_name)
  
# download file
def download_file(url):
  """Download `url` into the Controls folder on Drive and return the local file path.

  The destination file name is derived from `url` by `get_file_path`.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status. Nothing is
      written in that case, so an error page is never saved as a ".csv".
  """
  goal_path = '/content/drive/MyDrive/ASU/Telehealth/Data/Controls'
  full_file_path = get_file_path(url, goal_path=goal_path)
  # stream=True makes iter_content read the body in chunks instead of loading it all first;
  # the timeout keeps a stalled connection from hanging the notebook;
  # the context manager releases the connection even if writing fails.
  with requests.get(url, stream=True, timeout=60) as r:
    # Fail loudly before opening the output file.
    r.raise_for_status()
    with open(full_file_path, 'wb') as f:
      for chunk in r.iter_content(chunk_size=1024):
        if chunk:
          f.write(chunk)
  return full_file_path

# get files
def get_file(year, table):
  """Download the ACS CSV for the given `year` and `table`.

  Builds the URL with `get_url`, passes it to `download_file`, then prints a
  confirmation message. Returns nothing.
  """
  download_file(get_url(year, table))
  print("Downloaded!")

In [None]:
# Download table S1903 for every year from 2014 through 2020 (range end is exclusive).
for i in range(2014, 2021):
  # Print the year as a simple progress marker.
  print(i)
  get_file(i, table='S1903')

#### Read files: Income

In [None]:
# Read the S1903 extract of every year (2014-2020) and stack them into one long table
# with the columns county_fips, year, median_household_income.
#
# NOTE(review): the same column, S1903_C02_001E, is read for every year. The S1903 column
# layout is believed to have changed between ACS vintages — TODO confirm this column holds
# median household income in every yearly file.
yearly_frames = []
for i in range(2014, 2021):
  print(i)
  temp_df = pd.read_csv('/content/drive/MyDrive/ASU/Telehealth/Data/Controls/ACSST5Y{}.S1903.csv'.format(i))
  # Select the two needed columns first so the '(X)' -> NaN replacement does not
  # scan the whole wide table; replace() returns a new frame, so adding 'year'
  # below does not write into a slice.
  temp_df = temp_df[['GEO_ID','S1903_C02_001E']].replace(to_replace={'(X)':np.nan})
  temp_df['year'] = i
  yearly_frames.append(temp_df)
  print("Done!")

# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame on every
# iteration; collect the pieces and concatenate once instead.
income_df = pd.concat(yearly_frames, ignore_index=True)

# Drop rows whose GEO_ID is the literal 'id' (presumably the descriptive second header
# row of each export — verify against the raw files).
income_df = income_df[income_df['GEO_ID']!='id'].reset_index(drop=True)
# The county FIPS code is taken as the last five characters of GEO_ID.
income_df['county_fips'] = income_df['GEO_ID'].astype(str).str[-5:]
income_df = income_df.rename(columns={'S1903_C02_001E':'median_household_income'})
income_df = income_df[['county_fips','year','median_household_income']]

In [None]:
# Show dtypes and non-null counts of the stacked income table.
income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22541 entries, 0 to 22540
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   county_fips              22541 non-null  object
 1   year                     22541 non-null  int64 
 2   median_household_income  22539 non-null  object
dtypes: int64(1), object(2)
memory usage: 528.4+ KB


In [None]:
# Keep only the county-year rows that report a median household income.
income_df = income_df.dropna(subset=['median_household_income']).reset_index(drop=True)

#### Merge with site data

In [None]:
# Give the merge keys (county_fips, year) the same dtype in both tables.
# Plain column assignment is used instead of `df.loc[:, col] = ...`: on pandas >= 2.0
# the .loc form writes into the existing column in place and can leave it at its old
# (object) dtype, so the cast would not take effect.
income_df['county_fips'] = income_df['county_fips'].astype('float64')
site_df['county_fips'] = site_df['county_fips'].astype('float64')
income_df['year'] = income_df['year'].astype(int)
site_df['year'] = site_df['year'].astype(int)

In [None]:
# Attach the county-year income to each site row. The outer join also produces
# county-years that match no site; those rows have no grant_number and are dropped.
temp_df = (
  pd.merge(site_df, income_df, how='outer', on=['county_fips','year'])
  .dropna(subset=['grant_number'])
  .reset_index(drop=True)
)
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38949 entries, 0 to 38948
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   bhcmis_id                       38949 non-null  object 
 1   grant_number                    38949 non-null  object 
 2   Health Center Name              38949 non-null  object 
 3   Site Name                       38949 non-null  object 
 4   Site Type                       38949 non-null  object 
 5   Site Status                     38949 non-null  object 
 6   Location Type                   38949 non-null  object 
 7   Location Setting                38949 non-null  object 
 8   Operational Schedule            32702 non-null  object 
 9   Calendar Schedule               32909 non-null  object 
 10  TotalWeekly Hours Of Operation  38792 non-null  object 
 11  Service Area Population         38932 non-null  object 
 12  Site Operated By                

In [None]:
# Row counts of the site table, the income table and the merged table, displayed as one tuple.
len(site_df), len(income_df), len(temp_df)

(38949, 22539, 38949)

In [None]:
## define weights
# Count the site rows (non-null bhcmis_id) per grant and year. Only the one needed
# column is counted instead of running count() over every column.
temp_df_wt = (temp_df.groupby(['grant_number','year'])['bhcmis_id']
              .count()
              .reset_index(name='wt'))
temp_df = pd.merge(temp_df, temp_df_wt, on=['grant_number','year'])
# Each site row gets weight 1 / (site rows of its grant in that year). The count is
# taken before rows without income are dropped below.
temp_df['wt'] = 1 / temp_df['wt']
temp_df = temp_df.dropna(subset=['median_household_income']).reset_index(drop=True)
# Plain column assignment so the integer dtype actually replaces the object column
# (`df.loc[:, col] = ...` sets in place on pandas >= 2.0 and can keep the old dtype).
temp_df['median_household_income'] = temp_df['median_household_income'].astype(int)
# Build the "<grant_number>_<year>" key used as group index (vectorised, no per-row loop).
temp_df['grantnumber_year'] = temp_df['grant_number'].astype(str) + "_" + temp_df['year'].astype(str)

In [None]:
# Show dtypes and non-null counts of the key, county, income and weight columns.
temp_df[['grantnumber_year', 
         'county_fips','median_household_income','wt']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38941 entries, 0 to 38940
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   grantnumber_year         38941 non-null  object 
 1   county_fips              38941 non-null  float64
 2   median_household_income  38941 non-null  int64  
 3   wt                       38941 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.2+ MB


In [None]:
# get weighted average of income
def _weighted_income(group):
  """Return the average of a group's median_household_income, weighted by its `wt` column."""
  return np.average(group.median_household_income, weights=group.wt)

# One value per grantnumber_year key.
temp_df_wa = (temp_df.groupby('grantnumber_year')
              .apply(_weighted_income)
              .rename('median_household_income_wa')
              .reset_index())
# merge
temp_df = pd.merge(temp_df, temp_df_wa, on=['grantnumber_year'])
# Collapse to one row per distinct (grant_number, year, weighted income) combination.
income_df = (temp_df[['grant_number','year','median_household_income_wa']]
             .drop_duplicates()
             .reset_index(drop=True))

In [None]:
# Write the grant-year weighted income table to Drive, without the index column.
income_df.to_csv('/content/drive/MyDrive/ASU/Telehealth/Data/healthcentersite_income_wa.csv', index=False)