In [1]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd   
from math import log as ln
from functools import reduce
import statsmodels.formula.api as smf
from scipy.stats import entropy

  import pandas.util.testing as tm


In [3]:
# Folder on Drive that holds the UDS Excel workbooks.
rootPath = '/content/drive/MyDrive/ASU/CIS791_Chen_Final/uds-data/'

# Full UDS datasets: one workbook per year, 2014 through 2020.
filenames_uds = [rootPath + 'UDS-{}-Full-Dataset.xlsx'.format(uds_year)
                 for uds_year in range(2014, 2021)]

# Look-alike datasets: one workbook per year, 2016 through 2020.
filenames_lal = [rootPath + 'UDS-{}-look-alikes.xlsx'.format(lal_year)
                 for lal_year in range(2016, 2021)]

### SiteDF

In [4]:
# Load the health center site info CSV from Drive into site_df.
site_df = pd.read_csv('/content/drive/MyDrive/ASU/Telehealth/Data/healthcentersiteinfo_fix.csv')

In [5]:
# Drop every health center (grant_number) that has at least one site
# without county information, then renumber the rows.
county_missing = site_df['county_fips'].isna()
grants_missing_county = set(site_df[county_missing].grant_number.values.tolist())
keep_rows = ~site_df['grant_number'].isin(grants_missing_county)
site_df = site_df[keep_rows].reset_index(drop=True)

In [6]:
# Show column dtypes and non-null counts of the filtered site table.
site_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38949 entries, 0 to 38948
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   bhcmis_id                       38949 non-null  object 
 1   grant_number                    38949 non-null  object 
 2   Health Center Name              38949 non-null  object 
 3   Site Name                       38949 non-null  object 
 4   Site Type                       38949 non-null  object 
 5   Site Status                     38949 non-null  object 
 6   Location Type                   38949 non-null  object 
 7   Location Setting                38949 non-null  object 
 8   Operational Schedule            32702 non-null  object 
 9   Calendar Schedule               32909 non-null  object 
 10  TotalWeekly Hours Of Operation  38792 non-null  object 
 11  Service Area Population         38932 non-null  object 
 12  Site Operated By                

### Weighted average

In [7]:
import os
import requests

#### Download file

In [19]:
# get url
def get_url():
  """Return the raw GitHub URL of the telehealth market-saturation CSV."""
  return "https://raw.githubusercontent.com/kyxyxn/Telemedicine/main/Data/telehealth_data/market-saturation-utilization-telehealth.csv"

# get file path
def get_file_path(url, goal_path):
  """Build the local path under which the file at `url` should be saved.

  Args:
    url: Download URL. When it contains 'telehealth_data/', everything after
      the last occurrence of that marker is used as the file name.
    goal_path: Destination directory.

  Returns:
    `goal_path` joined with the derived file name.
  """
  marker = 'telehealth_data/'
  if marker in url:
    csv_file_name = url.split(marker)[-1]
  else:
    # Without the marker, split() would hand back the whole URL and the
    # joined result would not be a usable file path; fall back to the last
    # path segment of the URL instead.
    csv_file_name = url.rstrip('/').rsplit('/', 1)[-1]
  return os.path.join(goal_path, csv_file_name)
  
# download file
def download_file(url, goal_path='/content/drive/MyDrive/ASU/Telehealth/Data'):
  """Download `url` into `goal_path` and return the path of the saved file.

  Args:
    url: Address of the file to fetch.
    goal_path: Destination directory; defaults to the Drive data folder this
      notebook has always used.

  Returns:
    Full path of the file that was written.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status. Without
      this check an error page would be saved silently as if it were the data.
  """
  full_file_path = get_file_path(url, goal_path=goal_path)
  # stream=True makes the chunked loop below actually stream the body instead
  # of iterating over content that was already fully loaded into memory.
  with requests.get(url, stream=True, timeout=60) as r:
    # Check the status before opening the target so a failed request does not
    # overwrite an existing good file.
    r.raise_for_status()
    with open(full_file_path, 'wb') as f:
      for chunk in r.iter_content(chunk_size=1024):
        if chunk:
          f.write(chunk)
  return full_file_path

# get files
def get_file():
  """Fetch the telehealth CSV from its fixed URL and print a confirmation."""
  # resolve the source URL, then download the full file from it
  source_url = get_url()
  download_file(source_url)
  print("Downloaded!")
  
# Run the download; prints "Downloaded!" once the file has been written.
get_file()

Downloaded!


#### Read and pre-process files: Telehealth providers

In [89]:
# Load the telehealth market-saturation CSV from Drive into provider_df.
provider_df = pd.read_csv('/content/drive/MyDrive/ASU/Telehealth/Data/market-saturation-utilization-telehealth.csv')

In [90]:
# clean data
# Keep county-level rows only, excluding rows whose state is '-' or whose
# county is the '--ALL--' placeholder.
county_rows = (
    (provider_df['aggregation_level'] == 'COUNTY')
    & (provider_df['state'] != '-')
    & (provider_df['county'] != '--ALL--')
)
provider_df = provider_df[county_rows].reset_index(drop=True)

# The first four characters of reference_period are used as the year.
provider_df['year'] = provider_df['reference_period'].astype(str).str[:4]

# Restrict to the three full-calendar-year reference periods.
full_year_periods = ['2017-01-01 to 2017-12-31', '2018-01-01 to 2018-12-31', '2019-01-01 to 2019-12-31']
provider_df = provider_df[provider_df['reference_period'].isin(full_year_periods)].reset_index(drop=True)

# change data type
# Strip '%' signs and thousands separators so the text can be parsed as numbers.
provider_df['percentage_of_users_out_of_ffs_beneficiaries'] = provider_df['percentage_of_users_out_of_ffs_beneficiaries'].replace({'%':''},regex=True)
for comma_col in ['number_of_fee_for_service_beneficiaries', 'number_of_providers', 'number_of_users']:
  provider_df[comma_col] = provider_df[comma_col].replace({',':''},regex=True)

obj_cols=['percentage_of_users_out_of_ffs_beneficiaries','number_of_fee_for_service_beneficiaries','number_of_providers','number_of_users']
for obj in obj_cols:
  print(obj)
  # Plain column assignment replaces the column and therefore its dtype.
  # `provider_df.loc[:, obj] = ...` writes into the existing column on
  # pandas >= 2.0, which would leave the dtype as object.
  provider_df[obj] = provider_df[obj].astype('float64')

use_cols=['state', 'county', 'state_fips', 'county_fips', 'number_of_fee_for_service_beneficiaries', 'number_of_providers', 'year']
provider_df = provider_df[use_cols]

percentage_of_users_out_of_ffs_beneficiaries
number_of_fee_for_service_beneficiaries
number_of_providers
number_of_users


In [92]:
# create fips
# Build zero-padded code strings: 2 characters for the state and 3 for the
# county. The int cast is chained directly into the string conversion rather
# than assigned back through `.loc[:, col] = ...`, which on pandas >= 2.0
# writes in place and can leave a float column (so str() would give '7.0').
# zfill also covers every code length, whereas a hand-written 1/2/3-character
# branch appends nothing for other lengths and breaks the column assignment.
state_code = provider_df['state_fips'].astype(int).astype(str).str.zfill(2)
county_code = provider_df['county_fips'].astype(int).astype(str).str.zfill(3)

provider_df['state_fips'] = state_code

# new fips
# county_fips becomes the state code followed by the county code.
provider_df['county_fips'] = state_code + county_code

In [93]:
# Show column dtypes and non-null counts of the cleaned provider table.
provider_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6649 entries, 0 to 6648
Data columns (total 7 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   state                                    6649 non-null   object 
 1   county                                   6649 non-null   object 
 2   state_fips                               6649 non-null   object 
 3   county_fips                              6649 non-null   object 
 4   number_of_fee_for_service_beneficiaries  6649 non-null   float64
 5   number_of_providers                      6649 non-null   float64
 6   year                                     6649 non-null   object 
dtypes: float64(2), object(5)
memory usage: 363.7+ KB


#### Merge with site data

In [94]:
# Give both frames identical dtypes on the merge keys (county_fips as
# float64, year as int). Plain column assignment is used so the column is
# replaced together with its dtype; `df.loc[:, col] = ...` writes in place on
# pandas >= 2.0 and would leave string county_fips values in an object column.
provider_df['county_fips'] = provider_df['county_fips'].astype('float64')
site_df['county_fips'] = site_df['county_fips'].astype('float64')
provider_df['year'] = provider_df['year'].astype(int)
site_df['year'] = site_df['year'].astype(int)

In [105]:
# Outer-join sites with county/year provider figures, then discard rows that
# carry no grant_number (e.g. provider rows with no matching site).
temp_df = site_df.merge(provider_df, how='outer', on=['county_fips','year'])
has_grant_number = temp_df['grant_number'].notna()
temp_df = temp_df[has_grant_number].reset_index(drop=True)
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38949 entries, 0 to 38948
Data columns (total 33 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   bhcmis_id                                38949 non-null  object 
 1   grant_number                             38949 non-null  object 
 2   Health Center Name                       38949 non-null  object 
 3   Site Name                                38949 non-null  object 
 4   Site Type                                38949 non-null  object 
 5   Site Status                              38949 non-null  object 
 6   Location Type                            38949 non-null  object 
 7   Location Setting                         38949 non-null  object 
 8   Operational Schedule                     32702 non-null  object 
 9   Calendar Schedule                        32909 non-null  object 
 10  TotalWeekly Hours Of Operation           38792

In [106]:
## define weights
# Count the rows (non-null bhcmis_id) per (grant_number, year). The count is
# taken before rows without provider data are dropped further down.
temp_df_wt = (
    temp_df.groupby(['grant_number', 'year'])['bhcmis_id']
    .count()
    .rename('wt')
    .reset_index()
)
temp_df = pd.merge(temp_df, temp_df_wt, on=['grant_number','year'])

# Each row's weight is the reciprocal of its group's row count.
# NOTE(review): every row of a (grant_number, year) group receives the same
# weight, so an average weighted by `wt` within that group equals the plain
# mean -- confirm this is the intended weighting.
temp_df['wt'] = 1 / temp_df['wt']

# Keep only rows that matched provider data.
temp_df = temp_df[temp_df['number_of_providers'].notna()].reset_index(drop=True)
# Plain assignment so the column really becomes integer;
# `temp_df.loc[:, col] = ...` keeps the float dtype on pandas >= 2.0.
temp_df['number_of_providers'] = temp_df['number_of_providers'].astype(int)

# to create index
temp_df['grantnumber_year'] = temp_df['grant_number'].astype(str) + "_" + temp_df['year'].astype(str)

In [108]:
# Display the merged frame with the weight and grantnumber_year columns added.
temp_df

Unnamed: 0,bhcmis_id,grant_number,Health Center Name,Site Name,Site Type,Site Status,Location Type,Location Setting,Operational Schedule,Calendar Schedule,...,full_address,zip_5,county_fips,state,county,state_fips,number_of_fee_for_service_beneficiaries,number_of_providers,wt,grantnumber_year
0,44230,H80CS00486,ANSON REGIONAL MEDICAL SERVICES,"ANSON REGIONAL MEDICAL SERVICES, INC.",Administrative/Service Delivery Site,Active,Permanent,All Other Clinic Types,Full-Time,Year-Round,...,203 Salisbury St WADESBORO,28170.0,37007.0,NC,Anson,37,4235.0,3,0.500000,H80CS00486_2017
1,44230,H80CS00486,ANSON REGIONAL MEDICAL SERVICES,ARMS-Union,Service Delivery Site,Active,Permanent,All Other Clinic Types,,,...,1315 Sunset Drive Monroe NC 28112,28112.0,37179.0,NC,Union,37,22786.0,1,0.500000,H80CS00486_2017
2,44230,H80CS00486,ANSON REGIONAL MEDICAL SERVICES,"ANSON REGIONAL MEDICAL SERVICES, INC.",Administrative/Service Delivery Site,Active,Permanent,All Other Clinic Types,Full-Time,Year-Round,...,203 Salisbury St WADESBORO,28170.0,37007.0,NC,Anson,37,4146.0,3,0.500000,H80CS00486_2018
3,44230,H80CS00486,ANSON REGIONAL MEDICAL SERVICES,ARMS-Union,Service Delivery Site,Active,Permanent,All Other Clinic Types,-,-,...,1315 Sunset Drive Monroe NC 28112,28112.0,37179.0,NC,Union,37,22842.0,0,0.500000,H80CS00486_2018
4,32750,H80CS00176,"Choptank Community Health System, Inc.",ANGELICA NURSERY MIGRANT CAMP,Service Delivery Site,Active,Seasonal,All Other Clinic Types,Part-Time,Seasonal,...,11365 Locust Grove Rd Kennedyville MD 21645,21645.0,24029.0,MD,Kent,24,5433.0,0,0.035714,H80CS00176_2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15525,08E01156,H80CS28358,POWELL HEALTH CARE COALITION,Heritage Health Center,Administrative/Service Delivery Site,Active,Permanent,All Other Clinic Types,,,...,306 N BENT ST POWELL WY 82435,82435.0,56029.0,WY,Park,56,7393.0,1,0.500000,H80CS28358_2017
15526,08E01156,H80CS28358,POWELL HEALTH CARE COALITION,Heritage Health Center,Service Delivery Site,Active,Permanent,All Other Clinic Types,,,...,128 N BENT ST POWELL WY 82435,82435.0,56029.0,WY,Park,56,7393.0,1,0.500000,H80CS28358_2017
15527,08E01156,H80CS28358,POWELL HEALTH CARE COALITION,Heritage Health Center,Service Delivery Site,Active,Permanent,All Other Clinic Types,-,-,...,128 N BENT ST POWELL WY 82435,82435.0,56029.0,WY,Park,56,7697.0,1,1.000000,H80CS28358_2018
15528,08E01156,H80CS28358,POWELL HEALTH CARE COALITION,Heritage Health Center,Service Delivery Site,Active,Permanent,All Other Clinic Types,,,...,128 N BENT ST POWELL WY 82435,82435.0,56029.0,WY,Park,56,7913.0,4,0.500000,H80CS28358_2019


In [107]:
# Check dtypes and non-null counts of the columns used for the weighted average.
temp_df[['grantnumber_year', 
         'county_fips','number_of_providers','wt']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15530 entries, 0 to 15529
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   grantnumber_year     15530 non-null  object 
 1   county_fips          15530 non-null  float64
 2   number_of_providers  15530 non-null  int64  
 3   wt                   15530 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 485.4+ KB


In [110]:
# get weighted average of `number of providers`
# For each grantnumber_year group, average number_of_providers using wt as weights.
by_grant_year = temp_df.groupby(temp_df.grantnumber_year)
weighted_means = by_grant_year.apply(
    lambda grp: np.average(grp.number_of_providers, weights=grp.wt)
)
temp_df_wa = weighted_means.to_frame().reset_index(level=[0])
temp_df_wa.columns = ['grantnumber_year','number_of_providers_wa']

# merge
# Attach the group-level average to every row of its group.
temp_df = temp_df.merge(temp_df_wa, on=['grantnumber_year'])

# Reduce to one row per distinct (grant_number, year, average) combination.
output_cols = ['grant_number','year','number_of_providers_wa']
provider_df = temp_df[output_cols].drop_duplicates().reset_index(drop=True)

In [111]:
# Save the per-grant, per-year weighted averages to Drive, without the index column.
provider_df.to_csv('/content/drive/MyDrive/ASU/Telehealth/Data/healthcentersite_provider_wa.csv', index=False)