<a href="https://colab.research.google.com/github/kyxyxn/Telemedicine/blob/main/3_CEM_variables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; every data path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd   
from functools import reduce
from scipy.stats import entropy

In [3]:
# Directory on the mounted Drive that holds the UDS workbooks and the derived CSV files.
rootPath = '/content/drive/MyDrive/ASU/CIS791_Chen_Final/uds-data/'
# One "full dataset" workbook per reporting year, 2014 through 2020, in year order.
filenames_uds = ['{}uds-{}-full-dataset.xlsx'.format(rootPath, uds_year)
                 for uds_year in range(2014, 2021)]

In [4]:
# Input tables stored as CSV under rootPath. hc_df supplies the 'Grant Number' column
# from which hc_list is built; visits_df is not referenced again in the cells shown here.
hc_df = pd.read_csv('{}samples_hc.csv'.format(rootPath))
visits_df = pd.read_csv('{}dv_visits.csv'.format(rootPath))

In [5]:
# Grant numbers of the health centers in the sample. Every later table is restricted to this
# list with .isin(hc_list). value_counts() skips missing values and orders by frequency.
grant_number_counts = hc_df['Grant Number'].value_counts()
hc_list = grant_number_counts.index.tolist()

In [6]:
len(hc_list)

891

### Health center (HC) characteristics

#### 1. Number of patients

In [None]:
# Table 4 -- Total number of patients
def get_df(filenames, table):
  """Stack one worksheet from several UDS workbooks into a single DataFrame.

  For each path the sheet named `table` is read, the values of its first data
  row become the column labels (the row itself is kept, so callers still see
  it as a data row), and a `year` column is added that holds the four
  characters following the last 'uds-' in the path.

  Parameters
  ----------
  filenames : sequence of str
      Workbook paths; each is expected to contain 'uds-<YYYY>'.
  table : str
      Sheet name handed to `pd.read_excel`.

  Returns
  -------
  pandas.DataFrame
      The sheets concatenated in input order with a fresh 0..n-1 index, or an
      empty DataFrame when `filenames` is empty.
  """
  frames = []
  for i, filename in enumerate(filenames):
    print(i)
    temp_df = pd.read_excel(filename, sheet_name=table)
    temp_df.columns = temp_df.iloc[0].values.tolist()
    print(temp_df.columns.values.tolist()[:5])
    temp_df['year'] = filename.split('uds-')[-1][:4]
    frames.append(temp_df)
  if not frames:
    # pd.concat raises on an empty list; keep the old "nothing in, nothing out" result
    return pd.DataFrame()
  # One concat at the end instead of DataFrame.append inside the loop
  # (append copied the growing frame each iteration and was removed in pandas 2.0).
  return pd.concat(frames).reset_index(drop=True)

# Table 4 columns that are kept, mapped to readable names. Each payer has a "_17" and an
# "_18older" column; the two are added together below to give one total per payer.
t4_columns = {
    'GrantNumber': 'Grant Number',
    'year': 'year',
    'T4_L6_Ca': 'total_num_patients',
    'T4_L7_Ca': 'total_num_patients_uninsured_17',
    'T4_L7_Cb': 'total_num_patients_uninsured_18older',
    'T4_L8_Ca': 'total_num_patients_medicaid_17',
    'T4_L8_Cb': 'total_num_patients_medicaid_18older',
    'T4_L9_Ca': 'total_num_patients_medicare_17',
    'T4_L9_Cb': 'total_num_patients_medicare_18older',
    'T4_L11_Ca': 'total_num_patients_private_17',
    'T4_L11_Cb': 'total_num_patients_private_18older',
}

t4_df = get_df(filenames_uds, 'Table4')
t4_df = t4_df[list(t4_columns)].rename(columns=t4_columns)
# get_df keeps each sheet's header row as a data row; drop those rows here
t4_df = t4_df[t4_df['Grant Number']!='GrantNumber'].reset_index(drop=True)
# '-' placeholders are treated as missing
t4_df = t4_df.replace(to_replace={'-':np.nan})
# Convert the count columns to float explicitly rather than relying on replace()/fillna()
# to downcast object columns, which newer pandas versions no longer do silently.
count_cols = [name for name in t4_df.columns if name.startswith('total_num_patients')]
t4_df = t4_df.astype({name: float for name in count_cols})
# Rows without a total patient count are dropped; remaining gaps count as zero patients
t4_df = t4_df[t4_df['total_num_patients'].notna()].reset_index(drop=True)
t4_df = t4_df.fillna(0)

for payer in ['medicaid', 'medicare', 'private', 'uninsured']:
  t4_df['total_num_patients_'+payer] = t4_df['total_num_patients_'+payer+'_17'] + t4_df['total_num_patients_'+payer+'_18older']

t4_df = t4_df[['Grant Number', 'year', 'total_num_patients', 'total_num_patients_medicaid', 'total_num_patients_medicare',
               'total_num_patients_private', 'total_num_patients_uninsured']]
t4_df = t4_df[t4_df['Grant Number'].isin(hc_list)].reset_index(drop=True) # Focus

In [172]:
t4_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9513 entries, 0 to 9512
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Grant Number                  9513 non-null   object 
 1   year                          9513 non-null   object 
 2   total_num_patients            9513 non-null   float64
 3   total_num_patients_medicaid   9513 non-null   float64
 4   total_num_patients_medicare   9513 non-null   float64
 5   total_num_patients_private    9513 non-null   float64
 6   total_num_patients_uninsured  9513 non-null   float64
dtypes: float64(5), object(2)
memory usage: 520.4+ KB


#### 2. Financial assistance

In [202]:
# Financial assistance per grant and fiscal year, read from the same workbook that the
# site/county cells below use. Rows without a 'Grantee Name' are discarded.
money_df = pd.read_excel(rootPath + 'hc_site_zips.xlsx')
money_df = money_df[money_df['Grantee Name'].notna()]
money_df = (money_df[['Grant #', 'Fiscal Year', 'Financial Assistance']]
            .sort_values(by=['Grant #', 'Fiscal Year'])
            .reset_index(drop=True)
            .rename(columns={'Grant #':'Grant Number', 'Fiscal Year':'year'}))
# Strip '$' and thousands separators before converting to a number. The pattern is a raw
# string so that the backslash is not an invalid string escape.
money_df['Financial Assistance'] = (money_df['Financial Assistance']
                                    .replace(to_replace={r'\$':'', ',':''}, regex=True)
                                    .astype(float))
# Plain column assignment replaces the column (and its dtype); `.loc[:, col] = ...` writes
# into the existing column and can leave it as object dtype on recent pandas.
money_df['year'] = money_df['year'].astype(int)
money_df = money_df[money_df['Grant Number'].isin(hc_list)].reset_index(drop=True) # Focus
money_df = money_df[money_df['year']>2013].reset_index(drop=True)

### Market characteristics: get county information from HC sites

In [23]:
# Clean grant data from website
## Follow year above
info_df = pd.read_excel(rootPath + 'hc_site_zips.xlsx')
# Some rows carry one of these text labels in the 'Fiscal Year' column instead of a year
# (presumably site rows nested under each center row -- confirm against the workbook).
# Blank the labels and forward-fill so each such row takes the year of the nearest
# preceding row that has one.
non_year_labels = ['All Other Clinic Types', 'School', 'Health Center Location Setting',
                   'Unknown', 'Hospital', 'Nursing Home', 'Domestic Violence',
                   'Correctional Facility']
fiscal_year = info_df['Fiscal Year']
# .ffill() replaces fillna(method="ffill"), which is deprecated in recent pandas
info_df['Fiscal Year'] = fiscal_year.mask(fiscal_year.isin(non_year_labels)).ffill()

In [24]:
# Clean data
# Rows of the H80 program are the center-level records (df_1); all remaining rows form the
# site-level table (df_2), whose first row supplies its column labels.
is_center_row = info_df['Program Name']=='Health Center Program (H80)'
df_1 = info_df[is_center_row].reset_index(drop=True)
df_2 = info_df[~is_center_row].reset_index(drop=True)
df_2 = df_2.drop(columns=['Grantee Name'])
site_header = df_2.iloc[0].values.tolist()
# 'Fiscal Year' was forward-filled in the previous cell, so its cell in the header row holds
# whatever year came before it (the earlier code assumed that value was '2019'). Name the
# column by position instead so the result does not depend on the row order of the workbook.
site_header[df_2.columns.get_loc('Fiscal Year')] = 'year'
df_2.columns = site_header
# The header row repeats throughout the sheet; drop every occurrence
df_2 = df_2[df_2['Services Delivered at Site']!='Services Delivered at Site'].reset_index(drop=True)
df_2 = df_2.rename(columns={'County':'county_site', 'State':'state_site', 'Grant #':'Grant Number', 'BHCMIS Org. ID':'BHCMIS ID',
                            'ZIP':'zip_site', 'State County FIPS Code':'FIPS_site', 'HHS Region':'hhs_site', 'Rural Status':'rural_site'})
df_1 = df_1.rename(columns={'Grant #':'Grant Number', 'Fiscal Year':'year', 'State': 'state_center', 'County':'county_center',
                            'ZIP':'zip_center', 'State County FIPS Code':'FIPS_center', 'HHS Region':'hhs_center', 'Rural Status':'rural_center'})
# Marker columns: after the outer merge in the next cell a missing index_2 identifies
# centers that have no site rows.
df_1['index_1'] = 'center'
df_2['index_2'] = 'site'

In [25]:
# Pair every center record with its site records for the same grant and year. The outer
# join keeps centers that have no site rows; their site columns are left empty.
center_cols = ['Grant Number', 'Grantee Name', 'year', 'state_center', 'county_center',
               'zip_center', 'FIPS_center', 'hhs_center', 'rural_center', 'index_1']
site_cols = ['Grant Number', 'Site Name', 'Health Center Type', 'Health Center Location Type', 'year',
             'state_site', 'county_site', 'zip_site', 'FIPS_site', 'hhs_site', 'rural_site', 'index_2']
temp_df = df_1[center_cols].merge(df_2[site_cols], on=['Grant Number', 'year'], how='outer')
in_sample = temp_df['Grant Number'].isin(hc_list)
temp_df = temp_df[in_sample].reset_index(drop=True) # should be our focus
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59039 entries, 0 to 59038
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Grant Number                 59039 non-null  object
 1   Grantee Name                 59039 non-null  object
 2   year                         59039 non-null  object
 3   state_center                 59039 non-null  object
 4   county_center                59039 non-null  object
 5   zip_center                   59039 non-null  object
 6   FIPS_center                  59039 non-null  object
 7   hhs_center                   59039 non-null  object
 8   rural_center                 59039 non-null  object
 9   index_1                      59039 non-null  object
 10  Site Name                    58796 non-null  object
 11  Health Center Type           58796 non-null  object
 12  Health Center Location Type  58796 non-null  object
 13  state_site                   58

In [26]:
# Centers that don't have sites should be center == site
has_site = temp_df['index_2'].notna()
site_df = temp_df[has_site].reset_index(drop=True)
nosite_df = temp_df[~has_site].reset_index(drop=True)
# fill blanks: copy every center-level location field into its site-level counterpart.
# (state_site was previously assigned from itself, which left it empty for these rows.)
nosite_df['Site Name'] = nosite_df['Grantee Name']
for field in ['state', 'county', 'zip', 'FIPS', 'hhs', 'rural']:
  nosite_df[field+'_site'] = nosite_df[field+'_center']
# Concat
info_df = pd.concat([site_df, nosite_df]).reset_index(drop=True)
# Fill missing values
## zip -- manual values for individual sites
zip_fixes = {
    'GOLDEN HOUSE - Family Violence Center': '54302',
    'The Julian Center Clinic': '46202',
    'Westside Academy @ Blodgett - 312 Oswego St.': '13204',
    'EDDY CENTER ADULT SHELTER': '06457',
    'WYA AT CRYSALIS': '06450',
}
for site_name, zip_code in zip_fixes.items():
  info_df.loc[info_df['Site Name']==site_name, 'zip_site'] = zip_code
# All other missing are mostly domestic violence HCs
info_df = info_df[info_df['zip_site'].notna()].reset_index(drop=True)
# First five characters as an integer; only used to count distinct ZIP codes, so the
# leading zeros lost by int() do not matter.
info_df['zip5_site'] = [int(str(i)[:5]) for i in info_df['zip_site']]

## county -- manual county FIPS codes for individual sites
fips_fixes = {
    'GOLDEN HOUSE - Family Violence Center': '55009',
    'The Julian Center Clinic': '18097',
    'EDDY CENTER ADULT SHELTER': '09007',
    'Plainfield Central School SBHC': '09015',
    'WYA AT CRYSALIS': '09009',
    'Westside Academy @ Blodgett - 312 Oswego St.': '36067',
    'Santa Fe Suites Outreach Service Site': '35049',
}
for site_name, fips_code in fips_fixes.items():
  info_df.loc[info_df['Site Name']==site_name, 'FIPS_site'] = fips_code

## state -- rebuild state_site from the first two characters of the county FIPS code
info_df['state_site'] = info_df['state_site'].replace(to_replace={'XX':np.nan})
info_df['state_fips_site'] = [str(i)[:2] for i in info_df['FIPS_site']]
# One state label per FIPS prefix (first occurrence wins, so site rows take precedence over
# back-filled center rows); a prefix with several labels would otherwise duplicate rows in
# the merge below.
fips_df = (info_df[['state_fips_site', 'state_site']]
           .dropna()
           .drop_duplicates(subset=['state_fips_site'])
           .sort_values(by=['state_fips_site'])
           .reset_index(drop=True))
# Inner merge: rows whose FIPS prefix has no labelled state are dropped here
info_df = pd.merge(info_df.drop(columns=['state_site']), fips_df, on=['state_fips_site'])

# `year` is still object-typed at this point, so compare numerically; a plain `!= 2013`
# never matches string years.
info_df = info_df[pd.to_numeric(info_df['year'], errors='coerce')!=2013].reset_index(drop=True)

In [27]:
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59003 entries, 0 to 59002
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Grant Number                 59003 non-null  object
 1   Grantee Name                 59003 non-null  object
 2   year                         59003 non-null  object
 3   state_center                 59003 non-null  object
 4   county_center                59003 non-null  object
 5   zip_center                   59003 non-null  object
 6   FIPS_center                  59003 non-null  object
 7   hhs_center                   59003 non-null  object
 8   rural_center                 59003 non-null  object
 9   index_1                      59003 non-null  object
 10  Site Name                    59003 non-null  object
 11  Health Center Type           58760 non-null  object
 12  Health Center Location Type  58760 non-null  object
 13  county_site                  59

In [28]:
# For proportion
def get_unique(dataframe, var):
  """Attach a per-(Grant Number, year) equal-share weight for column `var`.

  For every ('Grant Number', 'year') group the number of distinct non-missing
  values of `var` is counted, and its reciprocal is stored in a new column
  named 'num_unique_<var>'. Despite the name, that column holds 1/count, not
  the count. A group with no non-missing value has count 0 and gets `inf`.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Must contain 'Grant Number', 'year' and `var`.
  var : str
      Column whose distinct values are counted.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is not modified) with the weight column appended.
      The join is an inner merge on the group keys.
  """
  weight_col = 'num_unique_{}'.format(var)
  # Select `var` before counting so that only this column is aggregated
  # (counting every column and then keeping one did a lot of discarded work).
  temp_gb = (dataframe.groupby(['Grant Number', 'year'])[var]
             .nunique()
             .rename(weight_col)
             .reset_index())
  temp_gb[weight_col] = 1/temp_gb[weight_col]
  return pd.merge(dataframe, temp_gb, on=['Grant Number', 'year'])

# Add one reciprocal-count weight column per site attribute, in this order.
for site_var in ['zip5_site', 'FIPS_site', 'hhs_site', 'rural_site']:
  info_df = get_unique(dataframe=info_df, var=site_var)

#### 1. Population

In [29]:
# County total population (column DP05_0001E) from one DP05 file per year, 2014-2020.
pop_frames = []
for i in range(2014, 2021):
  print(i)
  # Only two columns are needed; reading just those as text avoids parsing the whole
  # wide table and the per-chunk dtype guessing that goes with it.
  temp_df = pd.read_csv('/content/drive/MyDrive/ASU/Telehealth/Data/Controls/ACSDP5Y{}.DP05.csv'.format(i),
                        usecols=['GEO_ID', 'DP05_0001E'], dtype=str)
  temp_df = temp_df.replace(to_replace={'(X)':np.nan})
  # assign() returns a new frame, so nothing is written into a slice
  pop_frames.append(temp_df[['GEO_ID', 'DP05_0001E']].assign(year=i))
  print("Done!")

# A single concat replaces DataFrame.append in the loop (removed in pandas 2.0)
pop_df = pd.concat(pop_frames)
# Rows whose GEO_ID is the literal 'id' are label rows, not counties
pop_df = pop_df[pop_df['GEO_ID']!='id'].reset_index(drop=True)
# The county FIPS code is the last five characters of GEO_ID
pop_df['county_fips'] = pop_df['GEO_ID'].astype(str).str[-5:]
pop_df = pop_df.rename(columns={'DP05_0001E':'county_population'})
pop_df = pop_df[['county_fips', 'year', 'county_population']]
pop_df['county_population'] = pop_df['county_population'].astype(float)

2014


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2015


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2016
Done!
2017


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2018


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2019
Done!
2020
Done!


#### 2. Median household income

In [71]:
# County median household income from one S1903 file per year, 2014-2020.
income_frames = []
for i in range(2014, 2021):
  print(i)
  # The column that is read changes with the 2017 file
  income_col = 'S1903_C02_001E' if i<2017 else 'S1903_C03_001E'
  temp_df = pd.read_csv('/content/drive/MyDrive/ASU/Telehealth/Data/Controls/ACSST5Y{}.S1903.csv'.format(i),
                        usecols=['GEO_ID', income_col], dtype=str)
  temp_df = temp_df.replace(to_replace={'(X)':np.nan})
  temp_df = (temp_df[['GEO_ID', income_col]]
             .rename(columns={income_col:'median_household_income'})
             .assign(year=i))  # new frame: no write into a slice
  income_frames.append(temp_df)
  print("Done!")

# A single concat replaces DataFrame.append in the loop (removed in pandas 2.0)
income_df = pd.concat(income_frames)
# Rows whose GEO_ID is the literal 'id' are label rows, not counties
income_df = income_df[income_df['GEO_ID']!='id'].reset_index(drop=True)
# The county FIPS code is the last five characters of GEO_ID
income_df['county_fips'] = income_df['GEO_ID'].astype(str).str[-5:]
income_df = income_df[['county_fips','year','median_household_income']]
income_df['median_household_income'] = income_df['median_household_income'].replace(to_replace={'-':np.nan})

# Manual fill-ins for three county-years (source of the values is not recorded here).
income_df.loc[(income_df['county_fips']=='48301')&(income_df['year']==2014), 'median_household_income'] = 48125
income_df.loc[(income_df['county_fips']=='35039')&(income_df['year']==2018), 'median_household_income'] = 36687 #(33422+39952)/2
income_df.loc[(income_df['county_fips']=='48243')&(income_df['year']==2020), 'median_household_income'] = 52982

income_df['median_household_income'] = income_df['median_household_income'].astype(float)

2014
Done!
2015
Done!
2016
Done!
2017
Done!
2018
Done!
2019
Done!
2020
Done!


#### 3. Race

In [83]:
# County population by race/ethnicity from the DP05 files, 2014-2020.
col_list = ['county_population', 'county_population_white', 'county_population_black', 'county_population_native_american', 'county_population_asian', 'county_population_islander', 'county_population_hispanic']
# DP05 column codes, listed in the same order as col_list. A different set of codes is read
# from the 2017 file onwards.
codes_through_2016 = ['DP05_0001E', 'DP05_0032E', 'DP05_0033E', 'DP05_0034E', 'DP05_0039E', 'DP05_0047E', 'DP05_0066E']
codes_from_2017 = ['DP05_0001E', 'DP05_0037E', 'DP05_0038E', 'DP05_0039E', 'DP05_0044E', 'DP05_0052E', 'DP05_0071E']

race_frames = []
for i in range(2014, 2021):
  print(i)
  codes = codes_through_2016 if i<=2016 else codes_from_2017
  # Read only the needed columns, as text
  temp_df = pd.read_csv('/content/drive/MyDrive/ASU/Telehealth/Data/Controls/ACSDP5Y{}.DP05.csv'.format(i),
                        usecols=['GEO_ID'] + codes, dtype=str)
  temp_df = temp_df.replace(to_replace={'(X)':np.nan})
  temp_df = (temp_df[['GEO_ID'] + codes]
             .rename(columns=dict(zip(codes, col_list)))
             .assign(year=i))  # new frame: avoids the SettingWithCopy warning
  race_frames.append(temp_df)
  print("Done!")

# A single concat replaces DataFrame.append in the loop (removed in pandas 2.0)
race_df = pd.concat(race_frames)
# The county FIPS code is the last five characters of GEO_ID
race_df['county_fips'] = race_df['GEO_ID'].astype(str).str[-5:]
# Rows whose GEO_ID is the literal 'id' are label rows, not counties
race_df = race_df[race_df['GEO_ID']!='id'].reset_index(drop=True)
race_df = race_df.drop(columns=['GEO_ID'])
race_df = race_df.astype({col: float for col in col_list})

2014


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2015


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2016
Done!
2017


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2018


  exec(code_obj, self.user_global_ns, self.user_ns)


Done!
2019
Done!
2020
Done!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [84]:
# Attach county population, race and income to every site row by county FIPS code and year.
# Plain column assignment replaces the column and its dtype; `.loc[:, 'year'] = ...` writes
# into the existing object column and can leave it object-typed on recent pandas.
info_df['year'] = info_df['year'].astype(int)
info_df = info_df[info_df['year']!=2013].reset_index(drop=True)
# race_df carries its own county_population; drop it so the column is not duplicated
county_tables = [pop_df, race_df.drop(columns=['county_population']), income_df]
temp_df = reduce(lambda x, y: pd.merge(x, y, on=['FIPS_site', 'year'], how='outer'),
                 [info_df] + [table.rename(columns={'county_fips':'FIPS_site'}) for table in county_tables])
# The outer merges also bring in counties without any site; keep only rows that belong to a grant
temp_df = temp_df[temp_df['Grant Number'].notna()].reset_index(drop=True)
info_df = temp_df.copy()

### County

In [47]:
# One row per (grant, year, county) together with that grant-year's county weight.
county_df = (info_df[['Grant Number', 'year', 'FIPS_site', 'num_unique_FIPS_site']]
             .drop_duplicates()
             .sort_values(by=['Grant Number', 'year', 'num_unique_FIPS_site', 'FIPS_site'])
             .reset_index(drop=True))
# Compare numerically so 2013 is excluded whether `year` holds strings or integers
# (comparing against the string '2013' does nothing once `year` is integer).
county_df = county_df[pd.to_numeric(county_df['year'], errors='coerce')!=2013].reset_index(drop=True)

In [222]:
# Per health center and year: average the county figures over the distinct counties in which
# the center has sites. num_unique_FIPS_site holds 1/(number of distinct counties), so the
# sum of weight * value within a (Grant Number, year) group is an equal-weight mean.

# All joins below use an integer `year`. Plain column assignment replaces the column and its
# dtype, which `.loc[:, 'year'] = ...` does not reliably do on recent pandas.
county_df['year'] = county_df['year'].astype(int)
pop_df['year'] = pop_df['year'].astype(int)
income_df['year'] = income_df['year'].astype(int)
race_df['year'] = race_df['year'].astype(int)

def _weighted_hc_sums(county_table, value_cols):
  """Join `county_table` onto county_df and return, per (Grant Number, year), the sum of
  num_unique_FIPS_site * value for every column in `value_cols`."""
  merged = pd.merge(county_df, county_table.rename(columns={'county_fips':'FIPS_site'}),
                    on=['year', 'FIPS_site'], how='left')
  for col in value_cols:
    merged[col] = merged['num_unique_FIPS_site']*merged[col]
  # Aggregate only the requested columns (summing the whole frame also "summed" the FIPS
  # strings). min_count=1 leaves a group missing when none of its counties matched, rather
  # than reporting 0. Groups with only some counties missing still sum the available ones.
  return merged.groupby(['Grant Number', 'year'])[value_cols].sum(min_count=1).reset_index()

# Population
temp_df_gb_pop = _weighted_hc_sums(pop_df, ['county_population']).rename(columns={'county_population':'hc_population'})

# Income
temp_df_gb_inc = _weighted_hc_sums(income_df, ['median_household_income']).rename(columns={'median_household_income':'hc_household_income'})

# Race: each group's share of the weighted population, in percent
race_cols = ['county_population_white', 'county_population_black', 'county_population_native_american',
             'county_population_asian', 'county_population_islander', 'county_population_hispanic']
race_sums = _weighted_hc_sums(race_df, race_cols + ['county_population'])
temp_df_gb_race = race_sums[['Grant Number', 'year']].copy()
for col in race_cols:
  # county_population_white -> hc_white, and so on
  temp_df_gb_race[col.replace('county_population', 'hc')] = 100*(race_sums[col]/race_sums['county_population'])

# ## Merge all
temp_df = reduce(lambda x, y: pd.merge(x, y, on=['Grant Number', 'year'], how='outer'),[temp_df_gb_pop, temp_df_gb_inc, temp_df_gb_race])
ctrl_df = temp_df.copy()

In [236]:
## Merge all
# Patient counts, grant funding and county-based controls are combined on
# (Grant Number, year). `year` is made integer in all three first: t4_df takes it from the
# file name as text. Plain column assignment is used because `.loc[:, 'year'] = ...` can
# leave the column object-typed on recent pandas.
t4_df['year'] = t4_df['year'].astype(int)
money_df['year'] = money_df['year'].astype(int)
ctrl_df['year'] = ctrl_df['year'].astype(int)

# Outer joins keep a grant-year that appears in any one of the three tables
temp_df = reduce(lambda x, y: pd.merge(x, y, on=['Grant Number', 'year'], how='outer'), [t4_df, money_df, ctrl_df])
temp_df = temp_df.rename(columns={'Financial Assistance':'grant_amount'})

In [242]:
# Write the merged table to ctrl_df.csv under rootPath, without the row index.
temp_df.to_csv(rootPath+'ctrl_df.csv', index=False)