# Generate Payment and Value of Care dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
from IPython.utils import io
import sys
import time

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

main_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

## Define Custom Functions

In [2]:
def curate(df):

    try:
        df = df[df['Facility ID'] != np.nan]
        df['Facility ID'] = df['Facility ID'].values.astype(str)
        
        ids = df['Facility ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['Facility ID'] = ids2
        
    except:
        pass
    try:
        df = df[df['Facility Name'] != np.nan]
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    return df


def rename_and_fill(df):
    
    cols1 = ['Provider ID', "Measure Start Date", "Measure End Date",
             'Hospital Name', 'Hospital name', 'County name',
             'Phone number', 'Payment measure name', 'Payment measure ID', 
             'Payment category', 'Lower estimate', 'Higher estimate', 
             'Payment footnote', 'Value of care display name', 
             'Value of care display ID', 'Value of care category', 
             'Value of care footnote', 
             'Measure start date', 'Measure end date',
             'Measure ID',
             'Measure name', 'Category', 'Footnote',
             'Address 1', 'City/Town', 'County/Parish',
             'Telephone Number',
             ]
    
    
    cols2 = ['Facility ID', "Start Date", "End Date",
             'Facility Name', 'Facility Name', 'County Name',
             'Phone Number', 'Payment Measure Name', 'Payment Measure ID', 
             'Payment Category', 'Lower Estimate', 'Higher Estimate', 
             'Payment Footnote', 'Value of Care Display Name', 
             'Value of Care Display ID', 'Value of Care Category', 
             'Value of Care Footnote', 
             'Start Date', 'End Date',
             'Payment Measure ID',
             'Payment Measure Name', 'Payment Category', 'Payment Footnote',
             'Address', 'City', 'County Name',
             'Phone Number',
             ]
    
    for i, col in enumerate(cols1):
        if col in list(df):
            df.rename(columns={col: cols2[i]}, inplace=True)
                    
    l = list(df)
    l = list(set([x for x in l if l.count(x) > 1]))
    if len(l) > 0:
        print('duplicates:', l)
        sys.exit()
        
    return df


def process2(df, lists, yr, mo):
    df = rename_and_fill(df)
    df = curate(df)
    lists.append(list(df))
    df['file_month'] = [mo]* df.shape[0]
    df['file_year'] = [yr]* df.shape[0]
    df = df.reindex(sorted(df.columns), axis=1)
    return df, lists


## Load Files

In [3]:
df_list = []
lists = []

yrs = ['2023', '2023', '2023', '2023',
       '2022', '2022', '2022', '2022',
       '2021','2021','2021', '2021', '2021',
       '2020', '2020', '2020', '2020', 
       '2019', '2019', '2019', '2019', 
       '2018', '2018', '2018', '2018',
       '2017', '2017', '2017',
       '2016', '2016', '2016', '2016',
       '2015', '2015', '2015',
       ]

mos = ['01', '04', '07', '10',
       '01', '04', '07', '10',
       '01', '03', '04', '07', '10', 
       '10', '07', '04', '01', 
       '10', '07', '04', '03', 
       '10', '07', '05', '01',
       '10', '07', '04',
       '12', '11', '08', '05',
       '12', '10', '07',
       ]

subdirs = ['2023/hospitals_01_2023/Payment_and_Value_of_Care-Hospital.csv', 
           '2023/hospitals_04_2023/Payment_and_Value_of_Care-Hospital.csv',
           '2023/hospitals_07_2023/Payment_and_Value_of_Care-Hospital.csv',
           '2023/hospitals_10_2023/Payment_and_Value_of_Care-Hospital.csv',
           
           '2022/hospitals_01_2022/Payment_and_Value_of_Care-Hospital.csv', 
           '2022/hospitals_04_2022/Payment_and_Value_of_Care-Hospital.csv',
           '2022/hospitals_07_2022/Payment_and_Value_of_Care-Hospital.csv',
           '2022/hospitals_10_2022/Payment_and_Value_of_Care-Hospital.csv',
           
           '2021/hospitals_01_2021/Payment_and_Value_of_Care-Hospital.csv',
           '2021/hospitals_03_2021/Payment_and_Value_of_Care-Hospital.csv',
           '2021/hospitals_04_2021/Payment_and_Value_of_Care-Hospital.csv',
           '2021/hospitals_07_2021/Payment_and_Value_of_Care-Hospital.csv',
           '2021/hospitals_10_2021/Payment_and_Value_of_Care-Hospital.csv',
           
           '2020/hospitals_archive_10_2020/Payment_and_value_of_care_Hospital.csv',
           '2020/hospitals_archive_07_2020/Payment_and_value_of_care_Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200422/Payment and Value of Care - Hospital.csv',
           '2020/HOSArchive_Revised_Flatfiles_20200129/Payment and Value of Care - Hospital.csv',
           
           '2019/HOSArchive_Revised_Flatfiles_20191030/Payment and Value of Care - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190702/Payment and Value of Care - Hospital.csv',
           '2019/HOSArchive_Revised_FlatFiles_20190424/Payment and Value of Care - Hospital.csv',
           '2019/HOSArchive_Revised_Flatfiles_20190321/Payment and Value of Care - Hospital.csv',
           
           '2018/HOSArchive_Revised_FlatFiles_20181031/Payment and Value of Care - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180725/Payment and Value of Care - Hospital.csv', 
           '2018/HOSArchive_Revised_FlatFiles_20180523/Payment and Value of Care - Hospital.csv',
           '2018/HOSArchive_Revised_FlatFiles_20180126/Payment and Value of Care - Hospital.csv',
           
           '2017/HOSArchive_Revised_FlatFiles_20171024/Payment and Value of Care - Hospital.csv',
           '2017/HOSArchive_Revised_FlatFiles_20170726/Payment and Value of Care - Hospital.csv', 
           '2017/HOSArchive_Revised_Flatfiles_20170428/Payment and Value of Care - Hospital.csv', 
           
           '2016/HOSArchive_Revised_Flatfiles_20161219/Payment and Value of Care - Hospital.csv',
           '2016/Hospital_Revised_FlatFiles_20161110/Payment and Value of Care - Hospital.csv', 
           '2016/HOSArchive_Revised_FlatFiles_20160810/Payment and Value of Care - Hospital.csv',
           '2016/HOSArchive_Revised_FlatFiles_20160504/Payment and Value of Care - Hospital.csv',
           
           '2015/HOSArchive_Revised_FlatFiles_20151210/Payment and Value of Care - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20151008/Payment and Value of Care - Hospital.csv',
           '2015/HOSArchive_Revised_FlatFiles_20150716/Payment - Hospital.csv',
           
           ]

for i, subdir in enumerate(subdirs):
    with io.capture_output() as captured: df = pd.read_csv(main_dir + subdir, encoding = "ISO-8859-1")
    print(subdir + ' :  (rows, columns) =', df.shape)
    df, lists = process2(df, lists, yrs[i], mos[i])
    df_list.append(df)

df = pd.concat(df_list)

print('df.shape:', df.shape)
df = df[~df['Denominator'].isin([np.nan, float("NaN"), 'Not Available'])]
print('df.shape:', df.shape)

df.head()

2023/hospitals_01_2023/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18704, 22)
2023/hospitals_04_2023/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18692, 22)
2023/hospitals_07_2023/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18664, 22)
2023/hospitals_10_2023/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18664, 22)
2022/hospitals_01_2022/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18704, 22)
2022/hospitals_04_2022/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18704, 22)
2022/hospitals_07_2022/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18684, 22)
2022/hospitals_10_2022/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18692, 22)
2021/hospitals_01_2021/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18840, 22)
2021/hospitals_03_2021/Payment_and_Value_of_Care-Hospital.csv :  (rows, columns) = (18840, 22)
2021/hospitals_04_2021/Payment_and_Value_of_Care-H

Unnamed: 0,Address,City,County Name,Denominator,End Date,Facility ID,Facility Name,Higher Estimate,Lower Estimate,Payment,Payment Category,Payment Footnote,Payment Measure ID,Payment Measure Name,Phone Number,Start Date,State,Value of Care Category,Value of Care Display ID,Value of Care Display Name,Value of Care Footnote,ZIP Code,file_month,file_year
0,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,311,06/30/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,"$29,027","$24,890","$26,894",No Different Than the National Average Payment,,PAYM_30_AMI,Payment for heart attack patients,(334) 793-8701,07/01/2018,AL,Average Mortality and Average Payment,MORT_PAYM_30_AMI,Value of Care Heart Attack measure,,36301,1,2023
1,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,620,06/30/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,"$18,783","$16,947","$17,835",No Different Than the National Average Payment,,PAYM_30_HF,Payment for heart failure patients,(334) 793-8701,07/01/2018,AL,Better Mortality and Average Payment,MORT_PAYM_30_HF,Value of Care Heart Failure measur,,36301,1,2023
2,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,397,06/30/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,"$20,653","$19,430","$20,182",No Different Than the National Average Payment,,PAYM_30_PN,Payment for pneumonia patients,(334) 793-8701,07/01/2018,AL,Average Mortality and Average Payment,MORT_PAYM_30_PN,Value of Care Pneumonia measure,,36301,1,2023
3,1108 ROSS CLARK CIRCLE,DOTHAN,HOUSTON,101,03/31/2021,10001,SOUTHEAST HEALTH MEDICAL CENTER,"$23,309","$20,104","$21,623",No Different Than the National Average Payment,,PAYM_90_HIP_KNEE,Payment for hip/knee replacement patients,(334) 793-8701,04/01/2018,AL,Average Complications and Average Payment,COMP_PAYM_90_HIP_KNEE,Value of Care hip/knee replacement,,36301,1,2023
4,2505 U S HIGHWAY 431 NORTH,BOAZ,MARSHALL,48,06/30/2021,10005,MARSHALL MEDICAL CENTERS,"$29,475","$22,668","$25,814",No Different Than the National Average Payment,,PAYM_30_AMI,Payment for heart attack patients,(256) 593-8310,07/01/2018,AL,Average Mortality and Average Payment,MORT_PAYM_30_AMI,Value of Care Heart Attack measure,,35957,1,2023


In [4]:
df = df.filter(items=['Facility ID', 'Facility Name', 'file_month', 'file_year', 
                      'Start Date', 'End Date', 'Denominator',
                      'Payment Measure ID', 'Payment Measure Name', 
                      'Payment Category', 'Payment Footnote', 'Payment', 
                      'Higher Estimate', 'Lower Estimate',
                      'Value of Care Category', 'Value of Care Display ID', 
                      'Value of Care Display Name', 'Value of Care Footnote',
                      ], axis=1)

d = {'No Different Than the National Average Payment': 'No different than national average payment',
     'Less Than the National Average Payment': 'Less than national average payment',
     'Greater Than the National Average Payment': 'Greater than national average payment', 
     'No Different than the National Average Payment': 'No different than national average payment',
     'Greater than the National Average Payment': 'Greater than national average payment', 
     'Less than the National Average Payment': 'Less than national average payment'}
df['Payment Category'].replace(to_replace=d, inplace=True)

d = {'Value of Care Heart Attack measure': 'Value of Care - Heart Attack',
     'Value of Care Heart Failure measur': 'Value of Care - Heart Failure',
     'Value of Care Pneumonia measure': 'Value of Care - Pneumonia',
     'Value of Care Heart Failure measure': 'Value of Care - Heart Failure',
     'Value of Care hip/knee replacement measure': 'Value of Care - hip/knee replacement',
     'Value of Care hip/knee replacement': 'Value of Care - hip/knee replacement',
     np.nan: 'Not Available'}
df['Value of Care Display Name'].replace(to_replace=d, inplace=True)

d = {'Average Mortality and Average Payment': 'Average mortality and average payment',     
     'Average Mortality and Higher Payment': 'Average mortality and higher payment',
     'Average Mortality and Lower Payment': 'Average mortality and lower payment',
     'Better Mortality and Average Payment': 'Better mortality and average payment',
     'Better Mortality and Higher Payment': 'Better mortality and higher payment',
     'Better Mortality and Lower Payment': 'Better mortality and lower payment',
     'Worse Mortality and Average Payment': 'Worse mortality and average payment', 
     'Worse Mortality and Higher Payment': 'Worse mortality and higher payment',
     'Worse Mortality and Lower Payment': 'Worse mortality and lower payment',
     'Average Complications and Average Payment':'Average complications and average payment',
     'Average Complications and Higher Payment': 'Average complications and higher payment',
     'Average Complications and Lower Payment': 'Average complications and lower payment',
     'Better Complications and Average Payment': 'Better complications and average payment',
     'Better Complications and Higher Payment': 'Better complications and higher payment',
     'Better Complications and Lower Payment': 'Better complications and lower payment',
     'Worse Complications and Average Payment': 'Worse complications and average payment',
     'Worse Complications and Higher Payment': 'Worse complications and higher payment',
     'Worse Complications and Lower Payment': 'Worse complications and lower payment',
      np.nan: 'Not Available'}
df['Value of Care Category'].replace(to_replace=d, inplace=True)
df.head()


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,Start Date,End Date,Denominator,Payment Measure ID,Payment Measure Name,Payment Category,Payment Footnote,Payment,Higher Estimate,Lower Estimate,Value of Care Category,Value of Care Display ID,Value of Care Display Name,Value of Care Footnote
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,07/01/2018,06/30/2021,311,PAYM_30_AMI,Payment for heart attack patients,No different than national average payment,,"$26,894","$29,027","$24,890",Average mortality and average payment,MORT_PAYM_30_AMI,Value of Care - Heart Attack,
1,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,07/01/2018,06/30/2021,620,PAYM_30_HF,Payment for heart failure patients,No different than national average payment,,"$17,835","$18,783","$16,947",Better mortality and average payment,MORT_PAYM_30_HF,Value of Care - Heart Failure,
2,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,07/01/2018,06/30/2021,397,PAYM_30_PN,Payment for pneumonia patients,No different than national average payment,,"$20,182","$20,653","$19,430",Average mortality and average payment,MORT_PAYM_30_PN,Value of Care - Pneumonia,
3,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,04/01/2018,03/31/2021,101,PAYM_90_HIP_KNEE,Payment for hip/knee replacement patients,No different than national average payment,,"$21,623","$23,309","$20,104",Average complications and average payment,COMP_PAYM_90_HIP_KNEE,Value of Care - hip/knee replacement,
4,10005,MARSHALL MEDICAL CENTERS,1,2023,07/01/2018,06/30/2021,48,PAYM_30_AMI,Payment for heart attack patients,No different than national average payment,,"$25,814","$29,475","$22,668",Average mortality and average payment,MORT_PAYM_30_AMI,Value of Care - Heart Attack,


In [5]:
df['Measure Name'] = df['Payment Measure Name'] + ' (' + df['Payment Measure ID'] + ')'
df = df.filter(items=['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Measure Name', 
                      'Denominator', 'Payment Category', 'Payment', 'Higher Estimate', 
                      'Lower Estimate', 'Value of Care Category', 'Start Date', 'End Date'])

labs = ['Payment', 'Higher Estimate', 'Lower Estimate']
for lab in labs:
    scores = []
    for s in df[lab].tolist():
        if s == 'Not Available':
            s = np.nan
        else:
            s = s.strip('$')
            s = s.replace(',', '')
            s = float(s)
        scores.append(s)
    df[lab] = scores

In [6]:
cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']
main_df = pd.DataFrame(columns=cols)
Measures = sorted(df['Measure Name'].unique())

for i, mi in enumerate(Measures):    
    tdf = df[df['Measure Name'] == mi]
    measures = sorted(tdf['Measure Name'].unique())
    
    df2 = pd.DataFrame(columns=cols)

    for j, m in enumerate(measures):
        tdf2 = tdf[tdf['Measure Name'] == m]
        for n in list(tdf2):
            if n == 'Measure Name' or n in cols:
                continue
            else:
                tdf2[n] = pd.to_numeric(tdf2[n], errors='coerce')
                tdf2.rename(columns={n: m + ' (' + n + ')'}, inplace=True)
        
        tdf2.drop(labels=['Measure Name'], axis=1, inplace=True)
        
        df2 = df2.merge(tdf2, on=cols, how='outer')
    
    main_df = main_df.merge(df2, on=cols, how='outer')

tdf = main_df.copy(deep=True)
del df2, main_df

print(tdf.shape)
tdf = tdf.loc[:, ~tdf.T.duplicated(keep='first')]
tdf.dropna(how='all', axis=1, inplace=True)
print(tdf.shape)
tdf.drop_duplicates(inplace=True)
print(tdf.shape)
tdf.drop_duplicates(subset = cols, inplace=True)
print(tdf.shape)
tdf.head()


(206890, 30)
(206890, 22)
(206890, 22)
(206890, 22)


Unnamed: 0,Facility ID,Facility Name,file_month,file_year,Payment for heart attack patients (PAYM_30_AMI) (Denominator),Payment for heart attack patients (PAYM_30_AMI) (Payment),Payment for heart attack patients (PAYM_30_AMI) (Higher Estimate),Payment for heart attack patients (PAYM_30_AMI) (Lower Estimate),Start Date,End Date,Payment for heart failure patients (PAYM_30_HF) (Denominator),Payment for heart failure patients (PAYM_30_HF) (Payment),Payment for heart failure patients (PAYM_30_HF) (Higher Estimate),Payment for heart failure patients (PAYM_30_HF) (Lower Estimate),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Denominator),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Payment),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Higher Estimate),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Lower Estimate),Payment for pneumonia patients (PAYM_30_PN) (Denominator),Payment for pneumonia patients (PAYM_30_PN) (Payment),Payment for pneumonia patients (PAYM_30_PN) (Higher Estimate),Payment for pneumonia patients (PAYM_30_PN) (Lower Estimate)
0,10001,SOUTHEAST HEALTH MEDICAL CENTER,1,2023,311.0,26894.0,29027.0,24890.0,07/01/2018,06/30/2021,620.0,17835.0,18783.0,16947.0,,,,,397.0,20182.0,20653.0,19430.0
1,10005,MARSHALL MEDICAL CENTERS,1,2023,48.0,25814.0,29475.0,22668.0,07/01/2018,06/30/2021,142.0,17287.0,19040.0,15749.0,,,,,343.0,18639.0,19563.0,18407.0
2,10006,NORTH ALABAMA MEDICAL CENTER,1,2023,287.0,28589.0,30977.0,26395.0,07/01/2018,06/30/2021,451.0,18745.0,19918.0,17693.0,,,,,511.0,20218.0,20779.0,19452.0
3,10011,ST. VINCENT'S EAST,1,2023,128.0,29000.0,32482.0,26238.0,07/01/2018,06/30/2021,256.0,17700.0,19110.0,16428.0,,,,,316.0,21250.0,22079.0,19527.0
4,10012,DEKALB REGIONAL MEDICAL CENTER,1,2023,52.0,23731.0,27002.0,20779.0,07/01/2018,06/30/2021,96.0,18265.0,20364.0,16445.0,,,,,191.0,18938.0,19683.0,18699.0


In [7]:
ttdf = tdf.drop(labels=['Start Date', 'End Date'], axis=1)
ttdf.drop_duplicates(inplace=True)
print(ttdf.shape)


(206890, 20)


In [8]:

start_time = time.time()
ttdf['marker'] = ttdf['Facility ID'] + ' | ' + ttdf['Facility Name']  + ' | ' + ttdf['file_month'] + ' | ' + ttdf['file_year']
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
column = ttdf.pop('marker')
ttdf.insert(0, column.name, column)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf.drop_duplicates(inplace=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
collapsed_df = ttdf.groupby(ttdf.marker).apply(lambda group: group.ffill().bfill().head(1))
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf = collapsed_df.reset_index(drop=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

start_time = time.time()
ttdf.drop_duplicates(inplace=True)
end_time = time.time()
print("Run time = {:.3f} seconds".format(end_time - start_time))
print(ttdf.shape)

ttdf.head()

Run time = 0.132 seconds
(206890, 21)
Run time = 0.002 seconds
(206890, 21)
Run time = 0.288 seconds
(206890, 21)
Run time = 2721.584 seconds
(206890, 21)
Run time = 0.073 seconds
(140392, 21)
Run time = 0.197 seconds
(140392, 21)


Unnamed: 0,marker,Facility ID,Facility Name,file_month,file_year,Payment for heart attack patients (PAYM_30_AMI) (Denominator),Payment for heart attack patients (PAYM_30_AMI) (Payment),Payment for heart attack patients (PAYM_30_AMI) (Higher Estimate),Payment for heart attack patients (PAYM_30_AMI) (Lower Estimate),Payment for heart failure patients (PAYM_30_HF) (Denominator),Payment for heart failure patients (PAYM_30_HF) (Payment),Payment for heart failure patients (PAYM_30_HF) (Higher Estimate),Payment for heart failure patients (PAYM_30_HF) (Lower Estimate),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Denominator),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Payment),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Higher Estimate),Payment for hip/knee replacement patients (PAYM_90_HIP_KNEE) (Lower Estimate),Payment for pneumonia patients (PAYM_30_PN) (Denominator),Payment for pneumonia patients (PAYM_30_PN) (Payment),Payment for pneumonia patients (PAYM_30_PN) (Higher Estimate),Payment for pneumonia patients (PAYM_30_PN) (Lower Estimate)
0,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2018,716.0,22178.0,23323.0,21108.0,824.0,16525.0,17335.0,15754.0,335.0,26863.0,28041.0,25762.0,573.0,17230.0,18106.0,16358.0
1,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2020,668.0,24216.0,25579.0,22939.0,828.0,17716.0,18523.0,16905.0,284.0,24984.0,26172.0,23894.0,531.0,19203.0,20214.0,18191.0
2,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 01...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1,2021,574.0,24934.0,26429.0,23490.0,823.0,17854.0,18676.0,17061.0,245.0,22216.0,23313.0,21191.0,536.0,20216.0,21271.0,19186.0
3,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 03...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,3,2019,715.0,23394.0,24641.0,22216.0,813.0,17041.0,17849.0,16268.0,310.0,25812.0,26982.0,24704.0,534.0,18281.0,19236.0,17354.0
4,010001 | SOUTHEAST ALABAMA MEDICAL CENTER | 03...,10001,SOUTHEAST ALABAMA MEDICAL CENTER,3,2021,574.0,24934.0,26429.0,23490.0,823.0,17854.0,18676.0,17061.0,245.0,22216.0,23313.0,21191.0,536.0,20216.0,21271.0,19186.0


## Save dataframe

In [9]:
ttdf.drop(labels=['marker'], axis=1, inplace=True)
ttdf.to_pickle('~/GitHub/hospitals-data-archive/dataframes/partial_dataframes/Payment_and_ValueOfCare_df.pkl.gz', protocol=5, compression='gzip')


In [10]:
m1 = list(ttdf)
ls = ['Facility ID','Facility Name','file_month','file_year']
for l in ls: 
    m1.remove(l)

## Save measurement dates

In [11]:
# Columns to keep as is
id_cols = ['Facility ID', 'Facility Name', 'file_month', 'file_year', 'Start Date', 'End Date']

# Melt the specific columns and create the 'Measure' and 'Score' columns
measures_df = df.melt(id_vars=id_cols, var_name='Measure Name', value_name='Score')
measures_df.drop(labels=['Score', 'Facility ID', 'Facility Name'], axis=1, inplace=True)

print(measures_df.shape)
measures_df.drop_duplicates(inplace=True)
measures_df.reset_index(drop=True, inplace=True)
print(measures_df.shape)

measures_df['Start Date'] = pd.to_datetime(measures_df['Start Date'])
measures_df['End Date'] = pd.to_datetime(measures_df['End Date'])
measures_df.to_csv('~/GitHub/hospitals-data-archive/measure_dates/Payment_and_ValueOfCare_df.csv')

measures_df.head()

(2800168, 5)
(434, 5)


Unnamed: 0,file_month,file_year,Start Date,End Date,Measure Name
0,1,2023,2018-07-01,2021-06-30,Measure Name
1,1,2023,2018-04-01,2021-03-31,Measure Name
2,4,2023,2018-07-01,2021-06-30,Measure Name
3,4,2023,2018-04-01,2021-03-31,Measure Name
4,7,2023,2019-07-01,2022-06-30,Measure Name


In [12]:
m2 = measures_df['Measure Name'].unique().tolist()
sorted(m1) == sorted(m2)

False

In [1]:
print(sorted(m1))
print('\n')
print(sorted(m2))

NameError: name 'm1' is not defined