In [1]:
import pandas as pd

In [2]:
# Lift the column display limit so wide frames are rendered without elided columns.
pd.options.display.max_columns = None

In [19]:
# Locations of the two CMS Open Payments extracts (paths are relative to the notebook).
gen_path = "../data/general_payments/cms_general_2024.csv"
res_path = "../data/research_payments/cms_research_2024.csv"

# Columns read from the research-payments file, listed in the order the frame should have.
res_usecols = [
    # recipient identity
    "Covered_Recipient_Profile_ID",
    "Covered_Recipient_NPI",
    "Covered_Recipient_First_Name",
    "Covered_Recipient_Last_Name",
    # recipient location
    "Recipient_City",
    "Recipient_State",
    # paying manufacturer / GPO and the associated product
    "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID",
    "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name",
    "Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1",
    # payment details
    "Total_Amount_of_Payment_USDollars",
    "Date_of_Payment",
    "Record_ID",
    "Program_Year",
]

# The general-payments file uses the same columns plus the payment nature, which is
# used strictly for filtering and dropped later. It sits right before Record_ID.
nature_col = "Nature_of_Payment_or_Transfer_of_Value"
insert_at = res_usecols.index("Record_ID")
gen_usecols = res_usecols[:insert_at] + [nature_col] + res_usecols[insert_at:]

# Read only the needed columns, then reindex so the column order follows the lists above
# rather than the order in which the columns appear in the CSV.
gen_df = pd.read_csv(gen_path, usecols=gen_usecols, low_memory=False).reindex(columns=gen_usecols)
res_df = pd.read_csv(res_path, usecols=res_usecols, low_memory=False).reindex(columns=res_usecols)

In [20]:
# CLEAN COLUMNS
# Normalise headers: trim surrounding whitespace and lower-case every column name.
gen_clean_cols = [name.strip().lower() for name in gen_df.columns]
res_clean_cols = [name.strip().lower() for name in res_df.columns]

# Print the raw headers first and the normalised ones after, as a before/after check.
for shown in (gen_df.columns.tolist(), res_df.columns.tolist(), gen_clean_cols, res_clean_cols):
    print(shown)

gen_df.columns = gen_clean_cols
res_df.columns = res_clean_cols

['Covered_Recipient_Profile_ID', 'Covered_Recipient_NPI', 'Covered_Recipient_First_Name', 'Covered_Recipient_Last_Name', 'Recipient_City', 'Recipient_State', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1', 'Total_Amount_of_Payment_USDollars', 'Date_of_Payment', 'Nature_of_Payment_or_Transfer_of_Value', 'Record_ID', 'Program_Year']
['Covered_Recipient_Profile_ID', 'Covered_Recipient_NPI', 'Covered_Recipient_First_Name', 'Covered_Recipient_Last_Name', 'Recipient_City', 'Recipient_State', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID', 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name', 'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1', 'Total_Amount_of_Payment_USDollars', 'Date_of_Payment', 'Record_ID', 'Program_Year']
['covered_recipient_profile_id', 'covered_recipient_npi', 'covered_recipient_first_name', 'cover

In [21]:
# FILTER UNRELATED PAYMENT NATURES SUCH AS FOOD AND DRINKS
# Allow-list of payment natures to keep; rows with any other nature are discarded.
useful_natures = [
    "Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",
    "Consulting Fee",
    "Education",
    "Honoraria",
    "Royalty or License",
    "Compensation for serving as faculty or as a speaker for a medical education program",
    "Long term medical supply or device loan",
    "Grant",
]

print(gen_df.shape[0])  # row count before filtering
nature_is_useful = gen_df['nature_of_payment_or_transfer_of_value'].isin(useful_natures)
gen_df = gen_df[nature_is_useful]
print(gen_df.shape[0])  # row count after filtering

15397627
649560


In [22]:
# DROP NATURES USED FOR FILTERING TO CONCAT LATER
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError once the column has been dropped.
gen_df = gen_df.drop(columns="nature_of_payment_or_transfer_of_value", errors="ignore")
display(gen_df.head())

Unnamed: 0,covered_recipient_profile_id,covered_recipient_npi,covered_recipient_first_name,covered_recipient_last_name,recipient_city,recipient_state,applicable_manufacturer_or_applicable_gpo_making_payment_id,applicable_manufacturer_or_applicable_gpo_making_payment_name,name_of_drug_or_biological_or_device_or_medical_supply_1,total_amount_of_payment_usdollars,date_of_payment,record_id,program_year
0,,,,,New Orleans,LA,100000000226,"Genentech USA, Inc.",Ocrevus,10000.0,02/23/2024,1133195830,2024
1,,,,,Nashville,TN,100000000226,"Genentech USA, Inc.",Polivy,5000.0,08/01/2024,1133196475,2024
2,,,,,LA CROSSE,WI,100000686813,Linde Gas & Equipment Inc.,REQNOXBOX-I,330.0,02/13/2024,1142632901,2024
3,,,,,LA CROSSE,WI,100000686813,Linde Gas & Equipment Inc.,REQNOXBOX-I,330.0,08/06/2024,1142632912,2024
4,,,,,LA CROSSE,WI,100000686813,Linde Gas & Equipment Inc.,REQNOXBOX-I,330.0,02/08/2024,1142632930,2024


In [23]:
# Number of rows without a recipient NPI: general payments first, then research payments.
for frame in (gen_df, res_df):
    print(frame['covered_recipient_npi'].isna().sum())

21525
728842


In [18]:
# Fields every record must have; a row missing any one of them is dropped.
subset = [
    "covered_recipient_npi",
    "total_amount_of_payment_usdollars",
    "date_of_payment",
]

# General payments: row count before and after removing incomplete rows.
print(gen_df.shape[0])
gen_df = gen_df.dropna(axis="index", how="any", subset=subset)
print(gen_df.shape[0])

# Research payments: same treatment.
print(res_df.shape[0])
res_df = res_df.dropna(axis="index", how="any", subset=subset)
print(res_df.shape[0])

649560
628035
756193
27351


In [32]:
# NOTE(review): `df` is not created by any cell visible above, and several columns used
# here are not in the usecols lists of the load cell. Presumably this cell belongs to an
# earlier version of the pipeline; confirm it still runs after Restart & Run All.

# Recipient identifiers -> pandas nullable integer dtype.
for id_col in ("covered_recipient_profile_id", "covered_recipient_npi"):
    df[id_col] = df[id_col].astype("Int64")

# Text columns that only need surrounding whitespace removed.
strip_only = (
    "covered_recipient_first_name",
    "covered_recipient_last_name",
    "recipient_zip_code",
    "applicable_manufacturer_or_applicable_gpo_making_payment_name",
    "nature_of_payment_or_transfer_of_value",
    "form_of_payment_or_transfer_of_value",
)
for text_col in strip_only:
    df[text_col] = df[text_col].str.strip()

# Text columns trimmed and converted to Title Case.
for text_col in (
    "covered_recipient_specialty_1",
    "covered_recipient_primary_type_1",
    "recipient_city",
):
    df[text_col] = df[text_col].str.strip().str.title()

# State codes trimmed and upper-cased.
for text_col in ("covered_recipient_license_state_code1", "recipient_state"):
    df[text_col] = df[text_col].str.strip().str.upper()

# Parse MM/DD/YYYY dates; values that do not match the format become NaT.
df["date_of_payment"] = pd.to_datetime(df["date_of_payment"], format="%m/%d/%Y", errors="coerce")

# "Yes"/"No" -> nullable boolean; anything else (including missing) ends up False.
bool_map = {"Yes": True, "No": False}
ownership_flag = df["physician_ownership_indicator"].map(bool_map)
df["physician_ownership_indicator"] = ownership_flag.astype("boolean").fillna(False)

In [33]:
# Whole-number identifier and count columns to store as pandas' nullable Int64 dtype.
id_columns = [
    "covered_recipient_profile_id",
    "covered_recipient_npi",
    "applicable_manufacturer_or_applicable_gpo_making_payment_id",
    "number_of_payments_included_in_total_amount",
    "record_id",
    "program_year",
]

nullable_int = "Int64"
for id_col in id_columns:
    converted = df[id_col].astype(nullable_int)
    df[id_col] = converted

In [34]:
# Keep only the first five characters of the ZIP code (e.g. "46256-4649" -> "46256").
df["recipient_zip_5"] = df["recipient_zip_code"].str.slice(stop=5)
df.head()

Unnamed: 0,covered_recipient_profile_id,covered_recipient_npi,covered_recipient_first_name,covered_recipient_last_name,covered_recipient_specialty_1,covered_recipient_primary_type_1,covered_recipient_license_state_code1,recipient_city,recipient_state,recipient_zip_code,applicable_manufacturer_or_applicable_gpo_making_payment_id,applicable_manufacturer_or_applicable_gpo_making_payment_name,name_of_drug_or_biological_or_device_or_medical_supply_1,total_amount_of_payment_usdollars,date_of_payment,nature_of_payment_or_transfer_of_value,form_of_payment_or_transfer_of_value,number_of_payments_included_in_total_amount,record_id,physician_ownership_indicator,program_year,recipient_zip_5
0,11107494,1871018762,ROANNA,EGGERT,Physician Assistants & Advanced Practice Nursi...,Nurse Practitioner,FL,Pinellas Park,FL,33781,100000005450,"Neuronetics, Inc.",NEUROSTAR TMS THERAPY SYSTEM,39.62,2023-04-28,Food and Beverage,Cash or cash equivalent,1,966803191,False,2023,33781
1,1389841,1861886830,REZA,GORAVANCHI,Allopathic & Osteopathic Physicians|Psychiatry...,Medical Doctor,NV,Las Vegas,NV,89103,100000005450,"Neuronetics, Inc.",NEUROSTAR TMS THERAPY SYSTEM,19.27,2023-02-14,Food and Beverage,Cash or cash equivalent,1,966803197,False,2023,89103
2,5703924,1861847113,LAUREN,WILSON,Allopathic & Osteopathic Physicians|Psychiatry...,Medical Doctor,TX,New Orleans,LA,70112,100000005450,"Neuronetics, Inc.",NEUROSTAR TMS THERAPY SYSTEM,23.75,2023-01-25,Food and Beverage,Cash or cash equivalent,1,966803203,False,2023,70112
3,892163,1588894869,RACHEL,CARR,Allopathic & Osteopathic Physicians|Obstetrics...,Medical Doctor,IN,Indianapolisq,IN,46256-4649,100000151631,"Minerva Surgical, Inc",,14.59,2023-02-02,Food and Beverage,In-kind items and services,1,966519119,False,2023,46256
4,892163,1588894869,RACHEL,CARR,Allopathic & Osteopathic Physicians|Obstetrics...,Medical Doctor,IN,Indianapolisq,IN,46256-4649,100000151631,"Minerva Surgical, Inc",,22.17,2023-06-20,Food and Beverage,In-kind items and services,1,966519125,False,2023,46256


In [35]:
# Final column layout: recipient identity, recipient location (with the derived
# 5-character ZIP placed right after the full ZIP), payer and product, payment details.
column_order = [
    "covered_recipient_profile_id",
    "covered_recipient_npi",
    "covered_recipient_first_name",
    "covered_recipient_last_name",
    "covered_recipient_specialty_1",
    "covered_recipient_primary_type_1",
    "covered_recipient_license_state_code1",
    "recipient_city",
    "recipient_state",
    "recipient_zip_code",
    "recipient_zip_5",
    "applicable_manufacturer_or_applicable_gpo_making_payment_id",
    "applicable_manufacturer_or_applicable_gpo_making_payment_name",
    "name_of_drug_or_biological_or_device_or_medical_supply_1",
    "total_amount_of_payment_usdollars",
    "date_of_payment",
    "nature_of_payment_or_transfer_of_value",
    "form_of_payment_or_transfer_of_value",
    "number_of_payments_included_in_total_amount",
    "record_id",
    "physician_ownership_indicator",
    "program_year",
]
df = df[column_order]
df.head()

Unnamed: 0,covered_recipient_profile_id,covered_recipient_npi,covered_recipient_first_name,covered_recipient_last_name,covered_recipient_specialty_1,covered_recipient_primary_type_1,covered_recipient_license_state_code1,recipient_city,recipient_state,recipient_zip_code,recipient_zip_5,applicable_manufacturer_or_applicable_gpo_making_payment_id,applicable_manufacturer_or_applicable_gpo_making_payment_name,name_of_drug_or_biological_or_device_or_medical_supply_1,total_amount_of_payment_usdollars,date_of_payment,nature_of_payment_or_transfer_of_value,form_of_payment_or_transfer_of_value,number_of_payments_included_in_total_amount,record_id,physician_ownership_indicator,program_year
0,11107494,1871018762,ROANNA,EGGERT,Physician Assistants & Advanced Practice Nursi...,Nurse Practitioner,FL,Pinellas Park,FL,33781,33781,100000005450,"Neuronetics, Inc.",NEUROSTAR TMS THERAPY SYSTEM,39.62,2023-04-28,Food and Beverage,Cash or cash equivalent,1,966803191,False,2023
1,1389841,1861886830,REZA,GORAVANCHI,Allopathic & Osteopathic Physicians|Psychiatry...,Medical Doctor,NV,Las Vegas,NV,89103,89103,100000005450,"Neuronetics, Inc.",NEUROSTAR TMS THERAPY SYSTEM,19.27,2023-02-14,Food and Beverage,Cash or cash equivalent,1,966803197,False,2023
2,5703924,1861847113,LAUREN,WILSON,Allopathic & Osteopathic Physicians|Psychiatry...,Medical Doctor,TX,New Orleans,LA,70112,70112,100000005450,"Neuronetics, Inc.",NEUROSTAR TMS THERAPY SYSTEM,23.75,2023-01-25,Food and Beverage,Cash or cash equivalent,1,966803203,False,2023
3,892163,1588894869,RACHEL,CARR,Allopathic & Osteopathic Physicians|Obstetrics...,Medical Doctor,IN,Indianapolisq,IN,46256-4649,46256,100000151631,"Minerva Surgical, Inc",,14.59,2023-02-02,Food and Beverage,In-kind items and services,1,966519119,False,2023
4,892163,1588894869,RACHEL,CARR,Allopathic & Osteopathic Physicians|Obstetrics...,Medical Doctor,IN,Indianapolisq,IN,46256-4649,46256,100000151631,"Minerva Surgical, Inc",,22.17,2023-06-20,Food and Beverage,In-kind items and services,1,966519125,False,2023


In [36]:
# Print a summary of the frame: index range, column names and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14672763 entries, 0 to 14704232
Data columns (total 22 columns):
 #   Column                                                         Dtype         
---  ------                                                         -----         
 0   covered_recipient_profile_id                                   Int64         
 1   covered_recipient_npi                                          Int64         
 2   covered_recipient_first_name                                   object        
 3   covered_recipient_last_name                                    object        
 4   covered_recipient_specialty_1                                  object        
 5   covered_recipient_primary_type_1                               object        
 6   covered_recipient_license_state_code1                          object        
 7   recipient_city                                                 object        
 8   recipient_state                                        

In [37]:
# Count rows per payment nature (most frequent first) to see which categories dominate.
df['nature_of_payment_or_transfer_of_value'].value_counts()

nature_of_payment_or_transfer_of_value
Food and Beverage                                                                                                                                     13457311
Travel and Lodging                                                                                                                                      552420
Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program      231114
Consulting Fee                                                                                                                                          173687
Education                                                                                                                                               161565
Gift                                                                                                                                                     31806
Honorar

In [38]:
# Allow-list of payment natures to keep; rows with any other nature are discarded.
useful_natures = [
    "Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program",
    "Consulting Fee",
    "Education",
    "Honoraria",
    "Royalty or License",
    "Compensation for serving as faculty or as a speaker for a medical education program",
    "Long term medical supply or device loan",
    "Grant",
]

print(df.shape[0])  # row count before filtering
keep_rows = df['nature_of_payment_or_transfer_of_value'].isin(useful_natures)
df = df.loc[keep_rows]
print(df.shape[0])  # row count after filtering

14672763
621389
