In [None]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the table held in df_ards; its 'drug' column drives the price lookups in later cells.
DF_FILEPATH = 'fp.csv'
df_ards = pd.read_csv(DF_FILEPATH)

# Preview the first rows.
df_ards.head()

In [None]:
# Print the drug names of the first few rows, one per line.
for drug_name in df_ards['drug'].head():
    print(drug_name)

In [None]:
# Price workbook searched by the lookup helpers below (they read its
# 'TradeName', 'Price' and 'PackageDescription' columns).
df_prices = pd.read_excel('./vaFssPharmPrices.xlsx')
df_prices.columns

In [None]:
# Module-level accumulators shared by the lookup helpers defined below.
collected_costs = {}   # not referenced by the helpers in this cell
unique_units = set()   # every distinct unit string produced by extract_unit()

# Lower-case unit -> (multiplier into the base unit, base unit).
# Mass is normalized to grams and volume to liters; activity units
# (iu / units / unt) are kept as-is with a multiplier of 1.
# NOTE: convert() splits ratio units on '/' and looks up each side on its own,
# so the ratio keys below are not consulted by it.
conversions = {
    # mass
    "ng": (0.000000001, "g"),
    "mcg": (0.000001, "g"),
    "ug": (0.000001, "g"),
    "µg": (0.000001, "g"),     # micro-sign spelling; the pattern in extract_unit() can emit it
    "mg": (0.001, "g"),
    "g": (1, "g"),
    "kg": (1000, "g"),
    # volume
    "ml": (0.001, "l"),
    "l": (1, "l"),
    # concentration
    "ng/ml": (0.000001, "g/l"),
    "mcg/ml": (0.001, "g/l"),
    "mg/ml": (1, "g/l"),
    # activity
    "iu": (1, "iu"),
    "units": (1, "units"),
    "unt": (1, "unt"),
    "unt/ml": (1, "unt/ml"),
    # Add more conversions for other units as needed
}


def convert(cost, amount, unit):
    """Re-express `cost` per base unit using the module-level `conversions` table.

    `unit` is a simple unit ("mg") or a ratio ("mg/ml"); each part is looked up
    case-insensitively. For a ratio, the numerator's multiplier is divided by
    the denominator's and the base units are joined with '/'. Only the first
    two '/'-separated parts are considered.

    Returns a `(cost / (multiplier * amount), base_unit)` tuple.
    Raises KeyError when a unit part is missing from `conversions`.
    """
    parts = unit.split('/')
    factor, base_unit = conversions[parts[0].lower()]

    # A trailing slash ("mg/") leaves an empty denominator and is treated as a simple unit.
    if len(parts) > 1 and parts[1]:
        denominator_factor, denominator_unit = conversions[parts[1].lower()]
        factor = factor / denominator_factor
        base_unit = base_unit + '/' + denominator_unit

    return (cost / (factor * amount), base_unit)

def search_and_extract_cost(drug, tradeName_column, price_column, size_column):
    """Return the price per package unit for the first `df_prices` row matching `drug`.

    `drug` is matched as a case-insensitive literal substring of
    `tradeName_column`. The price of the first matching row is divided by the
    first number (integer or decimal) found in that row's `size_column`.

    Parameters
    ----------
    drug : str
        Name to search for; regex metacharacters are escaped.
    tradeName_column, price_column, size_column : str
        Column names in the module-level `df_prices` frame.

    Returns
    -------
    float or None
        None when `drug` is not a non-empty string, when no row matches, or
        when the matching row has a missing/zero price or no positive package
        size (the escaped drug name is printed in that last case).
    """
    # A NaN or blank drug would either crash re.escape or match every row.
    if not isinstance(drug, str) or not drug.strip():
        return None

    escaped_drug = re.escape(drug)
    # na=False: a missing trade name counts as "no match" instead of putting
    # NaN into the mask, which .loc refuses to index with.
    mask = df_prices[tradeName_column].str.contains(escaped_drug, case=False, na=False)
    if not mask.any():
        return None  # Return None if drug not found

    fss_price = df_prices.loc[mask, price_column].iloc[0]
    size_text = df_prices.loc[mask, size_column].iloc[0]
    # str(): the size cell may have been read as a number or as NaN.
    # The optional fraction keeps sizes such as "2.5ML" from truncating to 2.
    size_match = re.search(r'\d+(?:\.\d+)?', str(size_text))
    package_size = float(size_match.group()) if size_match else 0.0

    if pd.isna(fss_price) or not fss_price or not package_size:
        print(escaped_drug)
        return None
    return fss_price / package_size

# Unit spellings recognized in a trade name (matched case-insensitively).
_UNIT_TOKEN = r'(?:mcg|µg|ug|mg|kg|ng|units|unts|unt|iu|ml|g|l)'
# <number><optional space><unit>[/<unit>], ending on a word boundary so that
# "16 LOZENGES" is not read as 16 l.
_STRENGTH_RE = re.compile(
    r'(\d+(?:\.\d+)?)\s*(' + _UNIT_TOKEN + r')(?:/\s*(' + _UNIT_TOKEN + r'))?\b',
    re.IGNORECASE,
)
# Spellings folded into one canonical unit.
_UNIT_ALIASES = {'unts': 'unt'}


def _canonical_unit(token):
    """Lower-case a matched unit token and fold known aliases."""
    lowered = token.lower()
    return _UNIT_ALIASES.get(lowered, lowered)


def extract_unit(drug, tradeName_column):
    """Parse the first "<number> <unit>" strength from the trade name matching `drug`.

    `drug` is matched as a case-insensitive literal substring of
    `tradeName_column` in the module-level `df_prices` frame; only the first
    matching row and the first strength inside it are used. The unit found is
    also added to the module-level `unique_units` set.

    Parameters
    ----------
    drug : str
        Name to search for; regex metacharacters are escaped.
    tradeName_column : str
        Column of `df_prices` holding the trade names.

    Returns
    -------
    tuple
        `(amount, unit)` with `amount` a float and `unit` a lower-case string
        such as "mg" or "unt/ml"; `(None, None)` when `drug` is not a
        non-empty string, no row matches, or no strength is found.
    """
    # A NaN or blank drug would either crash re.escape or match every row.
    if not isinstance(drug, str) or not drug.strip():
        return (None, None)

    escaped_drug = re.escape(drug)
    # na=False: a missing trade name counts as "no match" instead of putting
    # NaN into the mask, which .loc refuses to index with.
    mask = df_prices[tradeName_column].str.contains(escaped_drug, case=False, na=False)
    if not mask.any():
        return (None, None)  # Return None if drug not found

    text = df_prices.loc[mask, tradeName_column].iloc[0]
    match = _STRENGTH_RE.search(text)
    if match is None:
        return (None, None)

    amount = float(match.group(1))
    full_unit = _canonical_unit(match.group(2))
    if match.group(3):
        full_unit = full_unit + '/' + _canonical_unit(match.group(3))

    unique_units.add(full_unit)
    return (amount, full_unit)
    
   

def search_unit(drug):
    """Look up the first 'TradeName' in `df_prices` containing `drug` (case-insensitive).

    NOTE(review): this looks unfinished; the trade name is fetched but never
    used, and the function returns None on every path.
    """
    pattern = re.escape(drug)
    hits = df_prices['TradeName'].str.contains(pattern, case=False)
    if not hits.any():
        return None
    matching_names = df_prices.loc[hits, 'TradeName']
    trade_name = matching_names.iloc[0]  # fetched but not used further
    return None
        

def standardize_cost(cost, amount, unit):
    """Convert a per-package-unit price into a cost per base unit via `convert`.

    Parameters
    ----------
    cost : float or None
        Price per package unit.
    amount : float or None
        Strength amount parsed from the trade name.
    unit : str or None
        Unit belonging to `amount`, e.g. "mg" or "mg/ml".

    Returns
    -------
    tuple
        `(standardized_cost, standardized_unit)` as returned by `convert`, or
        `(None, None)` when any input is missing (None / NaN), zero or empty.
    """
    # Values read back from a DataFrame row arrive as NaN rather than None, and
    # NaN is truthy, so it has to be tested for explicitly.
    if any(pd.isna(value) for value in (cost, amount, unit)):
        return (None, None)
    if not cost or not amount or not unit:
        return (None, None)
    conv_cost, conv_unit = convert(cost, amount, unit)
    return (conv_cost, conv_unit)


In [None]:
# ARDS Disease: per-package-unit price, parsed strength (amount + unit) and
# standardized cost for every drug in df_ards.
df_ards['OPAL Price'] = df_ards['drug'].apply(
    search_and_extract_cost, args=('TradeName', 'Price', 'PackageDescription')
)
df_ards['OPAL Amount'], df_ards['OPAL Unit'] = zip(
    *df_ards['drug'].apply(extract_unit, args=('TradeName',))
)
df_ards['OPAL Standardized Cost'], df_ards['OPAL Standardized Unit'] = zip(
    *df_ards.apply(
        lambda row: standardize_cost(row['OPAL Price'], row['OPAL Amount'], row['OPAL Unit']),
        axis=1,
    )
)

In [None]:
# Distinct standardized units, then distinct raw units, found for the ARDS drugs.
for unit_column in ('OPAL Standardized Unit', 'OPAL Unit'):
    print(df_ards[unit_column].unique())

# Number of ARDS rows that received a standardized cost.
df_ards['OPAL Standardized Cost'].notnull().sum()

In [None]:
# Load the cardiac-arrest treatment table; its 'drug' column is priced in the next cell.
df_cardiac = pd.read_csv('./Findings/cardiac_arrest_treatments.csv')

In [None]:
# Cardiac Arrest Disease: per-package-unit price, parsed strength (amount + unit)
# and standardized cost for every drug in df_cardiac.
df_cardiac['OPAL Price'] = df_cardiac['drug'].apply(
    search_and_extract_cost, args=('TradeName', 'Price', 'PackageDescription')
)
df_cardiac['OPAL Amount'], df_cardiac['OPAL Unit'] = zip(
    *df_cardiac['drug'].apply(extract_unit, args=('TradeName',))
)
df_cardiac['OPAL Standardized Cost'], df_cardiac['OPAL Standardized Unit'] = zip(
    *df_cardiac.apply(
        lambda row: standardize_cost(row['OPAL Price'], row['OPAL Amount'], row['OPAL Unit']),
        axis=1,
    )
)

In [None]:
# Load the sepsis treatment table; its 'drug' column is priced in the next cell.
df_sepsis = pd.read_csv('./Findings/sepsis_treatments.csv')

In [None]:
# Sepsis Disease: per-package-unit price, parsed strength (amount + unit) and
# standardized cost for every drug in df_sepsis.
df_sepsis['OPAL Price'] = df_sepsis['drug'].apply(
    search_and_extract_cost, args=('TradeName', 'Price', 'PackageDescription')
)
df_sepsis['OPAL Amount'], df_sepsis['OPAL Unit'] = zip(
    *df_sepsis['drug'].apply(extract_unit, args=('TradeName',))
)
df_sepsis['OPAL Standardized Cost'], df_sepsis['OPAL Standardized Unit'] = zip(
    *df_sepsis.apply(
        lambda row: standardize_cost(row['OPAL Price'], row['OPAL Amount'], row['OPAL Unit']),
        axis=1,
    )
)

In [None]:
# Write each enriched treatment table to its own CSV under ./StandardizedCost/.
standardized_outputs = (
    (df_ards, './StandardizedCost/ARDS_standardized_cost.csv'),
    (df_cardiac, './StandardizedCost/cardiac_arrest_standardized_cost.csv'),
    (df_sepsis, './StandardizedCost/sepsis_standardized_cost.csv'),
)
for treatment_frame, csv_path in standardized_outputs:
    treatment_frame.to_csv(csv_path)