In [1]:
import pandas as pd 

# Read the six 21q3 input tables from the working directory, in the same
# order as the names on the left-hand side.
demo, drug, outc, reac, indi, ther = [
    pd.read_csv(table_path)
    for table_path in (
        'demo_21q3',
        'drug_21q3',
        'outc_21q3',
        'reac_21q3',
        'indi_21q3',
        'ther_21q3',
    )
]

In [3]:
# Copy of `indi` ordered by primaryid, then indi_drug_seq (ascending).
indi_sorted = indi.sort_values(['primaryid', 'indi_drug_seq'])

In [4]:
# Copy of `drug` ordered by primaryid, then drug_seq (ascending).
drug_sorted = drug.sort_values(['primaryid', 'drug_seq'])

In [5]:
# Pair every drug row with the indication rows that point at it: primaryid
# must match and drug_seq must equal indi_drug_seq. This is an inner join,
# so rows with no partner on the other side are dropped.
drug_indi = pd.merge(
    drug_sorted,
    indi_sorted,
    how='inner',
    left_on=['primaryid', 'drug_seq'],
    right_on=['primaryid', 'indi_drug_seq'],
)

In [6]:
# Add the `ther` columns to each drug/indication row, matching on primaryid
# and drug_seq == dsg_drug_seq. Inner join: rows without a match in `ther`
# are dropped.
drug_indi_ther = pd.merge(
    drug_indi,
    ther,
    how='inner',
    left_on=['primaryid', 'drug_seq'],
    right_on=['primaryid', 'dsg_drug_seq'],
)

In [7]:
# Join the outcome rows on primaryid alone (inner join). Each existing row is
# repeated once per matching row in `outc`.
drug_indi_ther_outc = pd.merge(drug_indi_ther, outc, how='inner', on='primaryid')

In [10]:
# Join the reaction rows on primaryid alone (inner join). Each existing row
# is repeated once per matching row in `reac`, so row counts taken from this
# frame are counts of drug x outcome x reaction combinations.
drug_indi_ther_outc_reac = pd.merge(
    drug_indi_ther_outc,
    reac,
    how='inner',
    on='primaryid',
)

In [11]:
# Frequency of each prod_ai value, stored on every row as `freq_prod_ai`.
# The counts are absolute (not normalised) and are taken over the rows of the
# merged frame, so they reflect merged rows rather than distinct primaryid
# values. Rows whose prod_ai is missing get NaN.
prod_ai_col = drug_indi_ther_outc_reac['prod_ai']
freq = prod_ai_col.value_counts(normalize=False)
drug_indi_ther_outc_reac['freq_prod_ai'] = prod_ai_col.map(freq)

In [12]:
# Ordinal severity rank per outc_cod value: a larger number is treated as
# more severe, and 'DE' and 'CA' share the top rank.
severity_map = {
    'OT': 1, 'HO': 2, 'RI': 3, 'DS': 4,
    'LT': 5, 'DE': 6, 'CA': 6,
}

# Series.map leaves NaN wherever outc_cod is missing or not a key of
# severity_map.
drug_indi_ther_outc_reac['severity_score'] = (
    drug_indi_ther_outc_reac['outc_cod'].map(severity_map)
)

In [13]:
# Recompute the per-prod_ai row frequency and overwrite `freq_prod_ai`.
# NOTE(review): this repeats the frequency computation from the earlier
# cell; only a new column was added in between, so the values come out the
# same and this cell looks redundant.
prod_ai_col = drug_indi_ther_outc_reac['prod_ai']
freq = prod_ai_col.value_counts(normalize=False)
drug_indi_ther_outc_reac['freq_prod_ai'] = prod_ai_col.map(freq)

In [14]:
# Mean severity_score over all merged rows that share a prod_ai value
# (NaN scores are skipped by mean), broadcast back to every row.
drug_severity = (
    drug_indi_ther_outc_reac
    .groupby('prod_ai')['severity_score']
    .mean()
)
drug_indi_ther_outc_reac['drug_severity_mean'] = (
    drug_indi_ther_outc_reac['prod_ai'].map(drug_severity)
)

In [15]:
# Number of distinct prod_ai values seen under each primaryid (missing
# prod_ai values are not counted), broadcast back to every row as `num_drug`.
num_drug_per_person = (
    drug_indi_ther_outc_reac
    .groupby('primaryid')['prod_ai']
    .nunique()
)
drug_indi_ther_outc_reac['num_drug'] = (
    drug_indi_ther_outc_reac['primaryid'].map(num_drug_per_person)
)

In [16]:
# `df` is an alias, not a copy: the column added below is also added to
# drug_indi_ther_outc_reac.
df = drug_indi_ther_outc_reac


def _parse_date_column(values):
    """Convert a date column to datetimes.

    Numeric columns are read as YYYYMMDD digits. Passing such numbers
    straight to ``pd.to_datetime`` would interpret them as nanoseconds since
    1970-01-01, which makes every computed duration 0 days. Numeric entries
    that are missing or do not have exactly eight digits (for example a bare
    year) become NaT. Non-numeric columns (strings, datetimes) are handed to
    ``pd.to_datetime`` unchanged, exactly as before.
    """
    if pd.api.types.is_numeric_dtype(values):
        # The nullable integer dtype drops the trailing ".0" that a float
        # column would otherwise carry into the text form.
        digits = values.astype('Int64').astype(str)
        complete = digits.where(digits.str.len() == 8)
        return pd.to_datetime(complete, format='%Y%m%d', errors='coerce')
    return pd.to_datetime(values)


# Whole days between start_dt and end_dt; NaN when either date is unusable.
df['treatment_duration_days'] = (
    _parse_date_column(df['end_dt']) - _parse_date_column(df['start_dt'])
).dt.days

# Collapse the merged rows to one row per primaryid.
agg_df = (
    df.groupby('primaryid')
    .agg({
        'num_drug': 'max',
        'severity_score': 'mean',
        'drug_severity_mean': 'mean',
        'freq_prod_ai': 'sum',
        'treatment_duration_days': 'mean',

        # categorical columns -> number of distinct values
        'prod_ai': 'nunique',
        'pt': 'nunique',
        'outc_cod': 'nunique',
    })
    .rename(columns={
        'prod_ai': 'unique_drugs',
        'pt': 'unique_reactions',
        'outc_cod': 'unique_outcomes',
    })
    .reset_index()
)

In [17]:
from sklearn.preprocessing import LabelEncoder

# Add one label-encoded column per categorical field to agg_df.
for col in ['role_cod', 'dechal']:
    # First non-null value of `col` under each primaryid (GroupBy.first skips
    # missing entries).
    first_per_id = df.groupby('primaryid')[col].first()
    # Look the value up by primaryid instead of pasting it in by row
    # position, so each value lands on the right row even if agg_df has been
    # re-sorted or filtered since it was built.
    aligned = agg_df['primaryid'].map(first_per_id)
    # Replace the raw values with integer codes assigned by LabelEncoder.
    agg_df[col] = LabelEncoder().fit_transform(aligned)

In [18]:
# Binary label: 1 when the per-primaryid mean severity_score is strictly
# greater than 2, otherwise 0. A missing score compares as False and so
# becomes 0.
# NOTE(review): severity_score stays in agg_df next to the label derived from
# it; confirm it is excluded from the features before any model is trained.
agg_df['target'] = agg_df['severity_score'].gt(2).astype(int)

In [19]:
import pandas as pd
# Write the per-primaryid table to disk. The row index is written too, as an
# unnamed first column.
# NOTE(review): if no reader relies on that column, index=False would give a
# cleaner file.
agg_df.to_csv('final_data.csv', index=True)