In [1]:
import pandas as pd 

# Read the six quarterly tables (presumably FAERS 2021 Q3 extracts -- confirm
# the provenance). The files are read in the order listed; `demo` is loaded
# but not referenced by any other cell visible in this notebook.
demo, drug, outc, reac, indi, ther = map(
    pd.read_csv,
    (
        'demo_21q3',
        'drug_21q3',
        'outc_21q3',
        'reac_21q3',
        'indi_21q3',
        'ther_21q3',
    ),
)

In [2]:
# Order the indication rows by report id, then by the drug sequence they refer to.
indi_sorted = indi.sort_values(['primaryid', 'indi_drug_seq'])

In [3]:
# Order the drug rows by report id, then by drug sequence number.
drug_sorted = drug.sort_values(['primaryid', 'drug_seq'])

In [4]:
# Attach each drug's indication: match on report id and on the drug sequence
# number (called `indi_drug_seq` on the indication side). Inner join, so drugs
# without an indication row are dropped.
drug_indi = pd.merge(
    drug_sorted,
    indi_sorted,
    how='inner',
    left_on=['primaryid', 'drug_seq'],
    right_on=['primaryid', 'indi_drug_seq'],
)

In [5]:
# Attach therapy information for the same drug: match on report id and on the
# drug sequence number (called `dsg_drug_seq` on the therapy side). Inner join,
# so drugs without a therapy row are dropped.
drug_indi_ther = drug_indi.merge(
    ther,
    how='inner',
    left_on=['primaryid', 'drug_seq'],
    right_on=['primaryid', 'dsg_drug_seq'],
)

In [6]:
# Attach outcomes by report id (inner join).
# NOTE(review): the key is the report id only, so a report with several
# outcome rows repeats every drug row once per outcome -- confirm this row
# multiplication is intended.
drug_indi_ther_outc = drug_indi_ther.merge(outc, how='inner', on='primaryid')

In [7]:
# Attach reactions by report id (inner join).
# NOTE(review): the key is the report id only, so every existing row is
# repeated once per reaction of the same report -- confirm this row
# multiplication is intended.
drug_indi_ther_outc_reac = drug_indi_ther_outc.merge(reac, how='inner', on='primaryid')

In [8]:
# Row count of each active ingredient (prod_ai) in the merged table, attached
# to every row as `freq_prod_ai`. Plain counts are wanted here, so
# value_counts() is called with its defaults rather than passing a bare
# positional 0 (which is the `normalize` flag, not an axis).
# Rows whose prod_ai is missing get NaN, because value_counts() skips NaN.
freq = drug_indi_ther_outc_reac['prod_ai'].value_counts()
drug_indi_ther_outc_reac['freq_prod_ai'] = drug_indi_ther_outc_reac['prod_ai'].map(freq)

In [9]:
# Ordinal severity score per outcome code, from least (1) to most severe (6).
# The ranking is kept identical to the map used inside
# feature_engineering_pipeline, so both severity columns in this notebook are
# on the same scale (hospitalization ranks above required intervention).
severity_map = {
    'OT': 1,  # Other
    'RI': 2,  # Required Intervention
    'HO': 3,  # Hospitalization
    'DS': 4,  # Disability
    'LT': 5,  # Life-threatening
    'CA': 6,  # Congenital Anomaly
    'DE': 6,  # Death
}

# Outcome codes that are not in the map (or are missing) become NaN here.
drug_indi_ther_outc_reac['severity_score'] = drug_indi_ther_outc_reac['outc_cod'].map(severity_map)

In [10]:
# Row count of each active ingredient (prod_ai), attached to every row as
# `freq_prod_ai`.
# NOTE(review): this repeats the computation of cell In [8]; the cell in
# between only adds `severity_score`, so the result is the same and this cell
# looks redundant -- consider removing one of the two.
# value_counts() is called with its defaults (plain counts) rather than with a
# bare positional 0, which is the `normalize` flag and not an axis.
freq = drug_indi_ther_outc_reac['prod_ai'].value_counts()
drug_indi_ther_outc_reac['freq_prod_ai'] = drug_indi_ther_outc_reac['prod_ai'].map(freq)

In [11]:
# Mean severity score per active ingredient, broadcast back onto every row of
# that ingredient as `drug_severity_mean`.
drug_severity = (
    drug_indi_ther_outc_reac
    .groupby('prod_ai')['severity_score']
    .mean()
)
drug_indi_ther_outc_reac['drug_severity_mean'] = (
    drug_indi_ther_outc_reac['prod_ai'].map(drug_severity)
)

In [12]:
# Number of distinct active ingredients per primaryid, broadcast back onto
# every row of that primaryid as `num_drug`.
num_drug_per_person = (
    drug_indi_ther_outc_reac
    .groupby('primaryid')['prod_ai']
    .nunique()
)
drug_indi_ther_outc_reac['num_drug'] = (
    drug_indi_ther_outc_reac['primaryid'].map(num_drug_per_person)
)

In [13]:
# Short alias for the merged frame. This binds a second name to the same
# object (no copy is made), so changes through either name are visible in both.
df = drug_indi_ther_outc_reac

## Adding more features

In [14]:
from sklearn.preprocessing import LabelEncoder

def feature_engineering_pipeline(df, top_n=30, report_year=2021, null_threshold=0.3):
    """Filter the merged adverse-event table and derive modelling features.

    Steps, in order:

    1. Drop rows whose ``indi_pt`` contains 'Product used for unknown
       indication' or whose ``pt`` contains 'Off label use'.
    2. Keep only the ``top_n`` most frequent values of ``prod_ai``, then of
       ``indi_pt``, then of ``pt``. Each filter runs on the output of the
       previous one, so the final frame can hold fewer than ``top_n`` values
       of the earlier columns.
    3. Label-encode ``prod_ai``, ``indi_pt``, ``pt`` and ``role_cod``
       (missing values are encoded as the category 'UNKNOWN').
    4. Map ``outc_cod`` to an ordinal ``severity_score`` (unknown/missing -> 1).
    5. Per-drug aggregates: ``drug_frequency`` and ``drug_avg_severity``.
    6. Per-``primaryid`` aggregates: ``num_drugs`` and ``num_reactions``.
    7. Temporal features: ``treatment_duration`` and ``report_age_years``.
    8. ``drug_indication_combo``: string pairing of the two encoded ids.
    9. ``dechal_binary``: 'Y' -> 1, everything else (incl. missing) -> 0.
    10. Role indicator columns for 'PS', 'SS' and 'C'.
    11. Drop every column whose share of nulls exceeds ``null_threshold``.
    12. Drop rows with nulls in the encoded key columns / severity score.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged table. Must contain the columns ``primaryid``, ``prod_ai``,
        ``indi_pt``, ``pt``, ``role_cod``, ``outc_cod``, ``dechal``,
        ``start_dt_yrs`` and ``end_dt_yrs``. The frame is not modified.
    top_n : int, default 30
        Number of most frequent categories kept per filtered column.
    report_year : int, default 2021
        Reference year for ``report_age_years``; a missing ``start_dt_yrs``
        is treated as this year (age 0).
    null_threshold : float, default 0.3
        Columns whose fraction of nulls is strictly greater than this value
        are dropped.

    Returns
    -------
    tuple
        ``(engineered_frame, encoders)`` where ``encoders`` maps
        'prod_ai_encoder', 'indi_pt_encoder', 'pt_encoder' and
        'role_cod_encoder' to the fitted ``LabelEncoder`` objects.

    Raises
    ------
    KeyError
        If any required input column is missing.

    Notes
    -----
    NOTE(review): ``drug_avg_severity`` is computed from ``severity_score``
    and ``num_reactions`` from ``pt``; if those columns are later used as
    prediction targets, these features leak target information computed on
    the full data set. Confirm this is acceptable for the intended model.
    """
    required_columns = [
        'primaryid', 'prod_ai', 'indi_pt', 'pt', 'role_cod',
        'outc_cod', 'dechal', 'start_dt_yrs', 'end_dt_yrs',
    ]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        # Fail before any work is done, and name every missing column at once.
        raise KeyError(
            f"feature_engineering_pipeline: missing required columns: {missing_columns}"
        )

    print("Starting feature engineering...")
    print(f"Initial shape: {df.shape}")

    # 1. REMOVE UNINFORMATIVE ROWS
    # The patterns are literal text, so regex matching is switched off.
    unknown_indication = df['indi_pt'].str.contains(
        'Product used for unknown indication', na=False, regex=False)
    off_label_reaction = df['pt'].str.contains(
        'Off label use', na=False, regex=False)
    df = df[~(unknown_indication | off_label_reaction)]
    print(f"After removing unknown indications: {df.shape}")

    # 2. FILTER TOP CATEGORIES
    # Order matters: every filter sees only the rows kept by the previous one.
    for column, label in (('prod_ai', 'prod_ai'),
                          ('indi_pt', 'indications'),
                          ('pt', 'reactions')):
        top_values = df[column].value_counts().head(top_n).index
        df = df[df[column].isin(top_values)]
        print(f"After filtering top {top_n} {label}: {df.shape}")

    # Everything above produced filtered slices. Take one explicit copy so the
    # column assignments below do not raise SettingWithCopyWarning and can
    # never write through to the caller's frame.
    df = df.copy()

    # 3. ENCODE CATEGORICAL VARIABLES
    encoders = {}
    for column in ('prod_ai', 'indi_pt', 'pt', 'role_cod'):
        encoder = LabelEncoder()
        df[f'{column}_encoded'] = encoder.fit_transform(df[column].fillna('UNKNOWN'))
        encoders[f'{column}_encoder'] = encoder

    # 4. CREATE SEVERITY FEATURES
    # Map outcome codes to numeric severity
    severity_map = {
        'OT': 1,  # Other
        'RI': 2,  # Required Intervention
        'HO': 3,  # Hospitalization
        'DS': 4,  # Disability
        'LT': 5,  # Life-threatening
        'CA': 6,  # Congenital Anomaly
        'DE': 6   # Death
    }
    # Unmapped or missing outcome codes fall back to the lowest severity.
    df['severity_score'] = df['outc_cod'].map(severity_map).fillna(1)

    # 5. CREATE DRUG-LEVEL AGGREGATIONS
    # Drug frequency: number of remaining rows per active ingredient.
    drug_freq = df.groupby('prod_ai').size()
    df['drug_frequency'] = df['prod_ai'].map(drug_freq)

    # Average severity for each drug
    drug_severity = df.groupby('prod_ai')['severity_score'].mean()
    df['drug_avg_severity'] = df['prod_ai'].map(drug_severity)

    # 6. CREATE PER-primaryid FEATURES
    # Distinct drugs per primaryid (polypharmacy indicator)
    drugs_per_patient = df.groupby('primaryid')['prod_ai'].nunique()
    df['num_drugs'] = df['primaryid'].map(drugs_per_patient)

    # Distinct reactions per primaryid
    reactions_per_patient = df.groupby('primaryid')['pt'].nunique()
    df['num_reactions'] = df['primaryid'].map(reactions_per_patient)

    # 7. CREATE TEMPORAL FEATURES
    # Duration is 0 whenever either year is missing.
    df['treatment_duration'] = (df['end_dt_yrs'] - df['start_dt_yrs']).fillna(0)

    # Years between the start year and the reference year; a missing start
    # year is treated as the reference year itself.
    df['report_age_years'] = report_year - df['start_dt_yrs'].fillna(report_year)

    # 8. CREATE INTERACTION FEATURES
    # Drug-indication interaction, e.g. '17_11'
    df['drug_indication_combo'] = (df['prod_ai_encoded'].astype(str) + '_' +
                                   df['indi_pt_encoded'].astype(str))

    # 9. HANDLE DECHAL (dechallenge) - binary feature
    # Y=1, N=0, any other code or NaN=0
    df['dechal_binary'] = df['dechal'].map({'Y': 1, 'N': 0}).fillna(0)

    # 10. CREATE ROLE-BASED FEATURES
    df['is_primary_suspect'] = (df['role_cod'] == 'PS').astype(int)
    df['is_secondary_suspect'] = (df['role_cod'] == 'SS').astype(int)
    df['is_concomitant'] = (df['role_cod'] == 'C').astype(int)

    # 11. DROP COLUMNS WITH TOO MANY NULLS
    null_pct = df.isnull().mean()
    cols_to_drop = null_pct[null_pct > null_threshold].index.tolist()
    print(f"Dropping columns with >{null_threshold:.0%} nulls: {cols_to_drop}")
    df = df.drop(columns=cols_to_drop, errors='ignore')

    # 12. FINAL CLEANUP
    # Drop remaining nulls in key columns
    df = df.dropna(subset=['prod_ai_encoded', 'indi_pt_encoded',
                           'pt_encoded', 'severity_score'])

    print(f"Final shape after feature engineering: {df.shape}")

    return df, encoders

  

In [15]:
# Run the feature pipeline on the merged frame. It returns the engineered
# frame together with a dict of the fitted label encoders.
df_engineered, encoders = feature_engineering_pipeline(df=drug_indi_ther_outc_reac)

Starting feature engineering...
Initial shape: (2912330, 22)
After removing unknown indications: (2381316, 22)
After filtering top 30 prod_ai: (760431, 22)
After filtering top 30 indications: (638269, 22)
After filtering top 30 reactions: (260498, 22)
Dropping columns with >30% nulls: ['start_dt_date', 'end_dt_date']
Final shape after feature engineering: (260498, 35)


In [16]:
# Show the engineered frame; the rendering below is truncated to the first
# and last rows and columns.
df_engineered

Unnamed: 0,primaryid,drug_seq,role_cod,prod_ai,dechal,indi_drug_seq,indi_pt,dsg_drug_seq,start_dt,end_dt,...,drug_avg_severity,num_drugs,num_reactions,treatment_duration,report_age_years,drug_indication_combo,dechal_binary,is_primary_suspect,is_secondary_suspect,is_concomitant
1328,79512844,19,C,ACETAMINOPHEN,U,19,Premedication,19,2006.0,2006.0,...,2.177010,1,1,0.0,15.0,0_23,0.0,0,0,1
1339,79512844,19,C,ACETAMINOPHEN,U,19,Premedication,19,2006.0,2006.0,...,2.177010,1,1,0.0,15.0,0_23,0.0,0,0,1
1350,79512844,19,C,ACETAMINOPHEN,U,19,Premedication,19,2006.0,2006.0,...,2.177010,1,1,0.0,15.0,0_23,0.0,0,0,1
6031,81711717,29,I,OMEPRAZOLE,U,29,Dyspepsia,29,20110525.0,,...,2.147281,1,5,0.0,10.0,17_11,0.0,0,0,0
6044,81711717,29,I,OMEPRAZOLE,U,29,Dyspepsia,29,20110525.0,,...,2.147281,1,5,0.0,10.0,17_11,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912103,9474120138,8,SS,OCTREOTIDE ACETATE,D,8,Carcinoid tumour,8,20130409.0,20130423.0,...,2.085234,1,10,0.0,8.0,15_7,0.0,0,1,0
2912107,9474120138,8,SS,OCTREOTIDE ACETATE,D,8,Carcinoid tumour,8,20130409.0,20130423.0,...,2.085234,1,10,0.0,8.0,15_7,0.0,0,1,0
2912110,9474120138,8,SS,OCTREOTIDE ACETATE,D,8,Carcinoid tumour,8,20130409.0,20130423.0,...,2.085234,1,10,0.0,8.0,15_7,0.0,0,1,0
2912134,9474120138,8,SS,OCTREOTIDE ACETATE,D,8,Carcinoid tumour,8,20130409.0,20130423.0,...,2.085234,1,10,0.0,8.0,15_7,0.0,0,1,0


In [18]:
def create_model_features(df, feature_columns=None):
    """Split an engineered frame into a feature matrix and two target vectors.

    Plays the role a VectorAssembler has in the final code: it selects the
    model input columns and returns them next to the targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Engineered frame. Must contain ``pt_encoded`` and ``severity_score``.
    feature_columns : list of str, optional
        Columns to use as features. Defaults to the 13 engineered columns
        listed below. Columns absent from ``df`` are skipped and reported on
        stdout rather than raising.

    Returns
    -------
    tuple
        ``(X, y_reaction, y_severity)`` -- copies of the selected feature
        columns, of ``pt_encoded`` and of ``severity_score``.

    Raises
    ------
    KeyError
        If ``pt_encoded`` or ``severity_score`` is missing from ``df``.

    Notes
    -----
    NOTE(review): the default feature list contains ``drug_avg_severity`` and
    ``num_reactions``. If these were derived from ``severity_score`` / ``pt``
    (as the feature pipeline in this notebook does), they leak information
    about the returned targets -- confirm before training on them.
    """
    if feature_columns is None:
        feature_columns = [
            'prod_ai_encoded',
            'indi_pt_encoded',
            'role_cod_encoded',
            'drug_frequency',
            'drug_avg_severity',
            'num_drugs',
            'num_reactions',
            'treatment_duration',
            'report_age_years',
            'dechal_binary',
            'is_primary_suspect',
            'is_secondary_suspect',
            'is_concomitant'
        ]

    # Upstream steps may have dropped some columns; keep going without them,
    # but say which ones are missing instead of skipping them silently.
    missing_columns = [col for col in feature_columns if col not in df.columns]
    if missing_columns:
        print(f"Skipping features not present in the frame: {missing_columns}")
    feature_columns = [col for col in feature_columns if col in df.columns]

    X = df[feature_columns].copy()

    # Target variables (reaction type or severity)
    y_reaction = df['pt_encoded'].copy()  # Predicting reaction type
    y_severity = df['severity_score'].copy()  # Predicting severity

    print(f"Feature matrix shape: {X.shape}")
    print(f"Features used: {feature_columns}")

    return X, y_reaction, y_severity

In [19]:
# Build the feature matrix and the two targets (reaction type, severity)
# from the engineered frame.
X, y_reaction, y_severity = create_model_features(df_engineered)

Feature matrix shape: (260498, 13)
Features used: ['prod_ai_encoded', 'indi_pt_encoded', 'role_cod_encoded', 'drug_frequency', 'drug_avg_severity', 'num_drugs', 'num_reactions', 'treatment_duration', 'report_age_years', 'dechal_binary', 'is_primary_suspect', 'is_secondary_suspect', 'is_concomitant']


In [20]:
# Write the feature matrix and both targets to CSV files in the working
# directory, without the index column.
for csv_path, table in (
    ('X_features.csv', X),
    ('y_reaction.csv', y_reaction),
    ('y_severity.csv', y_severity),
):
    table.to_csv(csv_path, index=False)