In [4]:
import pandas as pd
import numpy as np
import mlxtend

In [5]:
#Import the data
arm_df = pd.read_csv('C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Data\\ARM_DATA/arm_landing.csv')
arm_df

Unnamed: 0,primaryid,combined_case_ls
0,193578573,"['age_bin_child', 'reaction_general_physical_c..."
1,186972702,"['age_bin_teenager', 'outc_ot', 'drug_remicade..."
2,186976496,['reaction_inappropriate_schedule_of_product_a...
3,187294742,"['drug_sultamicillin', 'age_bin_teenager', 'dr..."
4,188004601,"['reaction_xray_abnormal', 'outc_ho', 'age_bin..."
...,...,...
184031,192122501,"['reaction_abdominal_pain_upper', 'origin_us',..."
184032,237422431,"['age_bin_child', 'weight_above 60 kg', 'drug_..."
184033,220202801,"['origin_us', 'weight_above 60 kg', 'age_bin_t..."
184034,213026962,"['origin_cn', 'drug_linezolid', 'reaction_off_..."


In [6]:
print(f'Before removing duplicates: {len(arm_df)}')
print(f'After removing duplicates: {len(arm_df['combined_case_ls'].drop_duplicates())}')

Before removing duplicates: 184036
After removing duplicates: 149553


Since some of the transactions are duplicates, we need to remove as they could inflate and skew the results of ARM by returning overly optimistinc support and confidence metrics

In [7]:
arm_df_t = arm_df[['combined_case_ls']].drop_duplicates()
arm_df_t

Unnamed: 0,combined_case_ls
0,"['age_bin_child', 'reaction_general_physical_c..."
1,"['age_bin_teenager', 'outc_ot', 'drug_remicade..."
2,['reaction_inappropriate_schedule_of_product_a...
3,"['drug_sultamicillin', 'age_bin_teenager', 'dr..."
4,"['reaction_xray_abnormal', 'outc_ho', 'age_bin..."
...,...
184031,"['reaction_abdominal_pain_upper', 'origin_us',..."
184032,"['age_bin_child', 'weight_above 60 kg', 'drug_..."
184033,"['origin_us', 'weight_above 60 kg', 'age_bin_t..."
184034,"['origin_cn', 'drug_linezolid', 'reaction_off_..."


In [8]:
#Converting into a list for fpgrowth
import ast
arm_training_list = arm_df_t['combined_case_ls'].apply(ast.literal_eval).tolist()
arm_training_list

[['age_bin_child',
  'reaction_general_physical_condition_abnormal',
  'origin_de',
  'reaction_pyrexia',
  'sex_f',
  'reaction_haemophagocytic_lymphohistiocytosis',
  'reaction_blood_lactate_dehydrogenase_increased',
  'outc_ot',
  'reaction_necrosis',
  'reaction_bronchospasm',
  'outc_ho',
  'reaction_general_physical_health_deterioration',
  'reaction_pleural_effusion',
  'reaction_cardiac_arrest',
  'reaction_otitis_media',
  'reaction_haemorrhage',
  'drug_prednisolone',
  'reaction_sinus_tachycardia',
  'weight_10-20 kg',
  'reaction_haemoglobin_decreased',
  'reaction_somnolence',
  'reaction_hypocalcaemia',
  'reaction_serum_ferritin_increased',
  'reaction_hyponatraemia',
  'reaction_renal_failure',
  'reaction_circulatory_collapse',
  'drug_amoxicillinclavulanic_acid',
  'drug_canakinumab',
  'reaction_blood_uric_acid_increased',
  'drug_cotrim',
  'reaction_aspartate_aminotransferase_abnormal',
  'outc_de'],
 ['age_bin_teenager',
  'outc_ot',
  'drug_remicade',
  'weight_2

In [9]:
print(f'{len(arm_training_list)}')

149553


In [10]:
#Before doing the one hot encoding, it's important to know the #of unique itemsets
from itertools import chain

unq_items = set(chain.from_iterable(arm_training_list))
print(len(unq_items))

25385


## Association Rule Mining

Using `fpgrowth_py`

* Handles >10,000 unique items directly without needing one-hot encoding

In [None]:
from fpgrowth_py import fpgrowth

In [11]:
sample_set = arm_training_list[:10000]
len(sample_set)

10000

In [12]:
freqItemSet, rules = fpgrowth(sample_set, minSupRatio=0.005, minConf=0.9)

In [13]:
freqItemSet

[{'reaction_device_occlusion'},
 {'drug_solumedrol'},
 {'reaction_eczema'},
 {'drug_heparin'},
 {'reaction_bradycardia'},
 {'drug_sevoflurane'},
 {'drug_allopurinol'},
 {'drug_ceftazidime'},
 {'origin_fi'},
 {'origin_fi', 'sex_f'},
 {'origin_fi', 'weight_40-60 kg'},
 {'origin_fi', 'sex_f', 'weight_40-60 kg'},
 {'age_bin_teenager', 'origin_fi'},
 {'age_bin_teenager', 'origin_fi', 'weight_40-60 kg'},
 {'age_bin_teenager', 'origin_fi', 'sex_f'},
 {'age_bin_teenager', 'origin_fi', 'sex_f', 'weight_40-60 kg'},
 {'drug_hydroxyzine'},
 {'reaction_oxygen_saturation_decreased'},
 {'drug_zofran'},
 {'drug_cotrimoxazole'},
 {'reaction_feeling_abnormal'},
 {'drug_intuniv'},
 {'drug_polyethylene_glycol'},
 {'reaction_product_availability_issue'},
 {'reaction_needle_issue'},
 {'drug_remodulin'},
 {'drug_remodulin', 'origin_us'},
 {'reaction_product_quality_issue'},
 {'reaction_generalised_tonicclonic_seizure'},
 {'drug_topiramate'},
 {'reaction_rhinorrhoea'},
 {'drug_ciprofloxacin'},
 {'drug_cisplat

In [14]:
rules

[[{'origin_fi'}, {'sex_f'}, 1.0],
 [{'origin_fi'}, {'weight_40-60 kg'}, 1.0],
 [{'origin_fi'}, {'sex_f', 'weight_40-60 kg'}, 1.0],
 [{'origin_fi', 'sex_f'}, {'weight_40-60 kg'}, 1.0],
 [{'origin_fi', 'weight_40-60 kg'}, {'sex_f'}, 1.0],
 [{'origin_fi'}, {'age_bin_teenager'}, 1.0],
 [{'origin_fi'}, {'age_bin_teenager', 'weight_40-60 kg'}, 1.0],
 [{'age_bin_teenager', 'origin_fi'}, {'weight_40-60 kg'}, 1.0],
 [{'origin_fi', 'weight_40-60 kg'}, {'age_bin_teenager'}, 1.0],
 [{'origin_fi'}, {'age_bin_teenager', 'sex_f'}, 1.0],
 [{'origin_fi', 'sex_f'}, {'age_bin_teenager'}, 1.0],
 [{'age_bin_teenager', 'origin_fi'}, {'sex_f'}, 1.0],
 [{'origin_fi'}, {'age_bin_teenager', 'sex_f', 'weight_40-60 kg'}, 1.0],
 [{'origin_fi', 'sex_f'}, {'age_bin_teenager', 'weight_40-60 kg'}, 1.0],
 [{'age_bin_teenager', 'origin_fi'}, {'sex_f', 'weight_40-60 kg'}, 1.0],
 [{'origin_fi', 'weight_40-60 kg'}, {'age_bin_teenager', 'sex_f'}, 1.0],
 [{'age_bin_teenager', 'origin_fi', 'sex_f'}, {'weight_40-60 kg'}, 1.0],

## Mining the rules

In [15]:
freqItems, rules = fpgrowth(arm_training_list, minSupRatio=0.005, minConf=0.9)

In [16]:
freqItems

[{'drug_somatropin'},
 {'drug_sabril'},
 {'drug_voriconazole'},
 {'reaction_needle_issue'},
 {'drug_budesonide'},
 {'reaction_therapeutic_product_effect_incomplete'},
 {'drug_daybue'},
 {'drug_daybue', 'sex_f'},
 {'drug_daybue', 'origin_us', 'sex_f'},
 {'drug_daybue', 'origin_us'},
 {'reaction_haematochezia'},
 {'reaction_oropharyngeal_pain'},
 {'drug_doxorubicin_hydrochloride'},
 {'reaction_abdominal_discomfort'},
 {'origin_nl'},
 {'drug_melphalan'},
 {'drug_tylenol'},
 {'drug_vitamins'},
 {'drug_lacosamide'},
 {'reaction_product_storage_error'},
 {'reaction_ocular_hyperaemia'},
 {'reaction_therapeutic_response_decreased'},
 {'drug_methylphenidate'},
 {'reaction_cardiac_arrest'},
 {'reaction_epistaxis'},
 {'reaction_respiratory_failure'},
 {'reaction_device_delivery_system_issue'},
 {'reaction_anaphylactic_reaction'},
 {'drug_quetiapine'},
 {'drug_sirolimus'},
 {'drug_morphine'},
 {'reaction_skin_irritation'},
 {'origin_us', 'reaction_skin_irritation'},
 {'drug_thiotepa'},
 {'reaction

In [17]:
rules

[[{'drug_daybue'}, {'sex_f'}, 0.9567430025445293],
 [{'drug_daybue'}, {'origin_us', 'sex_f'}, 0.9567430025445293],
 [{'drug_daybue', 'sex_f'}, {'origin_us'}, 1.0],
 [{'drug_daybue', 'origin_us'}, {'sex_f'}, 0.9567430025445293],
 [{'drug_daybue'}, {'origin_us'}, 1.0],
 [{'reaction_skin_irritation'}, {'origin_us'}, 0.9733487833140209],
 [{'reaction_sickle_cell_anaemia_with_crisis'},
  {'origin_us'},
  0.9780461031833151],
 [{'reaction_hospitalisation'}, {'outc_ho'}, 0.9853095487932844],
 [{'reaction_device_physical_property_issue'},
  {'drug_genotropin'},
  0.9653808110781404],
 [{'drug_hemlibra', 'origin_us'}, {'sex_m'}, 0.9780361757105943],
 [{'drug_hemlibra'}, {'sex_m'}, 0.9813725490196078],
 [{'drug_proactiv_md_adapalene_acne_treatment',
   'reaction_skin_burning_sensation'},
  {'origin_us'},
  1.0],
 [{'age_bin_teenager', 'reaction_skin_burning_sensation'},
  {'origin_us'},
  0.9823899371069182],
 [{'reaction_skin_burning_sensation'}, {'origin_us'}, 0.9671361502347418],
 [{'origin_u

In [None]:
# Create support dictionary to calculate lift
support_dict = {tuple(sorted(item)): support for item, support in freqItems}

# Define helper functions
def count_prefix(items, prefix):
    return sum(1 for i in items if i.startswith(prefix))

def has_all_demographics(lhs, demo_features):
    return all(any(f == i for i in lhs) for f in demo_features)

# Define your full demographic feature list
# Example: Based on how you encoded them
demographic_features = [
    'demo_age_adult', 'demo_sex_female', 'demo_weight_normal', 'demo_country_US'
    # include ALL unique demographic keys you used
]

# Filter rules
filtered_rules = []
for lhs, rhs, conf in rules:
    lhs_set = set(lhs)
    rhs_set = set(rhs)

    # Conditions
    has_all_demo = has_all_demographics(lhs_set, demographic_features)
    has_drug = any(i.startswith('drug_') for i in lhs_set)
    has_reaction = any(i.startswith('reaction_') for i in rhs_set)
    has_outcome = any(i.startswith('outcome_') for i in rhs_set)

    if has_all_demo and has_drug and has_reaction and has_outcome:
        # Calculate lift
        lhs_sorted = tuple(sorted(lhs))
        rhs_sorted = tuple(sorted(rhs))
        combined = tuple(sorted(lhs + rhs))
        support_rhs = support_dict.get(rhs_sorted, 0)
        lift = conf / support_rhs if support_rhs > 0 else 0

        filtered_rules.append({
            'lhs': lhs,
            'rhs': rhs,
            'confidence': conf,
            'lift': lift
        })


In [None]:
filtered_freq_items = [i for i in freqItems if(len(i) >=7)]

ValueError: not enough values to unpack (expected 2, got 1)

In [30]:
[i for i in freqItems if(len(i) >=3)]

[{'drug_daybue', 'origin_us', 'sex_f'},
 {'drug_hemlibra', 'origin_us', 'sex_m'},
 {'drug_proactiv_md_adapalene_acne_treatment',
  'origin_us',
  'reaction_skin_burning_sensation'},
 {'age_bin_teenager', 'origin_us', 'reaction_skin_burning_sensation'},
 {'drug_genotropin', 'origin_us', 'reaction_device_use_error'},
 {'origin_us', 'reaction_haemorrhage', 'sex_m'},
 {'origin_us', 'reaction_acne', 'sex_f'},
 {'age_bin_teenager',
  'drug_proactiv_md_deep_cleansing_face_wash',
  'reaction_acne'},
 {'age_bin_teenager',
  'drug_proactiv_md_adapalene_acne_treatment',
  'drug_proactiv_md_deep_cleansing_face_wash',
  'reaction_acne'},
 {'age_bin_teenager',
  'drug_proactiv_md_adapalene_acne_treatment',
  'drug_proactiv_md_deep_cleansing_face_wash',
  'origin_us',
  'reaction_acne'},
 {'age_bin_teenager',
  'drug_proactiv_md_deep_cleansing_face_wash',
  'origin_us',
  'reaction_acne'},
 {'drug_proactiv_md_daily_oil_control_spf_30',
  'drug_proactiv_md_deep_cleansing_face_wash',
  'reaction_acne'}