In [None]:
# Upload the cleaned mapping workbook from the local machine via the Colab upload dialog.
# `uploaded` is later indexed by filename to obtain the raw bytes of the uploaded file.

from google.colab import files
uploaded = files.upload()

Saving mapping_with_combo_dcis.xlsx to mapping_with_combo_dcis (1).xlsx


In [None]:
# Load the cleaned Excel file containing the ARV-to-DCI mapping.
# The analysis uses the sheet where each row corresponds to a unique regimen.


import pandas as pd
import io

# `uploaded` is indexed by filename and yields the file's bytes. It is empty when the
# upload dialog was cancelled, in which case next(iter(...)) would raise a bare
# StopIteration; fail with an actionable message instead.
if not uploaded:
    raise ValueError(
        "No file was uploaded: re-run the upload cell and select the mapping workbook."
    )

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))

# Load the normalized dataset (unique ARV specialities mapped to DCIs)
df = pd.read_excel(io.BytesIO(uploaded[filename]),
                   sheet_name="combo_specialite_to_dci")


# Display dataset dimensions and preview
print(df.shape)
print(df.columns.tolist())
df.head(3)

(16788, 4)


Unnamed: 0,combo_original,combo_specialites,combo_dcis,Unnamed: 3
0,Combivir + Kalétra,COMBIVIR+KALETRA,LAMIVUDINE+ZIDOVUDINE+LOPINAVIR+RITONAVIR,
1,Isentress + Truvada,ISENTRESS+TRUVADA,RALTEGRAVIR+EMTRICITABINE+TENOFOVIR DISOPROXIL...,
2,Isentress + Kivexa,ISENTRESS+KIVEXA,RALTEGRAVIR+ABACAVIR+LAMIVUDINE,
3,Triumeq,TRIUMEQ,ABACAVIR+DOLUTEGRAVIR+LAMIVUDINE,
4,Genvoya,GENVOYA,COBICISTAT+ELVITEGRAVIR+EMTRICITABINE+TENOFOVI...,


In [None]:
# Remove the stray "Unnamed: 3" column if it is present.
# errors="ignore" keeps this cell safe to re-run once the column is already gone.

df = df.drop(labels=["Unnamed: 3"], axis="columns", errors="ignore")
print("After cleanup:", df.shape)
df.head(3)

In [None]:
# Build a clean DCI regimen:
#   - split the combination into individual molecules
#   - remove duplicates inside a regimen
#   - sort molecules to make their order irrelevant

import re
import numpy as np

def split_dci(x):
    """Split a '+'-separated DCI combination into a canonical list of molecules.

    Parameters
    ----------
    x : str or missing value
        Raw combination such as "LAMIVUDINE + ZIDOVUDINE". Anything for which
        ``pd.isna`` is true (NaN, None) is treated as an empty regimen.

    Returns
    -------
    list of str
        Upper-cased molecule names, de-duplicated and sorted alphabetically, so
        that two combinations listing the same molecules in a different order
        give the same list. Returns ``[]`` for missing or blank input.
    """
    if pd.isna(x):
        return []

    molecules = set()
    for part in str(x).upper().split("+"):
        # Trim the name and collapse inner whitespace runs (double spaces, tabs,
        # non-breaking spaces) so spacing variants of one molecule are not
        # counted as different molecules.
        name = " ".join(part.split())
        if name:  # skip empty pieces produced by "A++B" or a trailing "+"
            molecules.add(name)
    return sorted(molecules)

# Derive the regimen columns in one pass:
#   DCI_list - list of molecules per regimen (output of split_dci)
#   n_DCI    - number of molecules in the regimen
#   DCI_set  - canonical, order-independent identifier: the molecules joined with "|"
df = df.assign(
    DCI_list=df["combo_dcis"].map(split_dci),
    n_DCI=lambda frame: frame["DCI_list"].map(len),
    DCI_set=lambda frame: frame["DCI_list"].map("|".join),
)

# Preview the raw combination next to its canonical form
df[["combo_dcis", "DCI_set", "n_DCI"]].head(5)

Unnamed: 0,combo_dcis,DCI_set,n_DCI
0,LAMIVUDINE+ZIDOVUDINE+LOPINAVIR+RITONAVIR,LAMIVUDINE|LOPINAVIR|RITONAVIR|ZIDOVUDINE,4
1,RALTEGRAVIR+EMTRICITABINE+TENOFOVIR DISOPROXIL...,EMTRICITABINE|RALTEGRAVIR|TENOFOVIR DISOPROXIL...,3
2,RALTEGRAVIR+ABACAVIR+LAMIVUDINE,ABACAVIR|LAMIVUDINE|RALTEGRAVIR,3
3,ABACAVIR+DOLUTEGRAVIR+LAMIVUDINE,ABACAVIR|DOLUTEGRAVIR|LAMIVUDINE,3
4,COBICISTAT+ELVITEGRAVIR+EMTRICITABINE+TENOFOVI...,COBICISTAT|ELVITEGRAVIR|EMTRICITABINE|TENOFOVI...,4


In [None]:
# Sanity check: the regimen-size column must exist before it is summarised
has_size_column = "n_DCI" in df.columns
print("n_DCI exists?", has_size_column)

# Five most frequent regimen sizes (value_counts orders by descending frequency)
size_frequency = df["n_DCI"].value_counts()
size_frequency.head()

n_DCI exists? True


Unnamed: 0_level_0,count
n_DCI,Unnamed: 1_level_1
4,4674
5,3987
3,2688
6,2648
7,1320


In [None]:
# Complete regimen-size distribution, ordered by number of molecules rather than by frequency
size_distribution = df["n_DCI"].value_counts()
size_distribution.sort_index()

Unnamed: 0_level_0,count
n_DCI,Unnamed: 1_level_1
0,16
1,102
2,692
3,2688
4,4674
5,3987
6,2648
7,1320
8,466
9,146


In [None]:
# Number of unique molecular regimens (distinct canonical DCI_set identifiers).
# Rows with no parsed molecule all share the empty identifier "" and count as one regimen.
regimen_ids = df["DCI_set"]
regimen_ids.nunique()

7543

In [None]:
# ARV vs DCI counts

# Distinct values of combo_specialites. Each value is a whole brand-name
# combination (e.g. "COMBIVIR+KALETRA"), not a single ARV name.
# NOTE(review): n_arv and n_dci are therefore not at the same granularity -
# confirm whether individual speciality names were intended here.
n_arv = df["combo_specialites"].nunique()

# Distinct DCI molecules across all regimens
all_molecules = set()
for molecules in df["DCI_list"]:
    all_molecules.update(molecules)
n_dci = len(all_molecules)

n_arv, n_dci

(16788, 70)

In [None]:
# Regimen diversity
# How many distinct DCI-based regimens occur (missing identifiers excluded)
len(df["DCI_set"].dropna().unique())

7543

In [None]:
# Optional
# Shannon diversity index (natural log) of the regimen frequency distribution
import numpy as np

# Relative frequency of each regimen; value_counts only returns observed
# regimens, so every share is > 0 and the logarithm is always defined.
regimen_share = df["DCI_set"].value_counts(normalize=True)
entropy_terms = regimen_share * np.log(regimen_share)
shannon = -entropy_terms.sum()
shannon

np.float64(8.535983010285594)

In [None]:
# Regimen complexity
#
# A notebook cell only displays its LAST expression, so the mean/median pair
# used to be computed and silently discarded. All figures are now gathered in
# one labelled Series that is the cell's single output.
#
# The share of empty regimens (0 DCI) is reported as well: without it the
# single-DCI and multi-DCI percentages do not add up to 100.

n_molecules = df["n_DCI"]

complexity = pd.Series({
    "mean_DCI_per_regimen": n_molecules.mean(),
    "median_DCI_per_regimen": n_molecules.median(),
    "% empty regimens (0 DCI)": (n_molecules == 0).mean() * 100,
    "% single-DCI regimens": (n_molecules == 1).mean() * 100,
    "% multi-DCI regimens": (n_molecules > 1).mean() * 100,
})
complexity

(np.float64(0.6075768406004289), np.float64(99.297116988325))

In [None]:
# DCI structural dominance: in how many regimen rows does each molecule appear?

from collections import Counter

# DCI_list holds each molecule at most once per row, so a molecule's tally
# equals the number of rows (regimens) that contain it.
dci_counts = Counter()
for molecules in df["DCI_list"]:
    dci_counts.update(molecules)

dci_struct = pd.DataFrame(list(dci_counts.items()),
                          columns=["DCI", "Regimen_count"])

# Share of all rows containing the molecule, in percent
dci_struct["%_of_regimens"] = dci_struct["Regimen_count"] * 100 / len(df)
dci_struct.sort_values("%_of_regimens", ascending=False)

Unnamed: 0,DCI,Regimen_count,%_of_regimens
2,RITONAVIR,10207,60.799381
0,LAMIVUDINE,8025,47.802001
6,TENOFOVIR DISOPROXIL FUMARATE,5610,33.416726
7,ABACAVIR,5396,32.142006
19,TENOFOVIR,4229,25.190612
...,...,...,...
62,APRICITABINE,1,0.005957
64,BMS955176,1,0.005957
66,ENTECAVIR,1,0.005957
68,PACLITAXEL,1,0.005957


In [None]:
# Molecule dependency: raw pair co-occurrence counts.
# NOTE(review): the original heading mentioned "lift", but only co-occurrence
# counts are computed here; no lift is derived from them.

import itertools

# DCI_list is sorted, so each unordered pair always appears as the same tuple.
pairs = Counter(
    pair
    for molecules in df["DCI_list"]
    for pair in itertools.combinations(molecules, 2)
)

pairs.most_common(10)

[(('LAMIVUDINE', 'RITONAVIR'), 4857),
 (('TENOFOVIR', 'TENOFOVIR DISOPROXIL FUMARATE'), 4229),
 (('RITONAVIR', 'TENOFOVIR DISOPROXIL FUMARATE'), 3731),
 (('ABACAVIR', 'RITONAVIR'), 3431),
 (('ABACAVIR', 'LAMIVUDINE'), 3223),
 (('LOPINAVIR', 'RITONAVIR'), 3215),
 (('RITONAVIR', 'TENOFOVIR'), 2908),
 (('DIDANOSINE', 'RITONAVIR'), 2536),
 (('LAMIVUDINE', 'ZIDOVUDINE'), 2502),
 (('LAMIVUDINE', 'TENOFOVIR DISOPROXIL FUMARATE'), 2222)]

In [None]:
# Pairs vs triplets: co-occurrence counts of molecule triplets

# DCI_list is sorted, so each unordered triplet always appears as the same tuple.
triplets = Counter(
    triple
    for molecules in df["DCI_list"]
    for triple in itertools.combinations(molecules, 3)
)

triplets.most_common(10)

[(('RITONAVIR', 'TENOFOVIR', 'TENOFOVIR DISOPROXIL FUMARATE'), 2908),
 (('ABACAVIR', 'LAMIVUDINE', 'RITONAVIR'), 2098),
 (('LAMIVUDINE', 'TENOFOVIR', 'TENOFOVIR DISOPROXIL FUMARATE'), 2023),
 (('LAMIVUDINE', 'LOPINAVIR', 'RITONAVIR'), 1666),
 (('LAMIVUDINE', 'RITONAVIR', 'TENOFOVIR DISOPROXIL FUMARATE'), 1499),
 (('LAMIVUDINE', 'RITONAVIR', 'ZIDOVUDINE'), 1427),
 (('ABACAVIR', 'TENOFOVIR', 'TENOFOVIR DISOPROXIL FUMARATE'), 1422),
 (('LAMIVUDINE', 'RITONAVIR', 'TENOFOVIR'), 1413),
 (('LOPINAVIR', 'RITONAVIR', 'TENOFOVIR DISOPROXIL FUMARATE'), 1306),
 (('ABACAVIR', 'LOPINAVIR', 'RITONAVIR'), 1169)]

In [None]:
# Collect the results into ONE results workbook (one sheet per table)

import numpy as np
import pandas as pd
from collections import Counter

# The Shannon index is recomputed here with the same formula as the cell marked
# "Optional", so the export no longer fails with a NameError when that cell
# was skipped or the kernel was restarted.
regimen_share = df["DCI_set"].value_counts(normalize=True)
shannon = -(regimen_share * np.log(regimen_share)).sum()

# 1) Summary metrics (single sheet)
summary = pd.DataFrame({
    "metric": [
        "n_rows_clean",
        "unique_ARV_names",
        "unique_DCI_molecules",
        "unique_regimens (DCI_set)",
        "mean_DCI_per_regimen",
        "median_DCI_per_regimen",
        "% single-DCI regimens",
        "% multi-DCI regimens",
        "Shannon (natural log)"
    ],
    "value": [
        len(df),
        df["combo_specialites"].nunique(),
        len(set().union(*df["DCI_list"])),
        df["DCI_set"].nunique(),
        df["n_DCI"].mean(),
        df["n_DCI"].median(),
        (df["n_DCI"] == 1).mean() * 100,
        (df["n_DCI"] > 1).mean() * 100,
        shannon
    ]
})

# 2) Regimen size distribution, ordered by number of molecules
regimen_size_dist = df["n_DCI"].value_counts().sort_index().reset_index()
regimen_size_dist.columns = ["n_DCI", "count"]

# 3) Structural DCI dominance table, recomputed so this cell does not depend
#    on the earlier dominance cell having been run
dci_counts = Counter(d for row in df["DCI_list"] for d in row)
dci_struct = pd.DataFrame(dci_counts.items(), columns=["DCI", "Regimen_count"])
dci_struct["%_of_regimens"] = 100 * dci_struct["Regimen_count"] / len(df)
dci_struct = dci_struct.sort_values("%_of_regimens", ascending=False)

# Save to Excel (multiple sheets); the context manager saves and closes the file
out_path = "clean_data_results.xlsx"
with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    summary.to_excel(writer, sheet_name="Summary", index=False)
    regimen_size_dist.to_excel(writer, sheet_name="Regimen_Size", index=False)
    dci_struct.to_excel(writer, sheet_name="DCI_Dominance", index=False)

out_path

'clean_data_results.xlsx'

In [None]:
import os

# A bare os.listdir() in the middle of a cell displays nothing (only the last
# expression of a cell is shown), so print the listing to make the check visible.
print("Working directory:", sorted(os.listdir()))

# Download popup
from google.colab import files
files.download("clean_data_results.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>