In [None]:
from google.colab import files
import pandas as pd
from IPython.display import display
import unicodedata
import re


# ===========================================================
#  STEP 0 — Upload file
# ===========================================================
# Opens the Colab upload dialog; the keys of `uploaded` are the uploaded file names.
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields an empty mapping: fail with a clear message
    # instead of an IndexError on the file-name lookup.
    raise RuntimeError("No file was uploaded; re-run this cell and select the Excel workbook.")

# Only the first uploaded file is used by the rest of the notebook.
file_name = next(iter(uploaded))
print("File uploaded:", file_name)


Saving ARV_JL_MBVR (1).xlsx to ARV_JL_MBVR (1) (15).xlsx
File uploaded: ARV_JL_MBVR (1) (15).xlsx


In [None]:
def remove_accents(text):
    """Return ``text`` as a string with its diacritics stripped.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Any other value is converted with ``str``, decomposed with
    NFKD, and every character without an ASCII encoding is dropped.
    """
    if not pd.isna(text):
        decomposed = unicodedata.normalize('NFKD', str(text))
        ascii_bytes = decomposed.encode('ASCII', 'ignore')
        return ascii_bytes.decode('utf-8')
    return text

def strip_parentheses(text):
    """Drop every ' (...)' group from a string and tidy the spacing.

    Missing values are returned unchanged. Otherwise the value is converted
    with ``str``, each parenthesised group (with the whitespace in front of
    it) is removed, runs of whitespace collapse to a single space and the
    result is trimmed.
    """
    if pd.isna(text):
        return text
    without_groups = re.sub(r"\s*\([^)]*\)", "", str(text))  # remove ' (...)'
    collapsed = re.sub(r"\s+", " ", without_groups)          # normalize spaces
    return collapsed.strip()



In [None]:
# ===========================================================
#  STEP 1 — RAW ARVs ("Data brutes" sheet)
# ===========================================================
col_name = "ARV présents dans NADIS"

# Read the sheet, keep only the ARV column and discard rows where it is missing.
df = pd.read_excel(file_name, sheet_name="Data brutes")[[col_name]].dropna()

print("\nDonnées brutes ARV :")
display(df.head(10))

# 1) Normalisation: cast to text, upper-case, trim surrounding whitespace.
df = df.assign(**{col_name: df[col_name].astype(str).str.upper().str.strip()})
print("\nÉtape 1 – Normalisation ARV :")
display(df.head(10))

# 2) Split sur "+"
def split_plus(value):
    """Split a '+'-separated ARV string into its trimmed, non-empty parts.

    Returns ``None`` for a missing value or when no non-empty part remains.
    """
    if pd.isna(value):
        return None
    parts = []
    for piece in value.split("+"):
        piece = piece.strip()
        if piece != "":
            parts.append(piece)
    return parts or None

# Each cell becomes the list of its '+'-separated components (None when empty).
df["ARV Brutes"] = df[col_name].map(split_plus)
print("\nÉtape 2 – Colonne 'ARV Brutes' après split '+':")
display(df.head(10))

# 3) Explode: one row per raw ARV; rows left without a component are dropped.
df_exploded = df.explode("ARV Brutes").dropna(subset=["ARV Brutes"])
print("\nÉtape 3 – Après explode :")
display(df_exploded.head(10))

# 4) Unique list of raw ARVs (table 1).
# Accents are stripped BEFORE de-duplicating: de-duplicating first would keep
# two spellings of the same name that differ only by an accent as separate
# rows, and they would then become identical duplicates once accents are removed.
df_unique = (
    df_exploded[["ARV Brutes"]]
    .rename(columns={"ARV Brutes": "ARV_BRUTES"})
    .assign(ARV_BRUTES=lambda frame: frame["ARV_BRUTES"].apply(remove_accents))
    .drop_duplicates()
    .reset_index(drop=True)
)

print("\nTable 1 – ARV_BRUTES (uniques) :")
display(df_unique.head(10))
print("\nTotal ARV uniques :", len(df_unique))



Données brutes ARV :


Unnamed: 0,ARV présents dans NADIS
0,Combivir + Kalétra
1,Isentress + Truvada
2,Isentress + Kivexa
3,Triumeq
4,Genvoya
5,Biktarvy
6,Lamivudine / Dolutégravir (Dovato)
7,Rilpivirine / Dolutégravir (Juluca)
8,Retrovir + Videx
9,Retrovir + Videx + Invirase



Étape 1 – Normalisation ARV :


Unnamed: 0,ARV présents dans NADIS
0,COMBIVIR + KALÉTRA
1,ISENTRESS + TRUVADA
2,ISENTRESS + KIVEXA
3,TRIUMEQ
4,GENVOYA
5,BIKTARVY
6,LAMIVUDINE / DOLUTÉGRAVIR (DOVATO)
7,RILPIVIRINE / DOLUTÉGRAVIR (JULUCA)
8,RETROVIR + VIDEX
9,RETROVIR + VIDEX + INVIRASE



Étape 2 – Colonne 'ARV Brutes' après split '+':


Unnamed: 0,ARV présents dans NADIS,ARV Brutes
0,COMBIVIR + KALÉTRA,"[COMBIVIR, KALÉTRA]"
1,ISENTRESS + TRUVADA,"[ISENTRESS, TRUVADA]"
2,ISENTRESS + KIVEXA,"[ISENTRESS, KIVEXA]"
3,TRIUMEQ,[TRIUMEQ]
4,GENVOYA,[GENVOYA]
5,BIKTARVY,[BIKTARVY]
6,LAMIVUDINE / DOLUTÉGRAVIR (DOVATO),[LAMIVUDINE / DOLUTÉGRAVIR (DOVATO)]
7,RILPIVIRINE / DOLUTÉGRAVIR (JULUCA),[RILPIVIRINE / DOLUTÉGRAVIR (JULUCA)]
8,RETROVIR + VIDEX,"[RETROVIR, VIDEX]"
9,RETROVIR + VIDEX + INVIRASE,"[RETROVIR, VIDEX, INVIRASE]"



Étape 3 – Après explode :


Unnamed: 0,ARV présents dans NADIS,ARV Brutes
0,COMBIVIR + KALÉTRA,COMBIVIR
0,COMBIVIR + KALÉTRA,KALÉTRA
1,ISENTRESS + TRUVADA,ISENTRESS
1,ISENTRESS + TRUVADA,TRUVADA
2,ISENTRESS + KIVEXA,ISENTRESS
2,ISENTRESS + KIVEXA,KIVEXA
3,TRIUMEQ,TRIUMEQ
4,GENVOYA,GENVOYA
5,BIKTARVY,BIKTARVY
6,LAMIVUDINE / DOLUTÉGRAVIR (DOVATO),LAMIVUDINE / DOLUTÉGRAVIR (DOVATO)



Table 1 – ARV_BRUTES (uniques) :


Unnamed: 0,ARV_BRUTES
0,COMBIVIR
1,KALETRA
2,ISENTRESS
3,TRUVADA
4,KIVEXA
5,TRIUMEQ
6,GENVOYA
7,BIKTARVY
8,LAMIVUDINE / DOLUTEGRAVIR (DOVATO)
9,RILPIVIRINE / DOLUTEGRAVIR (JULUCA)



Total ARV uniques : 218


In [None]:
# ===========================================================
#  STEP 2 — DCI + SPECIALITE ("Analyse" sheet)
# ===========================================================
df_dci = pd.read_excel(file_name, sheet_name="Analyse")

# Keep only the first 53 data rows (positional index 0 → 52)
df_dci = df_dci.iloc[0:53]

# Keep only DCI + SPECIALITE and drop rows without a DCI.
# .copy() detaches the result from the sliced frame before columns are reassigned below.
df_dci = df_dci[["DCI", "SPECIALITE contenant la molécule"]].dropna(subset=["DCI"]).copy()

print("\nDonnées brutes DCI (lignes 1–53) :")
display(df_dci.head(10))

# Normalise DCI (no missing values left here, so astype(str) is safe)
df_dci["DCI"] = (
    df_dci["DCI"]
    .astype(str)
    .apply(remove_accents)
    .str.upper()
    .str.strip()
)

# Normalise SPECIALITE.
# Missing cells are kept as missing: astype(str) would turn NaN into the text
# "nan" (then "NAN"), which would be exported as if it were a speciality name.
df_dci["SPECIALITE contenant la molécule"] = (
    df_dci["SPECIALITE contenant la molécule"]
    .map(lambda value: value if pd.isna(value) else remove_accents(str(value)).upper().strip())
)

print("\nAprès normalisation DCI + SPECIALITE (1–53) :")
display(df_dci.head(10))

# NOTE: df_dci["DCI"] still holds the "="-joined synonyms at this point;
# sections 2A and 2B below each work from their own copy of df_dci.

# ===========================================================
# 2A — Associations pour ARV_to_DCI (DCI AVANT split "=")
#      ✅ keep SPECIALITE aliases from parentheses
# ===========================================================

def split_specialite(value):
    """Split a comma-separated SPECIALITE cell into trimmed, non-empty names.

    Returns ``None`` when the cell is missing or holds no non-empty name.
    """
    if pd.isna(value):
        return None
    trimmed = (chunk.strip() for chunk in str(value).split(","))
    names = [name for name in trimmed if name != ""]
    return names or None

def specialite_aliases(spec):
    """
    List the SPECIALITE keys that may match an ARV_BRUTES value.

    The input is accent-stripped, upper-cased and trimmed, then two kinds of
    keys are collected, in this order:
    - the text with every parenthesised group removed (if non-empty)
    - the text inside each parenthesised group (if non-empty)

    Duplicates are removed while keeping first-seen order. A missing input
    yields an empty list.
    """
    if pd.isna(spec):
        return []
    normalized = remove_accents(str(spec)).upper().strip()

    candidates = []

    # 1) base name: everything outside the parentheses
    base = strip_parentheses(normalized).strip()
    if base:
        candidates.append(base)

    # 2) aliases: content of each parenthesised group (there can be several)
    for group in re.findall(r"\(([^)]*)\)", normalized):
        alias = remove_accents(group).upper().strip()
        if alias:
            candidates.append(alias)

    # dict keys keep insertion order, which gives an order-preserving de-duplication
    return list(dict.fromkeys(candidates))


df_dci_base = df_dci.copy()

# Normalise SPECIALITE but keep the parentheses: they carry the aliases extracted below.
# Missing cells are passed through untouched (no astype(str), which would turn NaN
# into the literal text "nan") so that split_specialite can discard them.
df_dci_base["SPECIALITE contenant la molécule"] = (
    df_dci_base["SPECIALITE contenant la molécule"]
    .map(lambda value: value if pd.isna(value) else remove_accents(str(value)).upper().strip())
)

# Split the comma-separated list first: one row per speciality
df_dci_base["SPECIALITE_LIST"] = df_dci_base["SPECIALITE contenant la molécule"].apply(split_specialite)
df_spec_dci_raw = df_dci_base.explode("SPECIALITE_LIST")
df_spec_dci_raw = df_spec_dci_raw[df_spec_dci_raw["SPECIALITE_LIST"].notna()].copy()

# Then one row per alias (name outside parentheses + names inside parentheses)
df_spec_dci_raw["SPECIALITE_ALIASES"] = df_spec_dci_raw["SPECIALITE_LIST"].apply(specialite_aliases)
df_spec_dci_raw = df_spec_dci_raw.explode("SPECIALITE_ALIASES")
df_spec_dci_raw = df_spec_dci_raw[df_spec_dci_raw["SPECIALITE_ALIASES"].notna()].copy()

df_spec_dci_raw = df_spec_dci_raw.rename(columns={"SPECIALITE_ALIASES": "SPECIALITE"})

# Keep only the distinct (DCI, SPECIALITE) pairs
df_spec_dci_raw = (
    df_spec_dci_raw[["DCI", "SPECIALITE"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

print("\nTable 2 (RAW associations) – DCI/SPECIALITE (with parentheses aliases):")
display(df_spec_dci_raw.head(10))

# SPECIALITE → sorted list of its DCI (used by STEP 4), plus a ", "-joined string form
spec_to_dci = (
    df_spec_dci_raw.groupby("SPECIALITE")["DCI"]
    .apply(lambda s: sorted(set(s)))
    .reset_index()
)
spec_to_dci["DCI_LIST_STR"] = spec_to_dci["DCI"].apply(lambda lst: ", ".join(lst))

print("\nAssociation SPECIALITE → DCI (with parentheses aliases):")
display(spec_to_dci.head(10))


# ===========================================================
# 2B — Table 2 pour l’export DCI_SPECIALITE (AVEC split "=")
# ===========================================================
def split_equal(value):
    """Split an '='-separated DCI string into its trimmed, non-empty synonyms.

    Returns ``None`` when the value is missing or no non-empty synonym remains.
    """
    if pd.isna(value):
        return None
    synonyms = []
    for chunk in str(value).split("="):
        chunk = chunk.strip()
        if chunk != "":
            synonyms.append(chunk)
    return synonyms or None

# Attach the list of '='-separated synonyms to every DCI row
df_dci_split = df_dci.assign(DCI_LIST=df_dci["DCI"].apply(split_equal))

# One row per synonym; the synonym then replaces the original DCI value
df_dci_expanded = df_dci_split.explode("DCI_LIST").dropna(subset=["DCI_LIST"])
df_dci_expanded = (
    df_dci_expanded
    .assign(DCI=df_dci_expanded["DCI_LIST"])
    .drop(columns=["DCI_LIST"])
)

print("\nDCI après split '=' (pour Table 2) :")
display(df_dci_expanded.head(10))

# Split the comma-separated SPECIALITE cell into a list
df_dci_expanded["SPECIALITE_LIST"] = (
    df_dci_expanded["SPECIALITE contenant la molécule"].apply(split_specialite)
)

# One row per (synonym, speciality) pair, de-duplicated
df_spec_dci = (
    df_dci_expanded
    .explode("SPECIALITE_LIST")
    .dropna(subset=["SPECIALITE_LIST"])
    .rename(columns={"SPECIALITE_LIST": "SPECIALITE"})
    [["DCI", "SPECIALITE"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

print("\nTable 2 – DCI/SPECIALITE après split '=' (final) :")
display(df_spec_dci.head(10))



Données brutes DCI (lignes 1–53) :


Unnamed: 0,DCI,SPECIALITE contenant la molécule
0,abacavir,"ZIAGEN, KIVEXA, TRIUMEQ, TRIZIVIR"
1,adefovir,"PREVEON, HEPSERA"
2,alovudine,MIV310
3,amprenavir,AGENERASE
4,amdoxovir,
5,aplaviroc,aplaviroc
6,Apricitabine,
7,atazanavir,"REYATAZ, EVOTAZ"
8,bictegravir=GS9883,BIKTARVY
9,BMS955176=GSK3532795,



Après normalisation DCI + SPECIALITE (1–53) :


Unnamed: 0,DCI,SPECIALITE contenant la molécule
0,ABACAVIR,"ZIAGEN, KIVEXA, TRIUMEQ, TRIZIVIR"
1,ADEFOVIR,"PREVEON, HEPSERA"
2,ALOVUDINE,MIV310
3,AMPRENAVIR,AGENERASE
4,AMDOXOVIR,NAN
5,APLAVIROC,APLAVIROC
6,APRICITABINE,NAN
7,ATAZANAVIR,"REYATAZ, EVOTAZ"
8,BICTEGRAVIR=GS9883,BIKTARVY
9,BMS955176=GSK3532795,NAN



Table 2 (RAW associations) – DCI/SPECIALITE (with parentheses aliases):


Unnamed: 0,DCI,SPECIALITE
0,ABACAVIR,ZIAGEN
1,ABACAVIR,KIVEXA
2,ABACAVIR,TRIUMEQ
3,ABACAVIR,TRIZIVIR
4,ADEFOVIR,PREVEON
5,ADEFOVIR,HEPSERA
6,ALOVUDINE,MIV310
7,AMPRENAVIR,AGENERASE
8,AMDOXOVIR,NAN
9,APLAVIROC,APLAVIROC



Association SPECIALITE → DCI (with parentheses aliases):


Unnamed: 0,SPECIALITE,DCI,DCI_LIST_STR
0,ABT 378,[LOPINAVIR],LOPINAVIR
1,AGENERASE,[AMPRENAVIR],AMPRENAVIR
2,ALUVIA,"[LOPINAVIR, RITONAVIR]","LOPINAVIR, RITONAVIR"
3,APLAVIROC,[APLAVIROC],APLAVIROC
4,APTIVUS,[TIPRANAVIR],TIPRANAVIR
5,ATRIPLA,"[EFAVIRENZ, EMTRICITABINE=FTC=F=EMIVIRINE, TEN...","EFAVIRENZ, EMTRICITABINE=FTC=F=EMIVIRINE, TENO..."
6,BIKTARVY,"[BICTEGRAVIR=GS9883, EMTRICITABINE=FTC=F=EMIVI...","BICTEGRAVIR=GS9883, EMTRICITABINE=FTC=F=EMIVIR..."
7,CELSENTRI,[MARAVIROC],MARAVIROC
8,COVIRACIL,[EMTRICITABINE=FTC=F=EMIVIRINE],EMTRICITABINE=FTC=F=EMIVIRINE
9,CRIVIXAN,[INDINAVIR],INDINAVIR



DCI après split '=' (pour Table 2) :


Unnamed: 0,DCI,SPECIALITE contenant la molécule
0,ABACAVIR,"ZIAGEN, KIVEXA, TRIUMEQ, TRIZIVIR"
1,ADEFOVIR,"PREVEON, HEPSERA"
2,ALOVUDINE,MIV310
3,AMPRENAVIR,AGENERASE
4,AMDOXOVIR,NAN
5,APLAVIROC,APLAVIROC
6,APRICITABINE,NAN
7,ATAZANAVIR,"REYATAZ, EVOTAZ"
8,BICTEGRAVIR,BIKTARVY
8,GS9883,BIKTARVY



Table 2 – DCI/SPECIALITE après split '=' (final) :


Unnamed: 0,DCI,SPECIALITE
0,ABACAVIR,ZIAGEN
1,ABACAVIR,KIVEXA
2,ABACAVIR,TRIUMEQ
3,ABACAVIR,TRIZIVIR
4,ADEFOVIR,PREVEON
5,ADEFOVIR,HEPSERA
6,ALOVUDINE,MIV310
7,AMPRENAVIR,AGENERASE
8,AMDOXOVIR,NAN
9,APLAVIROC,APLAVIROC


In [None]:
# ===========================================================
# STEP 3 — LISTE_DCI (1 row per DCI, 1 column per '=' value)
# ===========================================================

# Work from the un-exploded DCI column: one row per distinct DCI string
df_liste_dci = df_dci[["DCI"]].dropna().drop_duplicates().reset_index(drop=True)

# TENOFOVIR (parent molecule) is hard-coded: append it when no row already equals it
if "TENOFOVIR" not in (
    df_liste_dci["DCI"].astype(str).apply(remove_accents).str.upper().str.strip().tolist()
):
    df_liste_dci = pd.concat(
        [df_liste_dci, pd.DataFrame({"DCI": ["TENOFOVIR"]})],
        ignore_index=True
    )

# Break each DCI string into its trimmed, non-empty '='-separated components
df_liste_dci["DCI_SPLIT"] = df_liste_dci["DCI"].map(
    lambda raw: [part.strip() for part in str(raw).split("=") if part.strip() != ""]
)

# The longest component list decides how many DCI_n columns are created
max_len = df_liste_dci["DCI_SPLIT"].map(len).max()

# Column DCI_n holds the n-th component, or None when the row has fewer components
for i in range(max_len):
    df_liste_dci[f"DCI_{i+1}"] = [
        components[i] if i < len(components) else None
        for components in df_liste_dci["DCI_SPLIT"]
    ]

# Only the DCI_n columns are kept
df_liste_dci = df_liste_dci.drop(columns=["DCI", "DCI_SPLIT"])

print("\nTable 4 – LISTE_DCI (DCI split into columns, TENOFOVIR forced):")
display(df_liste_dci.head(20))
print("Total DCI rows:", len(df_liste_dci))

# ===========================================================
# Build the DCI token set + the token -> main DCI (DCI_1) mapping
# ===========================================================

dci_cols = [col for col in df_liste_dci.columns if col.startswith("DCI_")]

# Every non-blank cell of the DCI_n columns, normalised (main names, abbreviations, codes...)
dci_set = set()
for cell in pd.unique(df_liste_dci[dci_cols].values.ravel("K")):
    if pd.isna(cell) or str(cell).strip() == "":
        continue
    dci_set.add(remove_accents(str(cell)).upper().strip())

# Every token of a row points to that row's DCI_1; a later row overwrites an earlier
# one when they share a token
token_to_main_dci = {}
for record in df_liste_dci[dci_cols].to_dict("records"):
    main = record.get("DCI_1")
    if pd.isna(main) or str(main).strip() == "":
        continue
    main_norm = remove_accents(str(main)).upper().strip()

    for c in dci_cols:
        v = record.get(c)
        if pd.isna(v) or str(v).strip() == "":
            continue
        token_to_main_dci[remove_accents(str(v)).upper().strip()] = main_norm



Table 4 – LISTE_DCI (DCI split into columns, TENOFOVIR forced):


Unnamed: 0,DCI_1,DCI_2,DCI_3,DCI_4
0,ABACAVIR,,,
1,ADEFOVIR,,,
2,ALOVUDINE,,,
3,AMPRENAVIR,,,
4,AMDOXOVIR,,,
5,APLAVIROC,,,
6,APRICITABINE,,,
7,ATAZANAVIR,,,
8,BICTEGRAVIR,GS9883,,
9,BMS955176,GSK3532795,,


Total DCI rows: 52


In [None]:

# NOTE: this cell recomputes the same three objects as the end of the STEP 3 cell.

# Columns holding the split DCI tokens (DCI_1, DCI_2, ...)
dci_cols = [name for name in df_liste_dci.columns if name.startswith("DCI_")]

# 1) Set of ALL normalised DCI tokens (main names + abbreviations + codes);
#    missing and blank cells are skipped
dci_set = {
    remove_accents(str(cell)).upper().strip()
    for cell in df_liste_dci[dci_cols].values.ravel("K")
    if pd.notna(cell) and str(cell).strip() != ""
}

# 2) Mapping from ANY token of a row to that row's MAIN DCI (DCI_1);
#    rows without a usable DCI_1 are ignored
token_to_main_dci = {}
for cells in df_liste_dci[dci_cols].itertuples(index=False, name=None):
    by_column = dict(zip(dci_cols, cells))
    main = by_column.get("DCI_1")
    if pd.isna(main) or str(main).strip() == "":
        continue
    main_norm = remove_accents(str(main)).upper().strip()

    token_to_main_dci.update(
        (remove_accents(str(token)).upper().strip(), main_norm)
        for token in cells
        if not (pd.isna(token) or str(token).strip() == "")
    )


In [None]:
# ===========================================================
# STEP 4 — Construire la table ARV_TO_DCI à partir de EXTRACTED_VALUE
# (sheet: ARV_TO_DCI)
# ===========================================================

def extract_x(arv_brutes_value):
    """
    Derive EXTRACTED_VALUE, the key used to match an ARV_BRUTES cell.

    Rule:
    - If the cell contains a parenthesised group, EXTRACTED_VALUE is the text
      inside the first such group.
    - Otherwise, or when that group is empty, EXTRACTED_VALUE is the cell text
      with any parenthesised groups removed.

    The result is whitespace-normalised, accent-stripped, upper-cased and
    trimmed so it can be compared with the DCI tokens and SPECIALITE keys.
    """
    s = str(arv_brutes_value).strip()

    # First "(...)" group. A regex search (rather than comparing the positions of
    # the first "(" and the first ")") still finds the group when a stray ")"
    # appears earlier in the text.
    match = re.search(r"\(([^)]*)\)", s)
    inside = match.group(1).strip() if match else ""

    # Empty parentheses carry no alias: fall back to the text outside them
    # instead of producing an empty key that can never match.
    x = strip_parentheses(inside) if inside else strip_parentheses(s)

    return remove_accents(x).upper().strip()  # normalize for matching


# --- Base table: one row per unique raw ARV
df_map = df_unique.copy()

# EXTRACTED_VALUE is the key used for every lookup below
df_map["EXTRACTED_VALUE"] = df_map["ARV_BRUTES"].apply(extract_x)

# True when the extracted value is one of the LISTE_DCI tokens
df_map["IS_DCI"] = df_map["EXTRACTED_VALUE"].isin(dci_set)

# SPECIALITE -> "DCI, DCI, ..." lookup with normalised keys
# (a plain dict lookup, so no index alignment is involved)
spec_to_dci_norm = spec_to_dci.assign(
    SPECIALITE=spec_to_dci["SPECIALITE"].map(
        lambda s: remove_accents(str(s)).upper().strip()
    )
)
spec_dict = dict(zip(spec_to_dci_norm["SPECIALITE"], spec_to_dci_norm["DCI_LIST_STR"]))

# Case 1: the value is a DCI token  -> its MAIN DCI (DCI_1), or itself as a fallback
# Case 2: the value is a SPECIALITE -> its associated DCI list, or "" when unknown
df_map["DCI_ASSOCIES"] = [
    token_to_main_dci.get(value, value) if is_dci else spec_dict.get(value, "")
    for value, is_dci in zip(df_map["EXTRACTED_VALUE"], df_map["IS_DCI"])
]


# ===========================================================
# Clean DCI_ASSOCIES: keep only first part before "=" + sort unique
# ===========================================================
def clean_and_sort_dci_list(val):
    """Reduce a comma-separated DCI list to its sorted, unique main names.

    Each non-blank entry keeps only the text before its first '='. An empty
    or missing ``val`` gives an empty string; otherwise the distinct names
    are returned joined with ", " in sorted order.
    """
    if not val or pd.isna(val):
        return ""
    main_names = {
        entry.strip().split("=")[0].strip()
        for entry in str(val).split(",")
        if entry.strip()
    }
    return ", ".join(sorted(main_names))

# Rewrite every DCI_ASSOCIES cell through clean_and_sort_dci_list
df_map["DCI_ASSOCIES"] = df_map["DCI_ASSOCIES"].apply(clean_and_sort_dci_list)

# 🔥 SPECIAL OVERRIDE: COMBIVIR
def override_combivir(row):
    """Return the hard-coded DCI pair for any row whose ARV_BRUTES mentions COMBIVIR.

    The check is case-insensitive and matches COMBIVIR anywhere in the text;
    every other row keeps its current DCI_ASSOCIES value.
    """
    is_combivir = "COMBIVIR" in str(row["ARV_BRUTES"]).upper()
    return "LAMIVUDINE, ZIDOVUDINE" if is_combivir else row["DCI_ASSOCIES"]

# Apply the COMBIVIR override row by row
df_map = df_map.assign(DCI_ASSOCIES=df_map.apply(override_combivir, axis=1))

# Keep only the columns written to the sheet, in their final order
df_map = df_map.loc[:, ["ARV_BRUTES", "EXTRACTED_VALUE", "DCI_ASSOCIES"]]

print("\nTable ARV_TO_DCI – ARV_BRUTES / EXTRACTED_VALUE / DCI_ASSOCIES (reasoning based on EXTRACTED_VALUE):")
display(df_map.head(20))



Table ARV_TO_DCI – ARV_BRUTES / EXTRACTED_VALUE / DCI_ASSOCIES (reasoning based on EXTRACTED_VALUE):


Unnamed: 0,ARV_BRUTES,EXTRACTED_VALUE,DCI_ASSOCIES
0,COMBIVIR,COMBIVIR,"LAMIVUDINE, ZIDOVUDINE"
1,KALETRA,KALETRA,"LOPINAVIR, RITONAVIR"
2,ISENTRESS,ISENTRESS,RALTEGRAVIR
3,TRUVADA,TRUVADA,"EMTRICITABINE, TENOFOVIR DISOPROXIL FUMARATE"
4,KIVEXA,KIVEXA,"ABACAVIR, LAMIVUDINE"
5,TRIUMEQ,TRIUMEQ,"ABACAVIR, DOLUTEGRAVIR, LAMIVUDINE"
6,GENVOYA,GENVOYA,"COBICISTAT, ELVITEGRAVIR, EMTRICITABINE, TENOF..."
7,BIKTARVY,BIKTARVY,"BICTEGRAVIR, EMTRICITABINE, TENOFOVIR ALAFENAMIDE"
8,LAMIVUDINE / DOLUTEGRAVIR (DOVATO),DOVATO,"DOLUTEGRAVIR, LAMIVUDINE"
9,RILPIVIRINE / DOLUTEGRAVIR (JULUCA),JULUCA,"DOLUTEGRAVIR, RILPIVIRINE"


In [None]:
# ===========================================================
#  STEP 5  — Save the four tables as sheets of a single Excel file
# ===========================================================
# Fixed output location under /content
output_path = "/content/OCTAVIA_cleaned_with_map.xlsx"

# One sheet per table; index=False leaves the DataFrame index out of each sheet
with pd.ExcelWriter(output_path) as writer:
    df_unique.to_excel(writer, sheet_name="ARV_BRUTES", index=False)
    df_spec_dci.to_excel(writer, sheet_name="DCI_SPECIALITE", index=False)
    df_map.to_excel(writer, sheet_name="ARV_TO_DCI", index=False)
    df_liste_dci.to_excel(writer, sheet_name="LISTE_DCI", index=False)

print(f"\n📁 Fichier Excel généré : {output_path}")

# Hand the generated workbook to google.colab's files.download
files.download(output_path)



📁 Fichier Excel généré : /content/OCTAVIA_cleaned_with_map.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>