# References

Dataset Source: https://figshare.com/articles/dataset/DDXPlus_Dataset/20043374

Dataset Files for local use with notebook (unzip in main dir of repo, should be ignored in .gitignore for commits/push to repo): https://gtvault-my.sharepoint.com/:f:/g/personal/agullapalli3_gatech_edu/ElSECU9qSLdAghI4zvFlXegBczZ9LFDRCFIo5SEWSeejQw?e=CKC9V8

Notes: Translate from French to English

# Imports

In [69]:
import pandas as pd
import numpy as np
import json

# Load Dataset

In [70]:
dataset_base_path = './data/original_dataset/'

In [71]:
train_data = f'{dataset_base_path}release_train_patients.csv'
val_data = f'{dataset_base_path}release_validate_patients.csv'
test_data = f'{dataset_base_path}release_test_patients.csv'

In [72]:
with open(f'{dataset_base_path}release_evidences.json', 'r', encoding='utf-8') as f:
    release_evidences = json.load(f)
release_evidences

{'fievre': {'name': 'fievre',
  'code_question': 'fievre',
  'question_fr': 'Avez-vous objectivé ou ressenti de la fièvre?',
  'question_en': 'Do you have a fever (either felt or measured with a thermometer)?',
  'is_antecedent': False,
  'default_value': 0,
  'value_meaning': {},
  'possible-values': [],
  'data_type': 'B'},
 'douleurxx_endroitducorps': {'name': 'douleurxx_endroitducorps',
  'code_question': 'douleurxx',
  'question_fr': 'Avez-vous de la douleur quelque part?',
  'question_en': 'Do you feel pain somewhere?',
  'is_antecedent': False,
  'default_value': 'nulle_part',
  'value_meaning': {'nulle_part': {'fr': 'nulle part', 'en': 'nowhere'},
   'aile_iliaque_D_': {'fr': 'aile iliaque(D)', 'en': 'iliac wing(R)'},
   'aile_iliaque_G_': {'fr': 'aile iliaque(G)', 'en': 'iliac wing(L)'},
   'aine_D_': {'fr': 'aine(D)', 'en': 'groin(R)'},
   'aine_G_': {'fr': 'aine(G)', 'en': 'groin(L)'},
   'aisselle_G_': {'fr': 'aisselle(G)', 'en': 'axilla(L)'},
   'aisselle_D_': {'fr': 'aiss

In [73]:
with open(f'{dataset_base_path}release_conditions.json', 'r', encoding='utf-8') as f:
    release_conditions = json.load(f)
release_conditions

{'Pneumothorax spontané': {'condition_name': 'Pneumothorax spontané',
  'cond-name-fr': 'Pneumothorax spontané',
  'cond-name-eng': 'Spontaneous pneumothorax',
  'icd10-id': 'J93',
  'symptoms': {'douleurxx_endroitducorps': {},
   'douleurxx': {},
   'douleurxx_irrad': {},
   'douleurxx_carac': {},
   'douleurxx_soudain': {},
   'douleurxx_intens': {},
   'douleurxx_precis': {},
   'dyspn': {},
   'ww_respi': {},
   'ww_effort': {},
   'angor_repos': {},
   'oedeme': {}},
  'antecedents': {'f17.210': {},
   'pneumothorax': {},
   'ap_pneumothorax': {},
   'j44_j42': {},
   'trav1': {}},
  'severity': 2},
 'Céphalée en grappe': {'condition_name': 'Céphalée en grappe',
  'cond-name-fr': 'Céphalée en grappe',
  'cond-name-eng': 'Cluster headache',
  'icd10-id': 'g44.009',
  'symptoms': {'douleurxx_endroitducorps': {},
   'douleurxx': {},
   'douleurxx_irrad': {},
   'douleurxx_carac': {},
   'douleurxx_soudain': {},
   'douleurxx_intens': {},
   'douleurxx_precis': {},
   'larmes': {},
  

In [74]:
train_df = pd.read_csv(train_data, encoding='utf-8')
train_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",M,IVRS ou virémie,"['crowd', 'diaph', 'douleurxx', 'douleurxx_car...",fievre
1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",M,VIH (Primo-infection),"['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '...",diaph
2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",F,Pneumonie,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",expecto
3,34,"[['IVRS ou virémie', 0.23859396799565236], ['C...",F,IVRS ou virémie,"['crowd', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx
4,36,"[['IVRS ou virémie', 0.23677812769175735], ['P...",M,IVRS ou virémie,"['dayc', 'diaph', 'douleurxx', 'douleurxx_cara...",toux
...,...,...,...,...,...,...
1025597,18,"[['Épiglottite', 0.28156957795466475], ['VIH (...",M,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",fievre
1025598,28,"[['Épiglottite', 0.3703962237298842], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_vive', 'doule...",fievre
1025599,0,"[['Épiglottite', 0.13193905052537108], ['Laryn...",F,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",stridor
1025600,26,"[['Épiglottite', 0.3028258988138983], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",stridor


In [75]:
val_df = pd.read_csv(val_data, encoding='utf-8')
val_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,55,"[['Anémie', 0.25071110167158567], ['Fibrillati...",F,Anémie,"['Mauv_aliment', 'atcd_anem', 'atcd_fam_anem',...",pale
1,10,"[['Syndrome de Guillain-Barré', 0.135558991316...",F,Attaque de panique,"['anxiete_s', 'atcdpsyfam', 'diaph', 'douleurx...",psy_depers
2,68,[['Possible influenza ou syndrome virémique ty...,F,Possible influenza ou syndrome virémique typique,"['diaph', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx
3,13,"[['Anémie', 0.18697604010451876], ['Fibrillati...",M,Anémie,"['Mauv_aliment', 'atcd_anem', 'atcd_fam_anem',...",douleurxx
4,48,"[['Syndrome de Boerhaave', 1.0]]",M,Syndrome de Boerhaave,"['douleurxx', 'douleurxx_carac_@_déchirante', ...",douleurxx
...,...,...,...,...,...,...
132443,27,"[['Pharyngite virale', 0.22702125813983617], [...",M,Pharyngite virale,"['contact', 'crowd', 'douleurxx', 'douleurxx_c...",toux
132444,57,"[['OAP/Surcharge pulmonaire', 0.12078088376840...",M,OAP/Surcharge pulmonaire,"['J81', 'douleurxx', 'douleurxx_carac_@_pénibl...",oedeme
132445,52,"[['RGO', 0.24494427036287517], ['Bronchite', 0...",F,RGO,"['douleurxx', 'douleurxx_carac_@_lancinante_/_...",pyrosis
132446,10,"[['Épiglottite', 0.2969684152571116], ['VIH (P...",M,Épiglottite,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",fievre


In [76]:
test_df = pd.read_csv(test_data, encoding='utf-8')
test_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,49,"[['Bronchite', 0.20230062181160519], ['RGO', 0...",F,RGO,"['douleurxx', 'douleurxx_carac_@_lancinante_/_...",toux
1,2,"[['Asthme exacerbé ou bronchospasme', 0.080220...",M,Bronchite,"['douleurxx', 'douleurxx_carac_@_une_brûlure_o...",douleurxx
2,49,"[['Réaction dystonique aïgue', 0.6267050848165...",M,Réaction dystonique aïgue,"['antipsy_récent', 'laryngospasme', 'nau_psy_r...",laryngospasme
3,64,"[['Bronchite', 0.2748608320637265], ['Laryngit...",M,Laryngite aigue,"['crowd', 'dayc', 'douleurxx', 'douleurxx_cara...",douleurxx
4,70,"[['IVRS ou virémie', 0.21257615919851483], ['P...",F,IVRS ou virémie,"['contact', 'diaph', 'douleurxx', 'douleurxx_c...",toux
...,...,...,...,...,...,...
134524,52,"[['Possible NSTEMI / STEMI', 0.268768209851499...",M,Lupus érythémateux disséminé (LED),"['I30', 'douleurxx', 'douleurxx_carac_@_vive',...",douleurxx
134525,88,"[['néoplasie pulmonaire', 0.09094757620611861]...",F,néoplasie pulmonaire,"['crach_sg', 'douleurxx', 'douleurxx_carac_@_u...",fatig_mod
134526,29,"[['Attaque de panique', 0.29281344656090524], ...",F,Syndrome de Boerhaave,"['douleurxx', 'douleurxx_carac_@_déchirante', ...",dyspn
134527,8,"[['Scombroïde', 0.1389590231491235], ['TSVP', ...",M,Scombroïde,"['dyspn', 'faible', 'flushing', 'lesions_peau'...",palpit


# Translate Pathology

In [77]:
def translate_pathology(df):
    def translate_pathology_french_to_english(x):
        if x in release_conditions:
            return release_conditions[x]['cond-name-eng']
        else:
            print('not found')
            return ''
    df['English Pathology'] = df['PATHOLOGY'].map(translate_pathology_french_to_english)
    return df

# Translate Evidences

In [78]:
# sample_evids = test_df['EVIDENCES'][500]
# print(sample_evids)

#we didn't have to split the symptoms

def translate_evidences(df):
    def get_qa_en(sample_evids):
        sample_evids = [evid[1:-1] for evid in sample_evids.strip('][').split(', ')]
        new_evid = []
        final_dict = {}
        for i in sample_evids:
            temp = i.split('_@_')
            new_evid.append(temp)
        for i in new_evid:
            if len(i) == 1:
                i.append('True')

        for i in new_evid:
            q_en = release_evidences[i[0]]["question_en"] 
            answ = ''
            if i[1] == "True" or i[1].isnumeric():
                answ = i[1]
            else:
                answ = release_evidences[i[0]]['value_meaning'][i[1]]['en']  
                if answ == 'Y':
                    answ = "Yes"
                elif answ == 'N':
                    answ = "No"

            final_dict[q_en] = answ # answer
        return final_dict
    df['English Evidences'] = df['EVIDENCES'].map(get_qa_en)
    return df
        
# print(get_qa_en(sample_evids))            
    
    


# Add ICD-10

In [79]:
icd10_data = './data/englishPathology2ICD.xlsx'

In [80]:
icd10_df = pd.read_excel(icd10_data)
icd10_df

Unnamed: 0,Pathology,ICD-10
0,acute copd exacerbation / infection,J44.1
1,acute dystonic reactions,G24.9
2,acute laryngitis,J04.0
3,acute otitis media,H66.9
4,acute pulmonary edema,J81.0
5,acute rhinosinusitis,J01.9
6,allergic sinusitis,J01
7,anaphylaxis,T78.0
8,anemia,D64.9
9,atrial fibrillation,I48.9


In [81]:
def add_icd10_code(df):
    def return_icd10_code(x):
        return icd10_df[icd10_df['Pathology '] == x.lower()].iloc[0]['ICD-10']
    df['ICD-10'] = df['English Pathology'].map(return_icd10_code)
    return df

# Apply Translation for Pathology & Evidences and Add ICD10 Code

In [82]:
train_df = translate_pathology(train_df)
train_df = translate_evidences(train_df)
train_df = add_icd10_code(train_df)
train_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences,ICD-10
0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",M,IVRS ou virémie,"['crowd', 'diaph', 'douleurxx', 'douleurxx_car...",fievre,URTI,"{'Do you live with 4 or more people?': 'True',...",J06.9
1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",M,VIH (Primo-infection),"['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '...",diaph,HIV (initial infection),{'Do you have swollen or painful lymph nodes?'...,B20
2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",F,Pneumonie,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",expecto,Pneumonia,"{'Do you have pain somewhere, related to your ...",J18.9
3,34,"[['IVRS ou virémie', 0.23859396799565236], ['C...",F,IVRS ou virémie,"['crowd', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx,URTI,"{'Do you live with 4 or more people?': 'True',...",J06.9
4,36,"[['IVRS ou virémie', 0.23677812769175735], ['P...",M,IVRS ou virémie,"['dayc', 'diaph', 'douleurxx', 'douleurxx_cara...",toux,URTI,{'Do you attend or work in a daycare?': 'True'...,J06.9
...,...,...,...,...,...,...,...,...,...
1025597,18,"[['Épiglottite', 0.28156957795466475], ['VIH (...",M,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",fievre,Epiglottitis,{'Do you have pain that improves when you lean...,J05.1
1025598,28,"[['Épiglottite', 0.3703962237298842], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_vive', 'doule...",fievre,Epiglottitis,"{'Do you have pain somewhere, related to your ...",J05.1
1025599,0,"[['Épiglottite', 0.13193905052537108], ['Laryn...",F,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",stridor,Epiglottitis,{'Do you have pain that improves when you lean...,J05.1
1025600,26,"[['Épiglottite', 0.3028258988138983], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",stridor,Epiglottitis,"{'Do you have pain somewhere, related to your ...",J05.1


In [83]:
val_df = translate_pathology(val_df)
val_df = translate_evidences(val_df)
val_df = add_icd10_code(val_df)
val_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences,ICD-10
0,55,"[['Anémie', 0.25071110167158567], ['Fibrillati...",F,Anémie,"['Mauv_aliment', 'atcd_anem', 'atcd_fam_anem',...",pale,Anemia,"{'Do you have a poor diet?': 'True', 'Have you...",D64.9
1,10,"[['Syndrome de Guillain-Barré', 0.135558991316...",F,Attaque de panique,"['anxiete_s', 'atcdpsyfam', 'diaph', 'douleurx...",psy_depers,Panic attack,"{'Do you feel anxious?': 'True', 'Do any membe...",F41.0
2,68,[['Possible influenza ou syndrome virémique ty...,F,Possible influenza ou syndrome virémique typique,"['diaph', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx,Influenza,{'Have you had significantly increased sweatin...,J10 or J11
3,13,"[['Anémie', 0.18697604010451876], ['Fibrillati...",M,Anémie,"['Mauv_aliment', 'atcd_anem', 'atcd_fam_anem',...",douleurxx,Anemia,"{'Do you have a poor diet?': 'True', 'Have you...",D64.9
4,48,"[['Syndrome de Boerhaave', 1.0]]",M,Syndrome de Boerhaave,"['douleurxx', 'douleurxx_carac_@_déchirante', ...",douleurxx,Boerhaave,"{'Do you have pain somewhere, related to your ...",K22.3
...,...,...,...,...,...,...,...,...,...
132443,27,"[['Pharyngite virale', 0.22702125813983617], [...",M,Pharyngite virale,"['contact', 'crowd', 'douleurxx', 'douleurxx_c...",toux,Viral pharyngitis,{'Have you been in contact with a person with ...,J02.9
132444,57,"[['OAP/Surcharge pulmonaire', 0.12078088376840...",M,OAP/Surcharge pulmonaire,"['J81', 'douleurxx', 'douleurxx_carac_@_pénibl...",oedeme,Acute pulmonary edema,{'Have you ever had fluid in your lungs?': 'Tr...,J81.0
132445,52,"[['RGO', 0.24494427036287517], ['Bronchite', 0...",F,RGO,"['douleurxx', 'douleurxx_carac_@_lancinante_/_...",pyrosis,GERD,"{'Do you have pain somewhere, related to your ...",K21
132446,10,"[['Épiglottite', 0.2969684152571116], ['VIH (P...",M,Épiglottite,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",fievre,Epiglottitis,"{'Do you have pain somewhere, related to your ...",J05.1


In [88]:
test_df = translate_pathology(test_df)
test_df = translate_evidences(test_df)
test_df = add_icd10_code(test_df)
test_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences,ICD-10
0,49,"[['Bronchite', 0.20230062181160519], ['RGO', 0...",F,RGO,"['douleurxx', 'douleurxx_carac_@_lancinante_/_...",toux,GERD,"{'Do you have pain somewhere, related to your ...",K21
1,2,"[['Asthme exacerbé ou bronchospasme', 0.080220...",M,Bronchite,"['douleurxx', 'douleurxx_carac_@_une_brûlure_o...",douleurxx,Bronchitis,"{'Do you have pain somewhere, related to your ...",J20.9
2,49,"[['Réaction dystonique aïgue', 0.6267050848165...",M,Réaction dystonique aïgue,"['antipsy_récent', 'laryngospasme', 'nau_psy_r...",laryngospasme,Acute dystonic reactions,{'Have you started or taken any antipsychotic ...,G24.9
3,64,"[['Bronchite', 0.2748608320637265], ['Laryngit...",M,Laryngite aigue,"['crowd', 'dayc', 'douleurxx', 'douleurxx_cara...",douleurxx,Acute laryngitis,"{'Do you live with 4 or more people?': 'True',...",J04.0
4,70,"[['IVRS ou virémie', 0.21257615919851483], ['P...",F,IVRS ou virémie,"['contact', 'diaph', 'douleurxx', 'douleurxx_c...",toux,URTI,{'Have you been in contact with a person with ...,J06.9
...,...,...,...,...,...,...,...,...,...
134524,52,"[['Possible NSTEMI / STEMI', 0.268768209851499...",M,Lupus érythémateux disséminé (LED),"['I30', 'douleurxx', 'douleurxx_carac_@_vive',...",douleurxx,SLE,"{'Have you ever had a pericarditis?': 'True', ...",M32.9
134525,88,"[['néoplasie pulmonaire', 0.09094757620611861]...",F,néoplasie pulmonaire,"['crach_sg', 'douleurxx', 'douleurxx_carac_@_u...",fatig_mod,Pulmonary neoplasm,"{'Have you been coughing up blood?': 'True', '...",C34.9
134526,29,"[['Attaque de panique', 0.29281344656090524], ...",F,Syndrome de Boerhaave,"['douleurxx', 'douleurxx_carac_@_déchirante', ...",dyspn,Boerhaave,"{'Do you have pain somewhere, related to your ...",K22.3
134527,8,"[['Scombroïde', 0.1389590231491235], ['TSVP', ...",M,Scombroïde,"['dyspn', 'faible', 'flushing', 'lesions_peau'...",palpit,Scombroid food poisoning,{'Are you experiencing shortness of breath or ...,T61.1


# Write DataFrames to CSV files for train-val-test splits

In [85]:
train_df.to_csv('./data/translated_train.csv', encoding='utf-8')

In [86]:
val_df.to_csv('./data/translated_val.csv', encoding='utf-8')

In [89]:
test_df.to_csv('./data/translated_test.csv', encoding='utf-8')

# Load Data Splits into DFs

In [24]:
import pandas as pd
import numpy as np

In [25]:
np.random.seed(42)

In [26]:
test_df = pd.read_csv('./data/translated_test.csv', encoding='utf-8')
test_df

Unnamed: 0.1,Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences
0,0,49,"[['Bronchite', 0.20230062181160519], ['RGO', 0...",F,RGO,"['douleurxx', 'douleurxx_carac_@_lancinante_/_...",toux,GERD,"{'Do you have pain somewhere, related to your ..."
1,1,2,"[['Asthme exacerbé ou bronchospasme', 0.080220...",M,Bronchite,"['douleurxx', 'douleurxx_carac_@_une_brûlure_o...",douleurxx,Bronchitis,"{'Do you have pain somewhere, related to your ..."
2,2,49,"[['Réaction dystonique aïgue', 0.6267050848165...",M,Réaction dystonique aïgue,"['antipsy_récent', 'laryngospasme', 'nau_psy_r...",laryngospasme,Acute dystonic reactions,{'Have you started or taken any antipsychotic ...
3,3,64,"[['Bronchite', 0.2748608320637265], ['Laryngit...",M,Laryngite aigue,"['crowd', 'dayc', 'douleurxx', 'douleurxx_cara...",douleurxx,Acute laryngitis,"{'Do you live with 4 or more people?': 'True',..."
4,4,70,"[['IVRS ou virémie', 0.21257615919851483], ['P...",F,IVRS ou virémie,"['contact', 'diaph', 'douleurxx', 'douleurxx_c...",toux,URTI,{'Have you been in contact with a person with ...
...,...,...,...,...,...,...,...,...,...
134524,134524,52,"[['Possible NSTEMI / STEMI', 0.268768209851499...",M,Lupus érythémateux disséminé (LED),"['I30', 'douleurxx', 'douleurxx_carac_@_vive',...",douleurxx,SLE,"{'Have you ever had a pericarditis?': 'True', ..."
134525,134525,88,"[['néoplasie pulmonaire', 0.09094757620611861]...",F,néoplasie pulmonaire,"['crach_sg', 'douleurxx', 'douleurxx_carac_@_u...",fatig_mod,Pulmonary neoplasm,"{'Have you been coughing up blood?': 'True', '..."
134526,134526,29,"[['Attaque de panique', 0.29281344656090524], ...",F,Syndrome de Boerhaave,"['douleurxx', 'douleurxx_carac_@_déchirante', ...",dyspn,Boerhaave,"{'Do you have pain somewhere, related to your ..."
134527,134527,8,"[['Scombroïde', 0.1389590231491235], ['TSVP', ...",M,Scombroïde,"['dyspn', 'faible', 'flushing', 'lesions_peau'...",palpit,Scombroid food poisoning,{'Are you experiencing shortness of breath or ...


In [27]:
val_df = pd.read_csv('./data/translated_val.csv', encoding='utf-8')
val_df

Unnamed: 0.1,Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences
0,0,55,"[['Anémie', 0.25071110167158567], ['Fibrillati...",F,Anémie,"['Mauv_aliment', 'atcd_anem', 'atcd_fam_anem',...",pale,Anemia,"{'Do you have a poor diet?': 'True', 'Have you..."
1,1,10,"[['Syndrome de Guillain-Barré', 0.135558991316...",F,Attaque de panique,"['anxiete_s', 'atcdpsyfam', 'diaph', 'douleurx...",psy_depers,Panic attack,"{'Do you feel anxious?': 'True', 'Do any membe..."
2,2,68,[['Possible influenza ou syndrome virémique ty...,F,Possible influenza ou syndrome virémique typique,"['diaph', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx,Influenza,{'Have you had significantly increased sweatin...
3,3,13,"[['Anémie', 0.18697604010451876], ['Fibrillati...",M,Anémie,"['Mauv_aliment', 'atcd_anem', 'atcd_fam_anem',...",douleurxx,Anemia,"{'Do you have a poor diet?': 'True', 'Have you..."
4,4,48,"[['Syndrome de Boerhaave', 1.0]]",M,Syndrome de Boerhaave,"['douleurxx', 'douleurxx_carac_@_déchirante', ...",douleurxx,Boerhaave,"{'Do you have pain somewhere, related to your ..."
...,...,...,...,...,...,...,...,...,...
132443,132443,27,"[['Pharyngite virale', 0.22702125813983617], [...",M,Pharyngite virale,"['contact', 'crowd', 'douleurxx', 'douleurxx_c...",toux,Viral pharyngitis,{'Have you been in contact with a person with ...
132444,132444,57,"[['OAP/Surcharge pulmonaire', 0.12078088376840...",M,OAP/Surcharge pulmonaire,"['J81', 'douleurxx', 'douleurxx_carac_@_pénibl...",oedeme,Acute pulmonary edema,{'Have you ever had fluid in your lungs?': 'Tr...
132445,132445,52,"[['RGO', 0.24494427036287517], ['Bronchite', 0...",F,RGO,"['douleurxx', 'douleurxx_carac_@_lancinante_/_...",pyrosis,GERD,"{'Do you have pain somewhere, related to your ..."
132446,132446,10,"[['Épiglottite', 0.2969684152571116], ['VIH (P...",M,Épiglottite,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",fievre,Epiglottitis,"{'Do you have pain somewhere, related to your ..."


In [28]:
train_df = pd.read_csv('./data/translated_train.csv', encoding='utf-8')
train_df

Unnamed: 0.1,Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences
0,0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",M,IVRS ou virémie,"['crowd', 'diaph', 'douleurxx', 'douleurxx_car...",fievre,URTI,"{'Do you live with 4 or more people?': 'True',..."
1,1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",M,VIH (Primo-infection),"['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '...",diaph,HIV (initial infection),{'Do you have swollen or painful lymph nodes?'...
2,2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",F,Pneumonie,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",expecto,Pneumonia,"{'Do you have pain somewhere, related to your ..."
3,3,34,"[['IVRS ou virémie', 0.23859396799565236], ['C...",F,IVRS ou virémie,"['crowd', 'douleurxx', 'douleurxx_carac_@_une_...",douleurxx,URTI,"{'Do you live with 4 or more people?': 'True',..."
4,4,36,"[['IVRS ou virémie', 0.23677812769175735], ['P...",M,IVRS ou virémie,"['dayc', 'diaph', 'douleurxx', 'douleurxx_cara...",toux,URTI,{'Do you attend or work in a daycare?': 'True'...
...,...,...,...,...,...,...,...,...,...
1025597,1025597,18,"[['Épiglottite', 0.28156957795466475], ['VIH (...",M,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",fievre,Epiglottitis,{'Do you have pain that improves when you lean...
1025598,1025598,28,"[['Épiglottite', 0.3703962237298842], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_vive', 'doule...",fievre,Epiglottitis,"{'Do you have pain somewhere, related to your ..."
1025599,1025599,0,"[['Épiglottite', 0.13193905052537108], ['Laryn...",F,Épiglottite,"['bw_bending', 'douleurxx', 'douleurxx_carac_@...",stridor,Epiglottitis,{'Do you have pain that improves when you lean...
1025600,1025600,26,"[['Épiglottite', 0.3028258988138983], ['Laryng...",F,Épiglottite,"['douleurxx', 'douleurxx_carac_@_un_coup_de_co...",stridor,Epiglottitis,"{'Do you have pain somewhere, related to your ..."


# Sample Test Data for project

In [90]:
num_test_samples = 10000

In [91]:
sample_test_df = test_df.sample(n=num_test_samples, random_state=42)
sample_test_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences,ICD-10
109830,21,"[['Asthme exacerbé ou bronchospasme', 0.117008...",F,Asthme exacerbé ou bronchospasme,"['dyspn', 'fam_j45', 'j32', 'j45', 'toux', 'tr...",toux,Bronchospasm / acute asthma exacerbation,{'Are you experiencing shortness of breath or ...,J98.0
11402,31,"[['Réaction dystonique aïgue', 0.1781816671868...",M,Réaction dystonique aïgue,"['drogues_stimul', 'dyspn', 'laryngospasme', '...",protu_langue,Acute dystonic reactions,{'Do you regularly take stimulant drugs?': 'Tr...,G24.9
30840,3,"[['Rhinosinusite chronique', 0.189267488945076...",F,Rhinosinusite chronique,"['douleurxx', 'douleurxx_carac_@_une_brûlure_o...",hyponos,Chronic rhinosinusitis,"{'Do you have pain somewhere, related to your ...",J32.9
14889,46,"[['VIH (Primo-infection)', 0.48414955175409613...",F,VIH (Primo-infection),"['atcd_its', 'diaph', 'diarrhee', 'douleurxx',...",diaph,HIV (initial infection),{'Have you ever had a sexually transmitted inf...,B20
131903,72,"[['Attaque de panique', 0.09172200566348035], ...",F,Attaque de panique,"['anxiete_s', 'atcdpsyfam', 'diaph', 'douleurx...",palpit,Panic attack,"{'Do you feel anxious?': 'True', 'Do any membe...",F41.0
...,...,...,...,...,...,...,...,...,...
113311,0,"[['VIH (Primo-infection)', 0.3030252901252109]...",M,VIH (Primo-infection),"['atcd_its', 'diarrhee', 'douleurxx', 'douleur...",fatig_ext,HIV (initial infection),{'Have you ever had a sexually transmitted inf...,B20
46200,35,"[['Possible NSTEMI / STEMI', 0.170304300298058...",M,Possible NSTEMI / STEMI,"['HIV', 'diaph', 'douleurxx', 'douleurxx_carac...",fatig_mod,Possible NSTEMI / STEMI,{'Are you infected with the human immunodefici...,I21.4
54048,71,"[['Pharyngite virale', 0.5232406274878201], ['...",F,Pharyngite virale,"['crach_sg', 'dayc', 'douleurxx', 'douleurxx_c...",crach_sg,Viral pharyngitis,"{'Have you been coughing up blood?': 'True', '...",J02.9
27233,61,"[['Possible NSTEMI / STEMI', 0.479718656849398...",M,Sarcoïdose,"['convulsion', 'douleurxx', 'douleurxx_carac_@...",dyspn,Sarcoidosis,{'Have you lost consciousness associated with ...,D86.9


In [92]:
sample_test_df['English Evidences'][109830]

{'Are you experiencing shortness of breath or difficulty breathing in a significant way?': 'True',
 'Do you have any family members who have asthma?': 'True',
 'Have you been diagnosed with chronic sinusitis?': 'True',
 'Do you have asthma or have you ever had to use a bronchodilator in the past?': 'True',
 'Do you have a cough?': 'True',
 'Have you traveled out of the country in the last 4 weeks?': 'No',
 'Do you live in in a big city?': 'True',
 'Have you noticed a wheezing sound when you exhale?': 'True'}

In [93]:
sample_test_df.to_csv('./data/Project_Test_Data.csv', encoding='utf-8')

# Sample Training Data for Few-Shot Learning (one sample for each gender of each pathology)

In [94]:
train_samples_df = train_df.groupby(["SEX", "English Pathology"], group_keys=False).apply(lambda x: x.sample(n=1, random_state=1))
train_samples_df

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences,ICD-10
890172,67,[['Exacerbation aigue de MPOC et/ou surinfecti...,F,Exacerbation aigue de MPOC et/ou surinfection ...,"['bode', 'dyspn', 'expecto', 'f17.210', 'j44_j...",wheez,Acute COPD exacerbation / infection,{'Do you have severe Chronic Obstructive Pulmo...,J44.1
1013918,33,"[['Réaction dystonique aïgue', 0.3003962474292...",F,Réaction dystonique aïgue,"['antipsy_récent', 'drogues_stimul', 'dyspn', ...",spasme_trapeze,Acute dystonic reactions,{'Have you started or taken any antipsychotic ...,G24.9
713643,53,"[['Laryngite aigue', 0.23688497116098903], ['B...",F,Laryngite aigue,"['crowd', 'douleurxx', 'douleurxx_carac_@_une_...",volume_parole,Acute laryngitis,"{'Do you live with 4 or more people?': 'True',...",J04.0
629267,75,"[['Bronchite', 0.5744711710103535], ['Otite mo...",F,Otite moyenne aigue (OMA),"['H6690', 'douleurxx', 'douleurxx_carac_@_sens...",douleurxx,Acute otitis media,{'Are you currently being treated or have you ...,H66.9
441568,95,"[['OAP/Surcharge pulmonaire', 0.07955369034832...",F,OAP/Surcharge pulmonaire,"['J81', 'Z99.2', 'douleurxx', 'douleurxx_carac...",dyspn_noct,Acute pulmonary edema,{'Have you ever had fluid in your lungs?': 'Tr...,J81.0
...,...,...,...,...,...,...,...,...,...
925686,32,"[['Tuberculose', 0.13622675902987222], ['Bronc...",M,Tuberculose,"['HIV', 'crach_sg', 'drogues_IV', 'dyspn', 'f1...",dyspn,Tuberculosis,{'Are you infected with the human immunodefici...,A15.9
52609,61,"[['IVRS ou virémie', 0.1947788896958362], ['Po...",M,IVRS ou virémie,"['crowd', 'dayc', 'diaph', 'douleurxx', 'doule...",msk_dlr,URTI,"{'Do you live with 4 or more people?': 'True',...",J06.9
684587,43,"[['Possible NSTEMI / STEMI', 0.095954308798863...",M,Angine instable,"['angor_accelere', 'diaph', 'douleurxx', 'doul...",diaph,Unstable angina,{'Do you find that your symptoms have worsened...,I20.0
132645,19,"[['Pharyngite virale', 0.2603485862680125], ['...",M,Pharyngite virale,"['contact', 'crach_sg', 'crowd', 'douleurxx', ...",rhino_clair,Viral pharyngitis,{'Have you been in contact with a person with ...,J02.9


In [95]:
# sample test
train_samples_df[train_samples_df['PATHOLOGY']=='Coqueluche']

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE,English Pathology,English Evidences,ICD-10
24112,71,"[['Coqueluche', 0.5125164486996661], ['Bronchi...",F,Coqueluche,"['cont_coq', 'e66', 'j45', 'posttus_emesis', '...",posttus_emesis,Whooping cough,{'Have you been in contact with someone who ha...,A37.9
109079,19,"[['Coqueluche', 0.5363081514145991], ['Bronchi...",M,Coqueluche,"['cont_coq', 'insp_siffla', 'j45', 'posttus_em...",insp_siffla,Whooping cough,{'Have you been in contact with someone who ha...,A37.9


In [96]:
train_samples_df.to_csv('./data/Project_Train_Data_FewShot_Learning.csv', encoding='utf-8')