## Scan given patient for symptoms/tests/medications
This notebook speeds up the manual inspection of patient files when extracting the features stored in the file 'infos_dossier_gyneco'.
These features were created manually in order to compare NLP performance with ML performance using this information (which the NLP is supposed to extract).

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import re

import nltk
nltk.download('stopwords')
from spellchecker import SpellChecker

from preprocess_NLP import load_dict_correction, tokenize_words
from preprocess_NLP import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nwolpert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nwolpert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load only the patient identifier and the free-text summary, then run the
# summary through the preprocessing helpers imported from preprocess_NLP,
# in the same order as before.
df_nlp = pd.read_csv(
    './../../Data/Generate/Donnees_avec_endo_concat.csv',
    usecols=['Anonymisation', 'Résumé'],
)
for preprocess_step in (remove_special_characters, lowercase_text, correction_series):
    df_nlp['Résumé'] = df_nlp['Résumé'].apply(preprocess_step)
df_nlp.head()

Unnamed: 0,Anonymisation,Résumé
0,AA-071,depuis chirurgie cycle suivant idem avec dou...
1,AB-067,1 en cas d hystéroscopie avant la ménopause ...
2,AC-131,externe balland mélina antécedent med a...
3,AE-060,appel téléphonique patiente moins douloureuse...
4,AE-062,rectosonographie échographie pelvienne avec c...


In [3]:
### Helper functions to print out relevant bits from the text for each feature/column

from fuzzywuzzy import fuzz    # library that helps finding matching words that might be misspelled or so
import re

def extract_context(text, keywords, nwords_around=10, threshold=80):
    """Return a window of words around every word that fuzzily matches a keyword.

    The text is split into words with the regex ``\\b\\w+\\b``. Each word is
    lowercased and compared with every keyword using ``fuzz.ratio``; a word is
    a hit as soon as one keyword scores at least ``threshold``.

    Parameters
    ----------
    text : str
        Text to scan.
    keywords : iterable of str
        Keywords to look for. They are lowercased before comparison, so the
        match is case-insensitive on both sides.
    nwords_around : int, default 10
        Number of words kept before and after the matched word.
    threshold : int, default 80
        Minimum ``fuzz.ratio`` score for a word to count as a match.

    Returns
    -------
    list of str
        One space-joined context string per matching word, in order of
        appearance. Windows of neighbouring hits may overlap.
    """
    words = re.findall(r'\b\w+\b', text)

    # Words are lowercased before comparison, so keywords must be as well:
    # otherwise a keyword containing capitals is penalised (or can never
    # match). Done once here instead of once per word.
    keywords_lower = [keyword.lower() for keyword in keywords]

    results = []
    for i, word in enumerate(words):
        word_lower = word.lower()
        # any() stops at the first keyword that matches, so a word yields
        # at most one context even if several keywords match it.
        if any(fuzz.ratio(word_lower, keyword) >= threshold for keyword in keywords_lower):
            start_index = max(i - nwords_around, 0)  # clamp at the start of the text
            end_index = i + nwords_around + 1        # slicing clamps at the end
            results.append(' '.join(words[start_index:end_index]))

    return results


def extract_context_prefix(text, prefix, nwords_around=5):
    """Return a window of words around every word that starts with ``prefix``.

    The text is split into words with the regex ``\\b\\w+\\b`` and the
    comparison is case-insensitive on both sides.

    Parameters
    ----------
    text : str
        Text to scan.
    prefix : str or iterable of str
        Prefix (for example ``'dys'``) or several alternative prefixes.
        There is no default value; the caller must supply it.
    nwords_around : int, default 5
        Number of words kept before and after the matched word.

    Returns
    -------
    list of str
        One space-joined context string per matching word, in order of
        appearance.
    """
    words = re.findall(r'\b\w+\b', text)

    # Words are lowercased before the comparison, so the prefix has to be
    # lowercased too; otherwise a prefix containing capitals never matches.
    # str.startswith accepts a str or a tuple of str, so both stay supported.
    if isinstance(prefix, str):
        prefix_lower = prefix.lower()
    else:
        prefix_lower = tuple(candidate.lower() for candidate in prefix)

    results = []
    for i, word in enumerate(words):
        if word.lower().startswith(prefix_lower):
            start_index = max(i - nwords_around, 0)  # clamp at the start of the text
            end_index = i + nwords_around + 1        # slicing clamps at the end
            results.append(' '.join(words[start_index:end_index]))

    return results

import re


def extract_context_abbreviation(text, abbreviation, nwords_around=3):
    """Collect the surrounding words of every exact occurrence of an abbreviation.

    The text is tokenised with the regex ``\\b\\w+\\b``; a token is a hit when
    it equals ``abbreviation`` once both are lowercased. For each hit, up to
    ``nwords_around`` tokens on either side are joined with spaces.

    Returns a list with one context string per hit, in order of appearance.
    """
    tokens = re.findall(r'\b\w+\b', text)
    return [
        ' '.join(tokens[max(position - nwords_around, 0):position + nwords_around + 1])
        for position, token in enumerate(tokens)
        if token.lower() == abbreviation.lower()
    ]

In [None]:
patient = 'AC-131'
patient_summaries = df_nlp.loc[df_nlp.Anonymisation == patient, 'Résumé']
if patient_summaries.empty:
    # Fail with an explicit message instead of an opaque IndexError on .values[0]
    raise ValueError(f'No summary found in df_nlp for patient {patient!r}')
text_patient = patient_summaries.values[0]

### Scan the text and print out any piece of text that could be relevant for the symptom in question


def _run_scan(text, scan):
    """Run one scan description (a dict of keyword arguments) against the text.

    A scan with an 'abbreviation' key goes to extract_context_abbreviation
    (exact word match); any other scan goes to extract_context (fuzzy match).
    """
    if 'abbreviation' in scan:
        return extract_context_abbreviation(text, **scan)
    return extract_context(text, **scan)


def _print_section(text, title, scans):
    """Print the section header, then for each scan either '/' or its phrases.

    Each scan is executed exactly once and the same result is used both to
    decide between '/' and the phrases and to print them, so the two can no
    longer get out of sync.
    """
    print(f'############ {title} ############')
    for scan in scans:
        phrases = _run_scan(text, scan)
        if not phrases:
            print('/')
            continue
        for phrase in phrases:
            print(phrase)
        print()


# One entry per feature/column: (header title, list of scans).
# A scan is the dict of keyword arguments passed to the extraction helper;
# nwords_around is only given where it differs from the helper's default.
SCAN_SECTIONS = [
    ('Douleurs:', [dict(keywords=["douleur", "douloureux"])]),
    ('Dysménorrhées:', [dict(keywords=["dysménorrhées", "dysme", "menhorr", "menorr", "règles"])]),
    ('Douleur exonération de selles:', [dict(keywords=["exoneration", "selles", "défécation", "defec", "défec"])]),
    # "tv" used to be checked but not printed; it is now part of the single scan
    ('Toucher vaginal:', [dict(keywords=["vaginal", "toucher", "tv"])]),
    ('douleurs mictionnelles/ dysurie:', [dict(keywords=["miction", "dysurie"])]),
    ('rectorragie:', [dict(keywords=["rectorragie", "rectorr"])]),
    ('Douleurs rapports intimes / dyspareunies:',
     [dict(keywords=["rapport", "sexe", "sexuel", "intime", "dyspareunies", "dyspareunie", "coital", "coitales"], nwords_around=20)]),
    ('Spotting:', [dict(keywords=["spotting", "spottings"])]),
    ('Amenorrhées:', [dict(keywords=["amenorrhées"])]),
    ('Ménorragies:', [dict(keywords=["ménorragies", "ménorragie", "ménorr", "menorr"])]),
    ('Métrorragies:', [dict(keywords=["métrorragies"])]),
    ('Hyperménorrhées:', [dict(keywords=["hyperménorrhées", "hyperménorrhée", "hypermenorrhees", "hypermeno"])]),
    ('Mictions excessives:', [dict(keywords=["miction", "excessives"])]),
    # "cramp"/"crampes" used to be checked but not printed; now part of the single scan
    ('Spasmes abdominales:', [dict(keywords=["spasmes", "abdominales", "cramp", "crampes"])]),
    ('pollakiurie:', [dict(keywords=["pollakiurie", "pollakiuries", "uriner"])]),
    ('faux besoins:', [dict(keywords=["faux"])]),
    ('infection urinaire:', [dict(keywords=["infection", "urinaire"])]),
    ('perte appétit:', [dict(keywords=["appetit", "appétit", "mange"])]),
    ('nausées:', [dict(keywords=["nausées", "nausée"])]),
    ('vomissements:', [dict(keywords=["vomissements", "vomit", "vomir"])]),
    ('dyspnée:', [dict(keywords=["dyspnée"])]),
    ('constipation / troubles transit :', [dict(keywords=["constipation", "constipé", "transit"])]),
    ('diarrhées / diarrhées cataméniales :', [dict(keywords=["diarrhées", "cataméniales"])]),
    ('incontinence :', [dict(keywords=["incontinence", "incontinent"])]),
    ('hyperthermie :', [dict(keywords=["hyperthermie", "therm"])]),
    ('vessie spastique :', [dict(keywords=["vessie", "spastique"])]),
    ('Désir/essai de grossesse :', [dict(keywords=["désir", "essai", "grossesse", "enceinte"])]),
    ('infertilité :', [dict(keywords=["infertilité", "infertile", "infert"])]),
    # Three independent scans under one header; each prints its own '/' or phrases
    ('PMA / FIV :', [
        dict(abbreviation="PMA", nwords_around=20),
        dict(abbreviation="FIV", nwords_around=20),
        dict(keywords=["fécondation", "vitro"]),
    ]),
    # Double-quoted so the apostrophe is kept ('d''endo' silently dropped it)
    ("Antécédent d'endométriose :",
     [dict(keywords=["endométriose", "endo", "antécédent", "ant", "atcd"], nwords_around=20)]),
    ("Antécédent de chirurgie d'endométriose :",
     [dict(keywords=["chirurgie", "chirurgies", "chir", "opération", "opéré", "ablation", "Coelioscopie", "coelio", "coelio",
                     "adhesiolyse", "adhes", "hysteroscopie", "exérèse", "ligament", "lus", "gauche", "droite", "g",
                     "endométriose", "endo"], nwords_around=20)]),
    ('Nodule :', [dict(keywords=["nodule"])]),
    ('Epaississement :', [dict(keywords=["épaississement", "épais"])]),
    ('Chirurgie bariatrique :', [dict(keywords=["bariatrique"], nwords_around=20)]),
    ('résection :', [dict(keywords=["résection"])]),
    ('anapath :', [dict(keywords=["anapath"])]),
    ('frottis cervico vaginal :', [dict(keywords=["frottis"], nwords_around=20)]),
    ('échographie :', [dict(keywords=["échographie", "écho", "echo"], nwords_around=40)]),
    ('IRM :', [dict(abbreviation="IRM", nwords_around=20)]),
    ('rectosonographie :', [dict(keywords=["rectosonographie", "recto", "sonogr", "rsg"], nwords_around=30)]),
    ('test hpv:', [dict(keywords=["hpv"])]),
    ('absentéisme scolaire/professionnel:',
     [dict(keywords=["absentéisme", "absentisme", "scolaire", "professionnel", "école", "travail"])]),
]

### Keywords used to print out all the medications appearing:
MEDICATION_KEYWORDS = [
    "antalgique", "antiinflammatoire", "inflammatoire", "anti", "antispasmodique", "spasmodique", "hormonal", "hormon", "antadys", "dihydrocodéine",
    # level-1 analgesics ("antalgiques palier 1")
    "paracétamol", "ibuprofène", "dafalgan", "efferalgan", "diclofénac", "kétoprofène", "celecoxib", "naproxène", "indométacine", "piroxicam", "flurbiprofène", "voltarène", "profenid", "celebrex", "advil", "apranax", "indocid", "feldène", "cebutid", "néfopam", "paralyoc",
    # level-2 analgesics ("antalgiques palier 2")
    "codéine", "codoliprane", "algisedal", "antalvic", "dicodin", "tramadol", "topalgic", "contramal", "zamudol", "ixprim", "zaldiar", "contramal", "topalgic",
    # level-3 analgesics ("antalgiques palier 3")
    "actiskenan", "sévrédol", "skénan", "moscontin", "kapanol", "sophidone", "oxycodone", "oxynorm", "oxycontin", "morphine", "hydromorphone", "oxycodone", "tapentadol", "buprénorphine", "fentanyl", "méthadone",
    "lamaline", "izalgi", "ains", "biprofenid", "ponstyl", "nifluril", "acupan", "skénan", "utrogestan", "surgestone", "norlevo", "naproxène", "progestérone", "progestatif", "oestro",
    "toviaz", "spasfon", "vesicare", "meteospasmyl", "optilova", "optidril", "minidril", "mirena", "izéane", "optimizette", "décapeptyl", "duphaston", "estréva", "visanne", "lutenyl", "lutéran", "ditropan", "microval", "misolfa",
    "cp", "prendre",
]

print(f'##################### Patient: {patient} #####################\n')

for section_title, section_scans in SCAN_SECTIONS:
    _print_section(text_patient, section_title, section_scans)
    print()

_print_section(text_patient, 'Medications:', [dict(keywords=MEDICATION_KEYWORDS)])
print()

# Second opinion on medications from a pretrained biomedical NER model
from gliner import GLiNER
model = GLiNER.from_pretrained("almanach/camembert-bio-gliner-v0.1")
entities = model.predict_entities(text_patient, labels=["médicaments"], threshold=0.5)
for entity in entities:
    print(entity['text'])

##################### Patient: AC-131 #####################

############ Douleurs: ############
le 09 09 c lioscopie endometriose depuis le 09 09 douleur abdominal basse persistante depuis la chirurgie à type de douleur
douleur abdominal basse persistante depuis la chirurgie à type de douleur de règle eva 4 10 exacerbée par la palpation partiellement
effort rapportée malaise le 15 09 depuis le 17 09 douleur hypogastrique en fin de miction pollakiurie a vue son mt
pour endométriose 09 09 2020 résection lus ept coelio ambulatoire douloureuse depuis la chirurgie 1sem douleurs au niveau des cicatrices douleurs
résection lus ept coelio ambulatoire douloureuse depuis la chirurgie 1sem douleurs au niveau des cicatrices douleurs abdominales lancinantes diffuses impression de
douloureuse depuis la chirurgie 1sem douleurs au niveau des cicatrices douleurs abdominales lancinantes diffuses impression de ballonnement abdominal douleurs non latéralisé
des cicatrices douleurs abdominales lancinantes

## Compare results of manual extraction with recueil data

In [52]:
# Reference data ("recueil"): drop the stray 'Unnamed: 90' column and align the
# patient identifier column name with the one used in the other tables.
recueil_imc = (
    pd.read_excel('./../../Data/Raw/Recueil (1).xlsx')
    .drop(columns='Unnamed: 90')
    .rename(columns={"Numéro anonymat": "Anonymisation"})
)

# Manually extracted features, one row per patient file
infos_dossier_gyneco = pd.read_excel('./../../Data/Generate/infos_dossiers_gyneco.xlsx')
infos_dossier_gyneco

Unnamed: 0,Anonymisation,douleurs_pelviennes,douleurs_abdomidales,douleurs lus droit,douleurs lus gauche,douleurs torus,douleurs_vésicales,douleurs sacro lombaires (dl ),douleurs_prémenstruelles,dysménorrhées,...,traitement_chlorhydrate,traitement_kétoprofène,traitement_novasure,réponse_traitement_mirena,traitement_pilule_progestative,traitement_trinordiol,traitement_amoxicilline,traitement_oflocet,traitement_eupantol,traitement_innohep
0,AA-071,1.0,,0.0,0,,,,,1.0,...,,,,,1.0,1.0,,,,
1,AB-067,,,,,,,,,0.0,...,,,,,,,,,,
2,AC-131,,1.0,,,,,0.0,,,...,,,,,,,,,,
3,AE-060,,,,,,1.0,,,,...,,,,,,,,,,
4,AE-062,,,1.0,,1,,,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,WM-089,1.0,1.0,,,,,,,,...,,,,,,,,,,
195,YC-171,0.0,,,,,,,,1.0,...,,,,,,,,,,
196,YL-124,1.0,,,,sensibilité,,,,1.0,...,,,,,,,,,,
197,6061322,1.0,,,,,,,,1.0,...,,,,,,,,,,


In [65]:
# Pair of columns to display side by side: [recueil column, manually extracted column].
# Other pairs used so far: ['sf.dpc', 'douleurs_pelviennes'], ['tv.douleur.lusg', "douleurs lus gauche"]
columns_to_compare = ['ttt.p', "traitement progestatif"]
df = pd.merge(infos_dossier_gyneco, recueil_imc, on=['Anonymisation'])

# Best-effort cast to the nullable integer dtype so that 0/1 flags with missing
# values display as integers. A column that cannot be cast (free text,
# non-integer floats) is left unchanged, but the reason is reported instead of
# being silently swallowed by a bare `except`.
for column in columns_to_compare:
    try:
        df[column] = pd.array(df[column], dtype=pd.Int64Dtype())
    except (TypeError, ValueError, OverflowError) as error:
        print(f'Column {column!r} left as {df[column].dtype}: cannot convert to Int64 ({error})')

# Show every row of the comparison
pd.set_option('display.max_rows', df.shape[0]+10)
df[['Anonymisation'] + columns_to_compare]

Unnamed: 0,Anonymisation,ttt.p,traitement progestatif
0,AA-071,1.0,0
1,AB-067,1.0,0
2,AC-131,1.0,0
3,AE-060,,1
4,AE-062,0.0,0
5,AM-023,0.0,0
6,AM-038,1.0,0
7,AM-164,0.0,0
8,AM-172,0.0,0
9,AS-121,1.0,1


In [64]:
# Display one manually extracted column for the patients that are also present
# in the recueil (the merge acts as an inner-join filter on 'Anonymisation').
single_column = 'test_hpv'
df_single = pd.merge(infos_dossier_gyneco[['Anonymisation'] + [single_column]], recueil_imc, on=['Anonymisation'])[['Anonymisation']+[single_column]]

# Best-effort cast to the nullable integer dtype; if the column cannot be cast
# it is left unchanged and the reason is reported instead of being silently
# swallowed by a bare `except`.
try:
    df_single[single_column] = pd.array(df_single[single_column], dtype=pd.Int64Dtype())
except (TypeError, ValueError, OverflowError) as error:
    print(f'Column {single_column!r} left as {df_single[single_column].dtype}: cannot convert to Int64 ({error})')
df_single

Unnamed: 0,Anonymisation,test_hpv
0,AA-071,1
1,AB-067,
2,AC-131,
3,AE-060,
4,AE-062,
5,AM-023,
6,AM-038,
7,AM-164,
8,AM-172,
9,AS-121,


In [8]:
# All column names of the current comparison frame, as a plain list
df.columns.tolist()

['Anonymisation',
 'douleurs_pelviennes',
 'douleurs_abdomidales',
 'douleurs lus droit',
 'douleurs lus gauche',
 'douleurs torus',
 'douleurs_vésicales',
 'douleurs sacro lombaires (dl )',
 'douleurs_prémenstruelles',
 'dysménorrhées',
 'dysménorrhées charactérisation',
 'douleurs exoneration selles / douleurs defecations',
 'douleurs toucher vaginal',
 'douleurs mictionnelles/ dysurie',
 'rectorragie',
 'dyspareunies / douleurs rapports intimes',
 'spotting',
 'amenorrhées',
 'ménorragies',
 'métrorragies',
 'hyperménorrhées',
 'miction_excessive',
 'spasmes_abdominales',
 'pollakiurie',
 'faux besoins',
 'infection_urinaire',
 'perte_appetit',
 'nausées',
 'vomissements',
 'dyspnée\xa0',
 'constipation / troubles transit ',
 'diarrhées / diarrhées cataméniales',
 'incontinence',
 'hyperthermie',
 'vessie_spastique',
 'désir/essai de grossesse',
 'infertilité',
 'infertilité charactérisation',
 'PMA / FIV',
 'fausses_couches',
 "antécédent d'endométriose",
 "antécédent de chirurgie 

In [9]:
# Show one recueil column next to the patient identifier, restricted to the
# patients that have a text summary in df_nlp.
column_of_interest = 'sf.dpc'
df = df_nlp.merge(recueil_imc, on=['Anonymisation'])
pd.set_option('display.max_rows', len(df) + 10)
df.loc[:, ['Anonymisation', column_of_interest]]

Unnamed: 0,Anonymisation,sf.dpc
0,AA-071,1.0
1,AB-067,0.0
2,AC-131,1.0
3,AE-060,0.0
4,AE-062,0.0
5,AM-023,0.0
6,AM-038,1.0
7,AM-164,1.0
8,AM-172,
9,AS-121,1.0
