# VIRTUAL PATIENT GENERATOR

**INPUTS**:
* *data/02_CIE_tokenized_info.xlsx*

**OUTPUTS**:
* *data/03_virtual_patients_db.xlsx*
* *data/03_virtual_patients_cie_692,76.xlsx*

**NOTES**

### Imports

In [1]:
import yaml
import random
import pandas as pd

## CONFIGURATION

### Set config from yml file

In [2]:
# Read config.yml and publish every key of its "patient" and "other" sections
# as a module-level global. A key present in both sections ends up with the
# "other" value, because that update runs last.
# NOTE(review): the functions in this file read bare names such as
# probability_risk, min_age or total_cases that are not assigned anywhere else
# in view -- presumably they are created here; confirm against config.yml.
# NOTE(review): yaml.FullLoader can construct more than plain scalars, lists
# and dicts; yaml.safe_load is the safer choice if config.yml only holds
# plain values.
with open("config.yml", "r") as ymlfile:
    variables = yaml.load(ymlfile, Loader=yaml.FullLoader)
    globals().update(variables["patient"])
    globals().update(variables["other"])

### Get data

In [3]:
# Load the tokenized CIE table and keep only the three columns used here.
cie_data = pd.read_excel("data/02_CIE_tokenized_info.xlsx")[["CIE","TOKENIZED","RISKWORDS"]]

# Codes are compared against strings further down, so store them as str.
# This runs before fillna, so a missing code becomes the text 'nan', not '[]'.
cie_data['CIE'] = cie_data['CIE'].apply(str)
# Empty cells become the text '[]' so that the eval below yields an empty list.
cie_data.fillna('[]', inplace=True)

# Each cell is evaluated as a Python expression; presumably the workbook
# stores list literals as text -- confirm against the step that writes it.
# NOTE(review): eval runs whatever expression the spreadsheet contains;
# ast.literal_eval would be safer if the workbook is not fully trusted.
cie_data['TOKENIZED'] = cie_data['TOKENIZED'].apply(eval)
cie_data['RISKWORDS'] = cie_data['RISKWORDS'].apply(eval)

def delete_risk_tokenized(df):
    """Return the tokens of one row that are not among its risk words.

    ``df`` is a single row (anything indexable by 'TOKENIZED' and
    'RISKWORDS'). Order and duplicates of the remaining tokens are kept.
    """
    non_risk = []
    for token in df['TOKENIZED']:
        if token in df['RISKWORDS']:
            continue
        non_risk.append(token)
    return non_risk

# Row-wise apply (axis=1): store each row's tokens minus its risk words.
cie_data['NONRISKWORDS'] = cie_data.apply(delete_risk_tokenized, axis = 1)

## FUNCTIONS

In [4]:
def create_virtual_patient(cie):
    """Build one synthetic patient record for the diagnosis code ``cie``.

    Returns a dict with the keys CIE, AGE, SEX, RISK and TEXT. RISK is 1 with
    probability ``probability_risk`` and 0 otherwise; AGE is a random integer
    in [min_age, max_age]; SEX and TEXT come from ``get_sex`` and
    ``get_new_text``.
    """
    # The risk flag is drawn first because the text generation depends on it.
    risk_flag = random.choices([1, 0], weights=[probability_risk, 1 - probability_risk], k=1)[0]

    # Fill the record field by field, in the order the keys are emitted.
    patient = {"CIE": cie}
    patient["AGE"] = random.randint(min_age, max_age)
    patient["SEX"] = get_sex(cie)
    patient["RISK"] = risk_flag
    patient["TEXT"] = get_new_text(cie, risk_flag)
    return patient

def get_sex(cie):
    """Pick the sex code for a virtual patient with diagnosis code ``cie``.

    ``'M'`` is returned for codes listed in ``just_women`` and ``'H'`` for
    codes listed in ``just_men``. Any other code gets a random draw in which
    ``'H'`` (the code used for men) is chosen with probability
    ``probability_man`` and ``'M'`` otherwise.

    ``just_women``, ``just_men`` and ``probability_man`` are module-level
    names that are not assigned in this function (presumably loaded from
    config.yml -- confirm).
    """
    if cie in just_women:
        return 'M'
    if cie in just_men:
        return 'H'
    # The weights are positional: probability_man must sit next to 'H', the
    # code returned for men-only diagnoses, not next to 'M'.
    return random.choices(['H', 'M'], weights=[probability_man, 1 - probability_man], k=1)[0]
        

# cie: str; risk: int (0 or 1)
def get_new_text(cie, risk):
    """Generate a random, shuffled text for the diagnosis code ``cie``.

    The text length is a random integer in [min_length, max_length]. When
    ``risk`` is 1 the text contains between one and half the length (rounded
    down, but never fewer than one) words drawn from the code's RISKWORDS;
    when ``risk`` is 0 it contains none. The remaining positions are filled
    from the code's NONRISKWORDS. Words are drawn with replacement, so
    repetitions are possible.

    Returns:
        The words joined by single spaces.

    Raises:
        IndexError: if ``cie`` has no row in ``cie_data``, or if words have
            to be drawn from an empty word list.
    """
    # Filter the table once and read both word lists from the same row.
    row = cie_data.loc[cie_data['CIE'] == cie].iloc[0]
    ordinary_words = row['NONRISKWORDS']
    risk_words = row['RISKWORDS']

    text_length = random.randint(min_length, max_length)
    # For lengths below 2 half the length is 0, which would make the randint
    # range empty for risk == 1; keep the upper bound at least equal to the
    # lower bound so a risk text still gets one risk word.
    max_risk_words = max(risk, (text_length // 2) * risk)
    number_risk_words = random.randint(risk, max_risk_words)

    text = random.choices(risk_words, k=number_risk_words)
    text += random.choices(ordinary_words, k=max(0, text_length - number_risk_words))

    random.shuffle(text)
    return " ".join(map(str, text))

def create_patients(cie): # Creates total_cases of one cie
    """Return a DataFrame with ``total_cases`` virtual patients for ``cie``.

    Each row is one record produced by ``create_virtual_patient``.
    """
    records = [create_virtual_patient(cie) for _ in range(total_cases)]
    return pd.DataFrame(records)

def create_df_patients(list_of_cies): # Creates total_cases of different cies and shuffles
    """Return one shuffled DataFrame with the patients of every given CIE.

    ``create_patients`` is called once per code in ``list_of_cies``; the
    resulting frames are stacked and the rows are returned in random order
    (``sample(frac=1)``), keeping the index labels assigned before shuffling.
    An empty ``list_of_cies`` yields an empty DataFrame.
    """
    frames = [create_patients(cie) for cie in list_of_cies]
    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True).sample(frac=1)

def get_cies_with_risk_words(): # Get list of CIEs that have list of tokenized risk words
    """Return the CIE codes whose RISKWORDS entry does not have length 0."""
    has_risk_words = cie_data["RISKWORDS"].str.len() != 0
    return cie_data.loc[has_risk_words, "CIE"].tolist()

## CREATE VIRTUAL PATIENTS DATABASE

In [5]:
# Generate patients for every CIE that has risk words and write the shuffled
# result to the database workbook (index column omitted).
create_df_patients(get_cies_with_risk_words()).to_excel("./data/03_virtual_patients_db.xlsx", index=False)

In [7]:
# Generate patients for the single code '692.76' and write them to their own
# workbook (the file name spells the code with a comma).
create_patients('692.76').to_excel("./data/03_virtual_patients_cie_692,76.xlsx", index=False)