In [11]:
%load_ext pycodestyle_magic

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [12]:
# %%pycodestyle  # --> raises "27:1: W391 blank line at end of file"
# while no obvious blank line after the last code line!
# Disabled because it inhibits effect of variables and print lines.
'''
it should be used in a different markdown that %load_est pycodestyle_magic
otherwise, it does not work!
I got the message "UsageError: Line magic function `%%pycodestyle` not found."
'''
# High level variables of the script for dev
__version__ = "1.1.0"
__date__ = "17-oct-2022"
__author__ = "L.COSTA (ATR)"

__debug_script__ = False
# if True, display more feedback to ease debugging

# Display rules:
# no tab / no symbol: High level title
# '>' Action step, tab length according to sub-level
# '-' Result, tab lentgth according to the sub-level
# 'CAUTION': abnormal/unexpected behaviour/result

print(10 * "-", "UDACITY - DATASICENCE PROJECT #01 / START", 10 * "-")
print("\tversion:", __version__)

if __debug_script__:
    print('INFO! Debug mode is active!')

---------- UDACITY - DATASICENCE PROJECT #01 / START ----------
	version: 1.1.0


In [13]:
# %%pycodestyle  # Disabled because it raises the error message
# "too many values to unpack (expected 3)" ...
# "do not subtract 1 for line for %%pycodestyle, inc pre py3.6 string"
# According to
# "https://stackoverflow.com/questions/61230004/
#  ho-to-fix-valueerror-too-many-values-to-unpack-expected-3"
#  that seems to not to be linked to a mistaken code.

# PRE-ANALYSIS OF THE RAW DATASET

# Before using the dataset for usual 'data science' processing,
# we gonna scan its content and raise data with the most missing data.
# It could be help for selecting most relevant data for furrther modeling.
# What are the most missing data provided by hospitals about Covid-19
#  patients ?

import pandas as pd


def get_input_filepath() -> str:
    '''
    returns the path of the selected csv data source file

    input: None
    ouput:
           string of the data file's path (separator: back-slash)
    '''
    # function: Return input data file's path
    directory = "C:\\Users\\to202835\\Documents"
    path = "exploitation\\formation\\db_covid19"
    filename = "donnees-hospitalieres-covid-19-dep-france.csv"

    return "\\".join((directory, path, filename))


def show_df_shape(df:object, description_name:str = ''):
    '''
    returns only a display of the shape and column's names of 
     a dataframe

    input: 
           df  : dataframe 
           description_name : additional descriptive name to add at display
    ouput: None
    '''
    # function: Display the shape and column's names of a dataframe

    if (df is not None) and __debug_script__:
        print('  -', description_name, 'df shape  :', df.shape)
        print('  -', description_name, 'df columns:', df.columns)


def read_csv_file(filepath:str) -> object:
    '''
    returns a dataframe of the data contained into the csv designated 
     by its file path

    input:
           filepath : Path of the csv data file that contains
            all raw data
    ouput:
           df_list  : List of all df read on csv files content,
            None otherwise
    '''
    # function: Read a csv data file and get its data into a pandas dataframe

    # output
    df = None

    print ("> Read data")
    if filepath is None:
        print ("  - No file selected")
    else:  # Read the file
        print ("  - Input file:", filepath)
        try:
            df = pd.read_csv(filepath, sep=';', encoding='latin-1', low_memory=False)
            # Read file with encoding latin-1 due to occurrence of non-utf8
            # Add a low_memory=False to avoid error
        except:
            print("  CAUTION: Unable to read the file")

         # is df empty ? case of no column, whatever rows index
        if (df is not None) and (df.shape[1] == 0):
            df = None  # we reset df to avoid further abnormal use
        show_df_shape(df)
    del filepath
    return df


# Gather data
filepath = get_input_filepath()  # get path of the csv data file
df = read_csv_file(filepath)     # read the data and get a pandas df

# No assessment, no cleaning; I actually want to know the most missing data

# Analyze
if df is not None:
    print("> Preliminary monitoring of raw data for raising missing data and so potential difficulty for hospitals to report some data.")
    print("  Is there any parameters more missing than others over the", len(df), "of data entries reported by hospitals ?")
    print("  - Ratio of missing data by categories of data:")
    df_num_missing_rate = 100 *  df.isna().sum() / len(df)
    # "df_init_copy.isna().sum()" can be replaced by "df_init_copy.isnull().sum()""
    #  we can have the same result with "df_init_copy_num_missing_mean = df_init_copy.isna().mean().round(4) * 100"
    print(df_num_missing_rate.sort_values(ascending=False).head(10))
    # Sort rates of missing values in the descending order to get interesting values first (for answering our question)
    # Conversely, we could have the opposite result,
    #  i.e. available data ratio with "df_init_copy_num_rate = 100*df_init_copy.count() / len(df_init_copy)""

# Conclusion
# We see that categories 'Nb_Quotidien_Retour_a_Domicile', 'Nb_Quotidien_Deces',
#  'Nb_Quotidien_Admis_Reanimation' and 'Nb_Quotidien_Admis_Hospitalisation'
#  have more than 66% of missing data; hostpitals can't or don't want fulfil these information.
# Inaddition, categories 'autres', 'SSR_USLD' and 'HospConv' reach a bit less than 40%.
# Maybe, I would have a better model using other remaining data which have less than 1% of missing data. 

# No modeling, no vizualization; I did not built a model at this stage; I'll do it after

> Read data
  - Input file: C:\Users\to202835\Documents\exploitation\formation\db_covid19\donnees-hospitalieres-covid-19-dep-france.csv
> Preliminary monitoring of raw data for raising missing data and so potential difficulty for hospitals to report some data.
  Is there any parameters more missing than others over the 280600 of data entries reported by hospitals ?
  - Ratio of missing data by categories of data:
Nb_Quotidien_Retour_a_Domicile        66.593728
Nb_Quotidien_Deces                    66.593728
Nb_Quotidien_Admis_Reanimation        66.593728
Nb_Quotidien_Admis_Hospitalisation    66.593728
autres                                39.239130
SSR_USLD                              39.239130
HospConv                              39.239130
geo_point_2d                           0.655738
Nom_departement                        0.655738
Nom_region                             0.655738
dtype: float64


In [14]:
# %%pycodestyle  # Disabled because it raises an error message
#  "too many values to unpack (expected 3)" linked to pycodestyle
# See above.

# Now that we identify on which categories we could built a model,
# go ahead on building this model, unfolding the whole process

# Running several times the process below allows definint the set
#  of categories that provide a good model

# GATHER


def get_model_input() -> dict:
    '''
    returns a dictionary of pre-defined response category and a list
     of variables categories

    input: None
    ouput:
           dictionary of strings with 'response' as target
            for modeling and 'variables' as input categories
    '''
    # function: Get inputs (model's category target & input cat.) for modeling

    cat_response = None   # name of the response category as model's target
    cat_variables = None  # list of categories selected as input for modeling

    cat_response = 'Total_Deces'  # My target for the modeling

    '''
    # File's variables that I can use for the project
    cat_variables = ['Code_du_Departement','Date',
                     'Nb_actuellement_hospitalises',
                     'Nb_actuellement_en_soins_intensifs',
                     'Total_retour_a_domicile', 'Total_Deces', 'Code_region',
                     'Code_ISO_3166_de_la_zone','Nom_region',
                     'Nom_departement','Sexe','geo_point_2d','HospConv',
                     'SSR_USLD', 'autres',
                     'Nb_Quotidien_Admis_Hospitalisation',
                     'Nb_Quotidien_Admis_Reanimation', 'Nb_Quotidien_Deces',
                     'Nb_Quotidien_Retour_a_Domicile']
    #   r2_scores_train: 0.9795083631226138
    #   r2_scores_test: 0.9788894211053454

    # Reduced set of variables considered for modeling

    cat_variables = ['Code_region', 'Code_du_Departement',
                     'Nb_actuellement_hospitalises',
                     'Nb_actuellement_en_soins_intensifs',
                     'Total_retour_a_domicile','Total_Deces','Sexe',
                     'HospConv', 'Nb_Quotidien_Admis_Hospitalisation',
                     'Nb_Quotidien_Admis_Reanimation', 'Nb_Quotidien_Deces',
                     'Nb_Quotidien_Retour_a_Domicile']
    #   r2_scores_train: 0.9761588302088899
    #   r2_scores_test: 0.9755838001304158

    cat_variables = ['Code_region', 'Code_du_Departement',
                     'Nb_actuellement_hospitalises',
                     'Nb_actuellement_en_soins_intensifs',
                     'Total_retour_a_domicile','Total_Deces', 'Sexe',
                     'HospConv']
    # out ,'Nb_Quotidien_Admis_Hospitalisation',
    #  'Nb_Quotidien_Admis_Reanimation','Nb_Quotidien_Deces',
    #  'Nb_Quotidien_Retour_a_Domicile'
    #   r2_scores_train: 0.9761588302088899
    #   r2_scores_test: 0.9755838001304158

    cat_variables = ['Code_region', 'Code_du_Departement',
                     'Total_retour_a_domicile', 'Total_Deces','Sexe',
                     'HospConv']
    # out ,'Nb_Quotidien_Admis_Hospitalisation',
    #  'Nb_Quotidien_Admis_Reanimation','Nb_Quotidien_Deces',
    #  'Nb_Quotidien_Retour_a_Domicile', 'Nb_actuellement_hospitalises',
    #  'Nb_actuellement_en_soins_intensifs'
    #   r2_scores_train: 0.9739281412849445
    #   r2_scores_test: 0.9734957171394152

    cat_variables = ['Code_region', 'Code_du_Departement',
                     'Total_retour_a_domicile', 'Total_Deces','Sexe']
    # out ,'Nb_Quotidien_Admis_Hospitalisation',
    #  'Nb_Quotidien_Admis_Reanimation','Nb_Quotidien_Deces',
    #  'Nb_Quotidien_Retour_a_Domicile', 'Nb_actuellement_hospitalises',
    #  'Nb_actuellement_en_soins_intensifs','HospConv'
    #   r2_scores_train: 0.9739281412849445
    #   r2_scores_test: 0.9734957171394152

    cat_variables = ['Code_region', 'Code_du_Departement',
                     'Nb_actuellement_hospitalises',
                     'Nb_actuellement_en_soins_intensifs',
                     'Total_retour_a_domicile','Total_Deces','Sexe']
    # out ,'Nb_Quotidien_Admis_Hospitalisation',
    #  'Nb_Quotidien_Admis_Reanimation','Nb_Quotidien_Deces',
    #  'Nb_Quotidien_Retour_a_Domicile','Nb_actuellement_hospitalises',
    #  'Nb_actuellement_en_soins_intensifs','HospConv'
    #   r2_scores_train: 0.9761588302088899
    #   r2_scores_test: 0.9755838001304158
    # We keep this settings

    cat_variables = ['Nb_actuellement_hospitalises',
                     'Nb_actuellement_en_soins_intensifs',
                     'Total_retour_a_domicile', 'Total_Deces','Sexe']
    # out ,'Nb_Quotidien_Admis_Hospitalisation',
    #  'Nb_Quotidien_Admis_Reanimation','Nb_Quotidien_Deces',
    #  'Nb_Quotidien_Retour_a_Domicile', 'Nb_actuellement_hospitalises',
    #  'Nb_actuellement_en_soins_intensifs','HospConv',
    #  'Code_region', 'Code_du_Departement'
    #   r2_scores_train: 0.9523048607117208
    #   r2_scores_test: 0.9511261411557373
    '''

    cat_variables = ['Code_du_Departement', 'Nb_actuellement_hospitalises',
                     'Nb_actuellement_en_soins_intensifs',
                     'Total_retour_a_domicile', 'Total_Deces', 'Sexe']
    # out ,'Nb_Quotidien_Admis_Hospitalisation',
    # 'Nb_Quotidien_Admis_Reanimation','Nb_Quotidien_Deces',
    # 'Nb_Quotidien_Retour_a_Domicile','Nb_actuellement_hospitalises',
    # 'Nb_actuellement_en_soins_intensifs','HospConv', 'Code_region'
    #   r2_scores_train: 0.9761588302089482
    #   r2_scores_test: 0.9755838001373758
    # I keep them, optimizing quantity of variables to establish the model

    return {'response': cat_response, 'variables': cat_variables}


def get_df_selection(df:object, categories_selection:dict):
    '''
    returns a dataframe only with selected categories and a dictionary
     of the response and input categories, updated of what is actually
     available within the initial dataframe.
    
     input:
            df : dataframe with all categories
            categories_selection: dict. of response & inputs cat.
             to keep in df
    ouput:
            df : dataframe only with categories selection
            categories_selection : Update (if needed) of the input dict.
    '''
    # function: get dataframe with selection of categories

    # sub-inputs variables:
    cat_variables = categories_selection['variables']
    cat_response = categories_selection['response']
    avail_variables, cat_variable, df_selection = None, None, None
    df_complete = True

    print ("> Check availability of selected categories in the dataframe.")
    if df is not None:
        avail_variables = []  # categories actualy present into the dataframe
                              #  and considered for modeling
        # If no category is selected, define a default sel.
        if cat_variables is None:
            print('  CAUTION: No useful variables pre-defined')            
            # default selection = all cat. available in file
            cat_variables = df.columns
            if __debug_script__:
                print("variables: ", cat_variables)
            # Update dict. with new input
            categories_selection['variables'] = cat_variables
            avail_variables = df.columns
            print('  - All', len(cat_variables), 'categories found & used in data file')
        else:
            # If categories of variables are selected
            #  check all useful categories are available in the raw data
            for cat_variable in cat_variables:
                if cat_variable in df.columns:
                    avail_variables.append(cat_variable)

        if cat_response not in avail_variables:
            print("  CAUTION: The model's response category was not found in dataframe")
            df_complete = False         

        if len(cat_variables) != len(avail_variables):
            # if we do not have the expected nb of categories

            print('  CAUTION: Unable to get all usefull variables')
            print('  - Found categories:', avail_variables)
            df_complete = False      
        
        if df_complete:
            # Get only useful declared categories
            df_selection = df[cat_variables] # Keep only useful categories in a df
            # df = df_use.copy(deep=True) # copy result into df
            if __debug_script__:
                print('  All', len(cat_variables),'selected categories available.')

        show_df_shape(df_selection, 'Selection')

    del cat_variables, cat_response, avail_variables, cat_variable, df_complete, df
    return df_selection, categories_selection


# Define a selection of categories to use for modeling
categories_selection = get_model_input()

# get dataframe of the selected categories 
df, categories_selection = get_df_selection(df, categories_selection)

> Check availability of selected categories in the dataframe.


In [15]:
# %%pycodestyle  # Disabled because it raises an error message
#  "too many values to unpack (expected 3)" linked to pycodestyle
# See above.

# MANAGE DUMMY DATA

# Now that we get the selected raw dataset, we gonna make some assessment
# on this dataset


def manage_dummy_df(df: object, categories_selection:dict) -> object:
    '''
    returns  the initial dataframe completed with all appropriate 
     new categories of dummy categories

    input:
           df : pandas dataframe with categorical var. that
            i'd like to dummy
    output:
           df : dataframe with non categorical & categorical
            data like:
                1. contains all columns that were not specified
                 as categorical
                2. removes all the original columns in cat_df
                3. dummy columns for each of the categorical
                 columns in cat_df
                4. if dummy_na is True, it contains dummy columns
                 for NaN values
                5. Use a prefix of column's name with underscore
                 (_) as separator 
    '''
    # purpose : Dummy categorical variables within the dataframe
    # Actually, it concerns gender category and France's department 
    #  category; indeed, integer code number used for designated these
    #  department are not recognized as figures.
        
    # sub-variables
    cat_df, df_col, df_samp, df_samp_col = None, None, None, None
    nb_add_var, drop_var, add_var = None, None, None
    init_df_size, delta = None, None
    response = categories_selection['response']    

    if df is not None:

        print ("> Dummy cat. data")

        if __debug_script__ and (df is not None):
            init_df_size = df.shape
            show_df_shape(df, 'Before dummy')

        try:
            if __debug_script__:                
                types_occur = df.dtypes.value_counts()
                print("  - types_occur:\n", types_occur)
            # Return a subset of categorical df's columns,
            #  in addition of non-categorical columns
            cat_df = df.select_dtypes(include=['object'])
           
            # Identify cat / non-cat columns
            list_df = df.columns
            list_df_cat = cat_df.columns
            list_df_non_cat = []
            for var in list_df:
                if var not in list_df_cat:
                    list_df_non_cat.append(var)
            if __debug_script__:
                print("  - list cat    :", list_df_cat)
                print("  - list non cat:", list_df_non_cat)
            del list_df, list_df_cat, list_df_non_cat
        except:
            print("  CAUTION: Unable to find categorical var in df") 
            cat_df = pd.DataFrame([])
        
        if __debug_script__:
            print('  - Nb of categorical data detected:', cat_df.shape[1])

        # is df empty ? case of no column, whatever rows index
        if cat_df.shape[1] == 0: 
            print("  CAUTION: No categorical data found in raw data") 
        else:            
            for var in cat_df:  # Run along the categorical data columns
                if var != response: # We exclude the response var.
                    if __debug_script__:
                        print("  - var '", var, "'")
                    try:
                        df = pd.concat([df.drop(var, axis=1),
                                        pd.get_dummies(df[var],
                                        prefix=var, prefix_sep='_',
                                        drop_first=False)], axis=1)
                        # Initially, I've limited the number of new dummy variables
                        #  to create into df to 10 but it was not enough to get
                        #  a memory error at this step. Finally, I reduce the
                        #  file's content to only 3 years instead of a century.
                    except:
                        print("    CAUTION: Unable to concat cat. var. '", var, "'")
                        continue

        show_df_shape(df, 'After dummy')
        
    del cat_df, df_samp, df_col, nb_add_var, df_samp_col,\
        drop_var, add_var, init_df_size, delta
    return df

# Replace cat column by related dummy columns
df = manage_dummy_df(df, categories_selection)

> Dummy cat. data


In [16]:
# %%pycodestyle  # Disabled because it raises the error message
# "294:1: W391 blank line at end of file" while no

import numpy as np

# Now that we process with dummy data
# we can remove useless rows and columns

# CLEAN DATA


def remove_empty_row_col(df: object) -> object:  # = df
    '''
    returns the initial dataframe without empty rows and columns

    input:
           df : pandas dataframe to filter
    ouput:
           df : initial dataframe after removal of empty rows and columns
    '''

    # sub-variables
    msg, df_rows_cleaned = None, None

    if df is not None:

        if __debug_script__:
            print("  > Remove empty rows and columns")
        # try:
        if df is not None:
            # Manage rows with none data
            # Drop rows with all missing values
            df_rows_cleaned = df.dropna(axis=0, how='all')
            # Manage columns with none data
            # Drop columns with all missing values
            df = df_rows_cleaned.dropna(axis=1, how='all')
            if __debug_script__:
                # Result of the clean ops
                msg = "- df's size after removing empty rows/columns:"
                print("    ", msg, df.shape)
        # except:
        #     print("  CAUTION: Unable to remove empty row/column")

    del msg, df_rows_cleaned
    return df


def remove_rows_from_sel_col(df: object, c_name: str) -> object:  # = df
    '''
    returns the initial dataframe without rows where the selected
     column has no value.

    input:
           df : pandas dataframe to filter
    ouput:
           df : initial dataframe after removal of empty rows and columns
    '''

    # sub-variables
    msg1, msg2 = None, None

    if df is not None:

        if __debug_script__:
            msg1 = '> Remove rows with missing value from the response column'
            print("  ", msg1)

        # try:
        if df is not None:
            # remove rows (axis=0 by default) with missing value
            #  in response column
            df = df.dropna(subset=[c_name])
            if __debug_script__:
                # Result of the clean ops
                msg1 = "- df's size after removing"
                msg2 = "empty rows in the sel. column:"
                print("    ", msg1, msg2, df.shape)
        # except:
        #     msg1 = "CAUTION: Unable to remove "
        #     msg2 = "missing values from the sel. column."
        #     print("  ", msg1, msg2)

    del c_name, msg1, msg2
    return df


def clean_infinite_data(df: object) -> object:  # = df
    '''
    returns the initial dataframe with no more infinite data: they have
     been either removed or replaced according to the case.

    input:
           df : pandas dataframe to clean
    output:
           df : df with infinite/invalid values replaced by nan
    '''

    # sub-variables
    var_values, isfinite_sts, values_clean, msg = None, None, None, None

    #  column with finite status for related column value

    if df is not None:

        if __debug_script__:
            print("    > Manage infinite data")

        # I run over all columns of the dataframe
        for var in df.columns:
            # content of a column for one given variable
            var_values = df[var]

            # Identify the type of value than can be infinite
            #  it excepts object type which is not concerned
            if var_values.dtypes != object:
                # Get the finite True/False status for every value
                #  of the column
                isfinite_sts = np.isfinite(var_values)
                # I apply a treatment when not all values are finite
                if not np.all(isfinite_sts):
                    if __debug_script__:
                        print("      - ", var, "contains infinite values")
                    # try:
                    # I have to distinguish type 'float64' that I met
                    #  on a previous dataset I worked with because
                    #  it has required a specific treatment,
                    #  full removal because no other solution worked
                    if var_values.dtypes == 'float64':
                        # Drop/Remove var from df
                        df = df.drop(columns=[var])
                        if __debug_script__:
                            print("        - float64 variable removed")

                    # otherwise, I can replace infinite value by Nan
                    else:
                        values_clean = np.where(not isfinite_sts,
                                                np.nan, var_values)
                        df[var] = pd.Series(values_clean)
                        if __debug_script__:
                            msg = '- Infinite val. replaced by Nan'
                            print("       ", msg)
                    # except:
                    #     msg = 'CAUTION: Unable to replace infinite values on'
                    #     print("      ", msg, var, ' (', a.dtypes, ')')
                    #     continue

        if __debug_script__:
            msg = "- df's size after removal of infinite values:"
            print("    ", msg, df.shape)

    del var_values, isfinite_sts, values_clean, msg
    return df


def manage_missing_num(df: object) -> object:  # = df
    '''
    returns the initial dataframe with null values of numerical column
     replaced by the appopropriate of the column

    input:
           df : initial pandas dataframe
    output:
           df : initial dataframe with missing values of
                numerical column which are replaced by
                the appropriate value
    '''

    # sub-variables
    msg, num_col_list, c_name = None, None, None
    null_val_rate, l_ref = None, None

    if df is not None:

        if __debug_script__:
            print("  > Manage missing numerical data")

        # I identify columns with numerical data (float and int types)
        num_col_list = df.select_dtypes(include=['float', 'int']).columns
        # When the list of numerical columns is not empty
        #  i.e. there is numerical column to check
        if len(num_col_list) > 0:
            if __debug_script__:
                print("      List of num. columns:", num_col_list)
            # I run the list, column by column
            for c_name in num_col_list:
                if __debug_script__:
                    print("      > column '", c_name, "'")
                # When at least one value of the column is null (Nan ...),
                #  When the number of null value is higher than 50%
                #  of the total number of value of the column,
                #  I assumed not to have enough information to replace
                #  null value by the median, so I replace null value by
                #  the most frequent value,
                #  otherwise, I replace it by the median value of
                #  the column
                #  I use median or most frequent value to ensure having
                #  integer values as other values of the column since,
                #  here, we are talking about number of people (no float)
                if df[c_name].isnull().values.any():
                    # try:
                    l_ref = len(df[c_name])
                    null_val_rate = df[c_name].isnull().sum() / l_ref
                    if __debug_script__:
                        msg = '- Null value ratio of'
                        print("        ", null_val_rate)
                    if null_val_rate > 0.5:
                        subst_val = df.mode(axis=0, numeric_only=True)
                        if __debug_script__:
                            msg = '- Add most frequent value on:'
                            print("        ", c_name)
                    else:
                        subst_val = df[c_name].median()
                        if __debug_script__:
                            msg = '- Add median value on:'
                            print("        ", c_name)
                    df[c_name].fillna(subst_val, inplace=True)
                    # except:
                    #     if __debug_script__:
                    #         msg = 'CAUTION: Unable to fill the mean for var'
                    #         print("      ", msg, "'", col, "'")
                else:
                    if __debug_script__:
                        print("        - No null value detected")
        else:
            if __debug_script__:
                print("      - No float/int variable found in df")

    del msg, num_col_list, c_name
    return df


def clean_df(df: object, categories_selection: dict) -> object:  # = df
    '''
    returns the initial dataframe cleaned as desired, after management
     of missing, null and infinite data

    input:
           df : pandas dataframe
    ouput:
           df : input dataframe after undesirable data cleaning
    '''
    # function : remove infinite values

    # sub-variables
    msg = None

    print("> Clean df")  # Manage missing data
    if __debug_script__:
        print("  - df's size before cleaning:", df.shape)

    # Remove complete missing data
    # I decided to remove rows and columns that do not contain
    #  any value; they would bring nothing to the building of the
    #  model, except take time and memory space for nothing.
    df = remove_empty_row_col(df)

    # Check that response is still in the dataframe
    #  after removal of all empty rows and columns
    # It may be useful to now that when you change the response
    response = categories_selection['response']
    if response not in df.columns:
        msg = 'CAUTION: the Response variable'
        print("  ", msg, "'", response, "' is not available in the df")
        df = None

    # Drop rows with missing response values
    # I decided to remove row of all columns when related value in the
    #  response column is missing since then, in the frame of the model
    #  building, I would have information to assess the response, but
    #  no response to linked with.
    df = remove_rows_from_sel_col(df, response)

    # Clean infinite values
    #  because infinite values may not be properly managed by the code.
    #  They have no reason to be here and they cause memory errors.
    df = clean_infinite_data(df)

    # Manage null values of the numerical column
    #  because it may happen that hospitals do not
    #  fulfill all information once but, here we work on
    #  a lot of different hospitals and the same at different time
    #  so I assume that I can complete some blanks.
    #  Here fortunately, I do not have null value.
    df = manage_missing_num(df)

    del categories_selection, response, msg
    return df


# Clean data
# drop rows/columns with all missing values
# + replace infinite values by Nan
df = clean_df(df, categories_selection)

> Clean df


In [17]:
# %%pycodestyle  # Disabled because raised an error message
# "75:1: W391 blank line at end of file", while no

# PREPARING FOR MODELING

# Now that we clean the dataframe
# we can prepare the dataset for modeling


def get_X_y(df: object, categories_selection: dict):
    '''
    returns dataframes of response y and input data X for aiming at
     defining a model

    input:
           df       : pandas dataframe
           response : response column's name
    ouput:
           X : A matrix holding all of the variables you want to consider
                when predicting the response
           y : the corresponding response vector
    '''
    # function : split df into exploratory X data and response y data

    # output
    X, y = None, None

    # sub-variables
    response = categories_selection['response']

    print("> Split data into X/y")
    if __debug_script__:
        print("  - response:", response)
        print("  - df.columns:", df.columns)

    # Get the Response var
    if __debug_script__:
        print("  > Get y")
    if response in df.columns:
        # try:
        # Split into explanatory and response variables (1/2)
        #  Get response variable
        y = df[response]
        df = df.drop(columns=[response])  # Remove pred_name from df
        # except:
        #     print("    CAUTION: Unable to get the response data in df")
        #     y = None
    else:
        print("    CAUTION: Unable to find the response in df")
        y = None

    if __debug_script__ and (y is not None):
        print("    - y shape:", y.shape)

    # Get the Exploratory vars
    if __debug_script__:
        print("  > Get X")
    # try:
    # Split into explanatory and response variables (2/2)
    #  Get the input variables i.e. at this level just a copy of df
    X = df.copy(deep=True)
    # except:
    #     print("    CAUTION: Unable to get the exploratory variables (X)")
    #     X = None

    if __debug_script__ and (X is not None):
        print("    - X shape:", X.shape)

    del df, categories_selection, response
    return X, y


# Split into Response y / Exploratory variables X
X, y = get_X_y(df, categories_selection)

> Split data into X/y


In [18]:
# %%pycodestyle  # Disabled because raised an error message
# 162:1: W391 blank line at end of file, while no

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Now that we prepare dataset for modeling,
# it's time to model

# MODELING


def get_model(X: object, y: object, testrate=.3):
    '''
    returns a linear model, set of data used to establish,
     test this model and the associated r2 scores.

    input:
           X          : explanatory variables object
           y          : response variable object
           testrate   : proportion of the dataset to include in
                         the test split,between 0.0 and 1.0;
                         default value = 0.3
    output:
            model : linear regression model object from sklearn
            score : Merge of mean square error value between Train &
                     Test data set according to the proposed model
            list of X_train and y_train
            list of X_test and y_test
    '''
    # function: Give a prediction model according to
    #  its X/y_test/train inputs/outputs values

    model, score = None, 0
    Xtrain, Xtest, ytrain, ytest = None, None, None, None

    # sub-variables
    y_pred, acc_score_train, acc_score_test, msg = None, None, None, None
    r2_scores_train, r2_scores_test, mdl_score = None, None, None

    if (X is not None) and (y is not None):

        print("> Get model")

        # Split into train and test X/y data set
        #  to establish the model and score it
        print("  > Split Train / Test")
        # try:
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,
                                                        test_size=testrate,
                                                        random_state=42)
        if __debug_script__:
            print("    - X/y size:", X.shape, y.shape)
            print("    - Train X/y size:", Xtrain.shape, ytrain.shape)
            print("    - Test  X/y size:", Xtest.shape, ytest.shape)
        split = True
        # except:
        #     msg = 'CAUTION: Unable to split X/y into train & test dataset'
        #     print("  ", msg)
        #     split = False

        # Work on dtype
        print("  > Type of data")

        if __debug_script__:
            print("    - y:", y.dtypes)

        recensed_type = {}
        for var in X.columns:
            tip = X[var].dtypes
            if tip not in recensed_type:
                recensed_type[tip] = 1
            else:
                recensed_type[tip] = recensed_type[tip] + 1
        if len(recensed_type) > 0:
            for key, value in recensed_type.items():
                if __debug_script__:
                    print("    - X", key, " : x", value)
            if __debug_script__:
                print("    - Over", X.shape[1], " columns")

        # Establish model
        print("  > Modeling")
        if split:
            if __debug_script__:
                print("    - Train X/y size:", Xtrain.shape, ytrain.shape)

            # https://scikit-learn.org/stable/modules/generated/sklearn.linear
            #  _model.LinearRegression.html#sklearn.linear_model.
            #  LinearRegression.predict
            # https://www.codegrepper.com/code-examples/python/reg.predict+python
            # Fit linear model
            #  further methods:
            #  https://scikit-learn.org/stable/modules/linear_model.html
            model = LinearRegression().fit(Xtrain, ytrain)

            # https://scikit-learn.org/stable/modules/linear_model.
            #  html#ridge-regression-and-classification
            # and https://scikit-learn.org/stable/modules/generated/sklearn.
            #  linear_model.Ridge.html
            # clf = linear_model.Ridge(alpha=1.0)
            # Ridge(alpha=1.0)
            # model = clf.fit(X_train, y_train)
            # It does not work!

            # Get the model's score
            # Return the coefficient of determination of the prediction
            mdl_score = model.score(Xtrain, ytrain)

            if __debug_script__:
                print("    - Train mdl_score:", mdl_score)
            if __debug_script__:
                print("    - Test  X/y size:", Xtest.shape, ytest.shape)
            y_pred = model.predict(Xtest)
        else:
            y_pred = None
        del split

        # Evaluate this model
        if y_pred is None:
            if __debug_script__:
                print("    - Model not found")

        # Get metrics with a model by Regression
        print("  > Metrics")
        if y_pred is not None:
            # Accuracy_score (https://scikit-learn.org/stable/modules/
            # generated/sklearn.metrics.accuracy_score.html)
            # Confusion matrix (https://scikit-learn.org/stable/modules/
            # generated/sklearn.metrics.confusion_matrix.html)
            #  not appropriate for my purpose
            # Common pitfalls (https://scikit-learn.org/stable/common_
            #  pitfalls.html): mean_sqaured_error and r2_score

            # According to https://stackoverflow.com/questions/37367405/
            #  python-scikit-learn-cant-handle-mix-of-multiclass-and-continuous
            # ... Accuracy score is only for classification problems
            # acc_score_train = accuracy_score(y_train, model.predict(X_train))
            # acc_score_test = accuracy_score(y_test, model.predict(X_test))
            # if debug: print("    - acc_score_train:", acc_score_train)
            # if debug: print("    - acc_score_test:", acc_score_test)

            # always according to https://stackoverflow.com/questions/37367405/
            #  python-scikit-learn-cant-handle-mix-of-multiclass-and-continuous
            # For regression problems, use: R2 Score, MSE (Mean Squared Error),
            # RMSE (Root Mean Squared Error).
            r2_scores_train = r2_score(ytrain, model.predict(Xtrain))
            r2_scores_test = r2_score(ytest, model.predict(Xtest))
            print("    - r2_scores_train:", r2_scores_train)
            print("    - r2_scores_test:", r2_scores_test)

            score = r2_scores_train

    del X, y, testrate, y_pred, acc_score_train, acc_score_test
    del r2_scores_train, r2_scores_test, mdl_score
    return model, score, [Xtrain, ytrain], [Xtest, ytest]


# modeling
model, score, Xy_train, Xy_test = get_model(X, y)

> Get model
  > Split Train / Test
  > Type of data
  > Modeling
  > Metrics
    - r2_scores_train: 0.9761588302089482
    - r2_scores_test: 0.9755838001373758


In [19]:
# %%pycodestyle # Disabled because it raises an error message
# 57:1: W391 blank line at end of file, while no

# ANALYZE THE MODEL

# Now that we have a model, it's time ask some questions

# First question
# Which categories have the most effect on the model ?


def coef_weights(model, X_train) -> object:
    '''
    returns a dataframe with coefficients of the model
     (real and absolute values) sorted in the descending order
     of the absolute values

    input:
           model     : model for which we are looking coefficients
           X_train   : the training data
    output:
            coefs_df : dataframe with model's coefficients; that can be
                        used to understand the most influential coefficients
                        in a linear model by providing the coefficient
                        estimates along with the name of the variable
                        attached to the coefficient.
    '''
    # function: get model's coefficients

    coefs_df = pd.DataFrame()
    # Get name of every column in front  of its coefficients
    coefs_df['est_int'] = X_train.columns
    # get coefficients of the linear model
    coefs_df['coefs'] = model.coef_
    # get absolute value of these coefficients
    coefs_df['abs_coefs'] = np.abs(model.coef_)
    # Sort coefficient by descending order
    coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)

    del model, X_train
    return coefs_df


print('  > Define impact of categories on the model:')
coef_df = None

# Compute coefficient of weight on every category for this model
coef_df = coef_weights(model, Xy_train[0])  # Xy_train = [Xtrain, ytrain]
pd.set_option("display.max_rows", None, "display.max_columns", None)
# If max_rows is exceeded, switch to truncate view
# If max_cols is exceeded, switch to truncate view
print('    ', coef_df)

# Result
# display of model's weight coefficients show that gender of patients
#  is the first category of the model (mortality), before department code.

  > Define impact of categories on the model:
                                     est_int         coefs     abs_coefs
107                           Sexe_Tous  4.629496e+08  4.629496e+08
106                          Sexe_Homme  4.629496e+08  4.629496e+08
105                          Sexe_Femme  4.629495e+08  4.629495e+08
60               Code_du_Departement_57  2.284012e+08  2.284012e+08
78               Code_du_Departement_75  2.284010e+08  2.284010e+08
97               Code_du_Departement_94  2.284009e+08  2.284009e+08
62               Code_du_Departement_59  2.284009e+08  2.284009e+08
63               Code_du_Departement_60  2.284009e+08  2.284009e+08
71               Code_du_Departement_68  2.284008e+08  2.284008e+08
98               Code_du_Departement_95  2.284008e+08  2.284008e+08
70               Code_du_Departement_67  2.284008e+08  2.284008e+08
65               Code_du_Departement_62  2.284008e+08  2.284008e+08
83               Code_du_Departement_80  2.284008e+08  2.284008e+

In [20]:
# %%pycodestyle # Disabled because it raises an error message
#  "too many values to unpack (expected 3)" linked to pycodestyle
# See above at the beginning.

# STATS ANALYSIS

# Finally, I'd like to have the stats over recorded patients
# Since the principe will be the same, only categories will change,
#  I gather three questions here that will be treated in sequence
#  the same way.

# Questions:
# What is the proportion of hospitalized Covid-19 patients
#  which go in critical care ?
# According to the result of the preceding block that raises
#  the importance of the gender for the model,
#  what is the proportion of female and male Covid-19 patients
#   that finally died due to Covid-19 ?


def df_logical_no(df, c_name):
    '''
    returns the initial dataframe completed with a new column
     with result of a logical no operation on a selected data,
     reporting also the new column's name

    input:
      df         : initial dataframe to modify
      c_name: name of the df's column to compute a 'logical no' for
    output:
      df : df completed with the 'logical no' column of the given column
      new_c_name : name of the new column
    '''
    # function: add values of a 'logical no' column for a given category
    #  of the dataframe

    liste, r, no_b, msg = None, None, None, None
    if __debug_script__:
        print('   - LOGICAL NO:', c_name)
    # try:
    # Create a list that gives result of a boolean logical no,
    #  returning None for None value
    liste = df[c_name].values.tolist()
    for i, j in enumerate(liste):
        if j is None:
            liste[i] = -1
    complement = np.add(liste, 1)
    no_liste = np.where(complement > 1, False,
                        np.where(complement == 0, None, True))

    # Implement the new column with its name into the dataframe
    new_c_name = c_name + '_logical_no'
    df[new_c_name] = pd.Series(no_liste)
    if __debug_script__:
        print('     - df: -> no', column, '\n')
    # except:
    #     msg = '- Unable to compute logical no of column'
    #     print("    ", msg, "'" + column + "'")

    del column_name, liste, r, no_b, msg
    return df, new_c_name


def divide_N_by_D(df, col_ND_names, new_c_name, min_qty_col_D) -> object:
    '''
    returns the initial dataframe completed with a new column
     containing the required divide values

    input:
      df          : initial dataframe
      col_ND_names: names of df's columns for numerator of the division
                     and denominator of the division
      new_c_name: name of new df's column with result of the division
     min_qty_col_D: Minimum value to consider on denominator do perform
                     the computation
    output:
      df          : df completed with result of the divide
    '''
    # function: divide values of two selected columns from a df
    #            and add result into this df

    division = []

    # Get numerator and denominator values
    list_N = df[col_ND_names[0]].values.tolist()
    list_D = df[col_ND_names[1]].values.tolist()

    # Run the list, divide index by index with requested management
    for i, numerator in enumerate(list_N):
        if list_D[i] != 0:
            if list_D[i] >= min_qty_col_D:
                r = 100 * numerator / list_D[i]  # in percent
                # For debug, display start & too high values of computation
                if __debug_script__ and ((i < 10) or (r > 99.9)):
                    print('i=', i, '->', list_N[i], '/', list_D[i],
                          '=', r, '%')
            else:
                r = None
        else:
            r = None
        division.append(r)
    # Tring to process with numpy functions and list/np.Array did not work
    # it generates more issues to deal with such it was cancelled.

    df[new_c_name] = pd.Series(division)  # Add result in df
    # Sort df in descending order by result of the divide
    df = df.sort_values(new_c_name, ascending=False)

    del col_ND_names, new_c_name, list_N, list_D,
    del division, min_qty_col_D
    return df


def get_value_at_tgt_quantil(df:object, c_name: str, tgt_qtil: float,
                             nb_valid_data=None):
    '''
    returns a quantil value and the associated selected data value

    input:
      df           : dataframe to work on
      c_name     : name of the df's column to work on
      tgt_qtil     : target quantil (%)
      nb_valid_data: (option) size of the valid dataset to consider
                      in the column; default = df_column's full size
    output:
      found_qtil   : actual quantil value (normally close to the
                      tgt_qtil) that reach the closest the target quantil
      found_value  : value of the data that reach the tgt_qtil
                      of data at or below this value
    '''
    # function: Find the quantil value closest to the target quantil of 
    #            the selected df column's dataset


    # Primary variables declaration
    found_value, found_quantil = 0, 0
    # sub-variables declaration
    delta, min_delta = 100, 100
    moving_value_int, moving_value, moving_qtil = 0, 0, 0

    data = df[c_name]  # Get df column's data to work on

    if nb_valid_data is None:  # get a default value of valid data if needed
        nb_valid_data = len(data)

    # We run the 'moving_value' from 0 to 100% by 0.01%-step
    for moving_value_int in range(0, 10000, 1):
        moving_value = float(moving_value_int)/100
        # Get 'moving quantil' of 'value' below the 'moving_value'
        moving_qtil = 100 * data[data <= moving_value].count() / nb_valid_data
        # Compute the diffrence btw 'moving_qtil' and 'tgt_qtil'
        #  we are trying to reach
        delta = abs(tgt_qtil - moving_qtil)
        if __debug_script__:
            print('    for moving_value:', "{:.3f}".format(moving_value),
                  '\t we reach moving_quantil:', "{:.3f}".format(moving_qtil),
                  ' => delta ', "{:.3f}".format(delta),
                  'vs min_delta:', "{:.3f}".format(min_delta))
        # When the difference at its lowest, it means we find value
        #  of the 'value' for which we reach the targeted 'target quantil'
        if delta <= min_delta:
            min_delta = delta
            found_value = moving_value
            found_qtil = moving_qtil
            if __debug_script__:
                print('    change of delta')
    if __debug_script__:
        print('    value:', found_value)

    del df, c_name, tgt_qtil, data, delta, min_delta,\
        moving_value_int, moving_value, moving_qtil
    return found_qtil, found_value


def get_basic_stats(df, c_name):
    '''
    returns a dictionary with stats title and results

     input:
       df      : dataframe to cope with
       c_name: name of the df's column to work on
     output:
       stats: dictionnary with 'maximum', 'average' and 'median' values 
              of the selected dataset.
    '''
    # Function: Compute and display basic stats on a given column of a df

    column = df[c_name] # Get values of the divide
    # Compute ratio of computed result 'c_name'
    #  over the full size of data
    ratio_valid_result = 100 * column.isna().sum() / len(df)
    msg1, msg2 = '- Stats computed on', 'of the full dataset'
    print('   ', msg1, "{:.2f}".format(ratio_valid_result), '% (', "{:d}".format(column.isna().sum()), 'values)', msg2)
    maximum_value = column.max()
    median_value = column.median()
    average_value = column.mean()
    print('    - maximum:', "{:.2f}".format(maximum_value), '%',
            '\n    - average:', "{:.2f}".format(average_value), '%',
            '\n    - median :', "{:.2f}".format(median_value), '%')
    
    del df, c_name, column, ratio_valid_result, msg1, msg2
    return {'maximum': maximum_value, 'average': average_value, 'median': median_value}


def get_rate_stats(df, c_name, tgt_pct):
    '''
    returns a dictionary of quantil targets

    input:
      df      : dataframe to cope with
      c_name: name of the df's column to work on
      tgt_pct : list of target percentage values of the population
                to consider, spreading most critical cases
    output:
      stats: dictionnary with 'maximum', 'average' and 'median' values 
             of the selected dataset.
    '''
    # Function: Compute and display stats values of the selected category
    #            for a couple of target quantil

    # output
    stats = {}
    # sub-variables
    tgt_rate, value_of_qtil, value_at_qtil = None, None, None

    # Give quantity of valid data
    column_data = df[c_name]
    nb_valid_data = len(df) - column_data.isna().sum()

    # We give target quantil and we want to know the value of the c_name 
    #  where we met the closest the target quantil.
    #  For instance, we define a target at 95.4% (2 sigma) and we want
    #  to know value of the divide values where 95.4% of the values
    #  are equal or below this target. Because the sample's distribution
    #  is not continue and maybe big enough, the % may be close but not
    #  exactly at the target quantil.
    # Make the computation for all targets defined in the list tgt_pct:
    for tgt_rate in tgt_pct:
        # Try reaching a quantil as close as possible of the target
        #  a return the associated value at the quantil reached.
        value_of_qtil, value_at_qtil = get_value_at_tgt_quantil(df, c_name, tgt_rate, nb_valid_data)
        
        # Record the result into a dictionay
        stats[tgt_rate] = {'quantil': value_of_qtil, 'value': value_at_qtil}

        # Display the reached (vs target) population rate spreading 
        #  the critical cases and the related rate according to the
        #  categories used in the divide.
        print('    - It may concern', "{:.2f}".format(value_at_qtil), '%',
              'of COVID-19 patients'
              ' when considering', "{:.2f}".format(value_of_qtil), '%',
              'of the less critical patients',
              '(for an initial target at', "{:.2f}".format(tgt_rate), '%)')

    del df, c_name, tgt_pct, nb_valid_data, tgt_rate, column_data
    return stats


def get_selected_rate(df, input_dict, min_sample_size, tgt_pct):
    '''
    returns a dictionary of pre-defined stats values about the 
     selected data in a dataframe

    input:
        df          : dataframe to cope with
     input_dict     : dictionnary with result's titles as keys and
                       a list with df's column names for numerator
                       and denominator as values
                       = {title1: [col_N1, col_D1], title2: [...] ...}
    min_sample_size : Minimum value to consider on denominator do perform
                       the computation
      tgt_pct       : list of target percentage values of the population
                       to consider, spreading most critical cases
    output:
      outputs       : dictionary with result's titles as keys and
                       a sub-dictionary with target percentage and
                       result values
    '''
    # function: provide stats (max, average, median) and find value at
    #  three target quantil on a category of dataframe over another.

    # output
    outputs = {}

    # sub-variables
    stats_basic, stats_rate = {}, {}
    title = None

    for title in input_dict:

        # Second step of the stats consists in looking for the value (%)
        #  of transfer of the Covid-19 patients from one state to another

         # get column's title of [numerator; denominator]
        num_denum = input_dict[title]
        # Define name of the df's column from defined title of results
        result_name = title.replace(" ", "_")
        # Perform the divide of Numerator by Denominator
        df = divide_N_by_D(df, num_denum, result_name, min_sample_size)

        print("  > Provide stats related to", title, ":")

        # First stats are basic stats.
        # I call a function to get basic stats on this new computed values
        stats_basic = get_basic_stats(df, result_name)

        # Second step of the stats consists in looking for the value (%)
        #  of transfer of the Covid-19 patients from one state to another
        # For that, I call a function to get stats on given proportion 
        #  of the considered population of patients
        stats_rate = get_rate_stats(df, result_name, tgt_pct)

        # Implement result of stats on the divide into output dict.
        outputs[title] = title  # feed the result dict with inputs
        outputs[title] = stats_rate  # ... and with the stats results

    del df, input_dict, min_sample_size, tgt_pct, num_denum,\
        result_name, title
    return outputs


# Note literally the requested rates and associated categories 
#  of the dataframe to compute the requested stats
inputs = {'rate of patients in critical care over hospitalization':
          ['Nb_actuellement_en_soins_intensifs', 'Nb_actuellement_hospitalises'],
          'rate of female that died due to covid-19': ['Sexe_Femme', 'Total_Deces'],
          'rate of male that died due to covid-19': ['Sexe_Homme', 'Total_Deces']}
min_sample_size = 10  # minimum sample size, below this quantity of available values,
                      #  the data are not considered for computing stats.
# According to Gaussian theory: %-values refer to 3sigma, 2sigma, 1sigma
tgt_pct = [99.7, 95.4, 68.3]
# I've prepared a set of three entries to do the same operation of three
#  different factors. I send these entries in a first function that will apply the 
# same process on a loop. The unitary processing will go a step deeper.
stats_result = get_selected_rate(df, inputs, min_sample_size, tgt_pct)

# END OF THE PROJECT
del inputs, min_sample_size, stats_result

# RESULT
# Processing time is a bit long, notably because the script
#  realizes three times a loop between 0 to 100 with 0.01-step
#  A better way to code is very probably possible, but here, the time 
#  is not a key factor for the evaluation
# Concerning Covid-19 patients (13.8% of reported case), when the 5%-most critical
#  cases are spread (i.e. the 95.4% value = 2sigma):
# - Transfer rate from hospitalization to critical care is about 30%; So 70% of luck
#   to avoid critical care
# - Mortality is about 2.4% for female patients and 1.9% for male patients;
#   so respectively 97.6% and 98.1% of chance avoiding death.

print('\n' + 10 * "-", "PROJECT #01 /  END ", 10 * "-", "\n")

  > Provide stats related to rate of patients in critical care over hospitalization :
    - Stats computed on 13.81 % ( 38739 values) of the full dataset
    - maximum: 80.00 % 
    - average: 12.23 % 
    - median : 10.13 %
    - It may concern 49.99 % of COVID-19 patients when considering 99.65 % of the less critical patients (for an initial target at 99.70 %)
    - It may concern 30.76 % of COVID-19 patients when considering 95.36 % of the less critical patients (for an initial target at 95.40 %)
    - It may concern 15.15 % of COVID-19 patients when considering 68.26 % of the less critical patients (for an initial target at 68.30 %)
  > Provide stats related to rate of female that died due to covid-19 :
    - Stats computed on 5.30 % ( 14870 values) of the full dataset
    - maximum: 10.00 % 
    - average: 0.41 % 
    - median : 0.00 %
    - It may concern 9.09 % of COVID-19 patients when considering 99.66 % of the less critical patients (for an initial target at 99.70 %)
    - It