In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [5]:
# Read the spreadsheet into `dataset`, then work on an independent deep copy
# (`df`) so the frame as loaded stays available for reference.
dataset = pd.read_excel(io='dataset.xlsx')
df = dataset.copy(deep=True)

## Création des sous-ensembles de données

In [7]:
# Share of missing values per column: NaN count divided by the number of rows.
missing_rate = df.isna().sum().div(len(df))
missing_rate

Patient ID                                               0.000000
Patient age quantile                                     0.000000
SARS-Cov-2 exam result                                   0.000000
Patient addmited to regular ward (1=yes, 0=no)           0.000000
Patient addmited to semi-intensive unit (1=yes, 0=no)    0.000000
                                                           ...   
HCO3 (arterial blood gas analysis)                       0.995216
pO2 (arterial blood gas analysis)                        0.995216
Arteiral Fio2                                            0.996456
Phosphor                                                 0.996456
ctO2 (arterial blood gas analysis)                       0.995216
Length: 111, dtype: float64

In [9]:
# Group columns by their missing-value rate (both bounds exclusive):
#   blood_columns -> rate strictly between 0.88 and 0.90
#   viral_columns -> rate strictly between 0.75 and 0.80
# The names are taken from missing_rate's own index instead of df.columns, so
# the boolean mask always has the same length as the labels it filters. This
# keeps the cell re-runnable even after `df` has been narrowed to fewer columns.
blood_mask = (missing_rate > 0.88) & (missing_rate < 0.90)
viral_mask = (missing_rate > 0.75) & (missing_rate < 0.80)
blood_columns = list(missing_rate.index[blood_mask])
viral_columns = list(missing_rate.index[viral_mask])

In [10]:
# Columns that are always kept: the patient's age quantile and the test result.
key_columns = [
    'Patient age quantile',
    'SARS-Cov-2 exam result',
]

In [12]:
# Keep only the key columns followed by the blood and viral groups, in that order.
selected_columns = [*key_columns, *blood_columns, *viral_columns]
df = df[selected_columns]
df.head()

Unnamed: 0,Patient age quantile,SARS-Cov-2 exam result,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,...,Parainfluenza 3,Chlamydophila pneumoniae,Adenovirus,Parainfluenza 4,Coronavirus229E,CoronavirusOC43,Inf A H1N1 2009,Bordetella pertussis,Metapneumovirus,Parainfluenza 2
0,13,negative,,,,,,,,,...,,,,,,,,,,
1,17,negative,0.236515,-0.02234,-0.517413,0.010677,0.102004,0.318366,-0.95079,-0.09461,...,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected
2,8,negative,,,,,,,,,...,,,,,,,,,,
3,5,negative,,,,,,,,,...,,,,,,,,,,
4,15,negative,,,,,,,,,...,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected


## Découpage train / test

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
trainset, testset = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Class balance of the target column in the training split.
target_column = 'SARS-Cov-2 exam result'
trainset[target_column].value_counts()

negative    4068
positive     447
Name: SARS-Cov-2 exam result, dtype: int64

In [16]:
# Class balance of the target column in the test split.
target_column = 'SARS-Cov-2 exam result'
testset[target_column].value_counts()

negative    1018
positive     111
Name: SARS-Cov-2 exam result, dtype: int64

## Encodage des données

In [18]:
# Text label -> numeric code used when encoding the object-typed columns.
dict_map = {
    'negative': 0,
    'positive': 1,
    'not_detected': 0,
    'detected': 1,
}

In [22]:
# Encode the text-valued columns of the training split as 0/1 using dict_map.
#
# - Work on an explicit copy: assigning columns into the frame returned by the
#   split would otherwise typically raise pandas' SettingWithCopyWarning.
# - Pick the columns from trainset itself rather than from df. `df` is never
#   encoded, so looping over df's object columns on a second run would map the
#   already-encoded 0/1 values through dict_map, and Series.map turns every
#   value that is not a key of the dict into NaN. After the first run trainset
#   has no object columns left, so re-running this cell is a no-op.
trainset = trainset.copy()
for col in trainset.select_dtypes('object'):
    trainset[col] = trainset[col].map(dict_map)
trainset.head()

Unnamed: 0,Patient age quantile,SARS-Cov-2 exam result,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,...,Parainfluenza 3,Chlamydophila pneumoniae,Adenovirus,Parainfluenza 4,Coronavirus229E,CoronavirusOC43,Inf A H1N1 2009,Bordetella pertussis,Metapneumovirus,Parainfluenza 2
543,18,0,1.358055,1.356092,-0.228491,-0.438097,1.142196,-0.517481,0.244149,0.275501,...,,,,,,,,,,
4937,11,0,,,,,,,,,...,,,,,,,,,,
2884,3,0,,,,,,,,,...,,,,,,,,,,
1837,15,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2228,7,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Encode the text-valued columns of the test split as 0/1 using dict_map.
#
# - Work on an explicit copy: assigning columns into the frame returned by the
#   split would otherwise typically raise pandas' SettingWithCopyWarning.
# - Pick the columns from testset itself rather than from df. `df` is never
#   encoded, so looping over df's object columns on a second run would map the
#   already-encoded 0/1 values through dict_map, and Series.map turns every
#   value that is not a key of the dict into NaN. After the first run testset
#   has no object columns left, so re-running this cell is a no-op.
testset = testset.copy()
for col in testset.select_dtypes('object'):
    testset[col] = testset[col].map(dict_map)
testset.head()

Unnamed: 0,Patient age quantile,SARS-Cov-2 exam result,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,...,Parainfluenza 3,Chlamydophila pneumoniae,Adenovirus,Parainfluenza 4,Coronavirus229E,CoronavirusOC43,Inf A H1N1 2009,Bordetella pertussis,Metapneumovirus,Parainfluenza 2
5113,7,0,,,,,,,,,...,,,,,,,,,,
2343,7,0,,,,,,,,,...,,,,,,,,,,
2552,1,0,,,,,,,,,...,,,,,,,,,,
895,11,0,-0.15259,0.416252,-0.21593,1.693575,0.87774,1.785361,2.235712,-0.553771,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4762,4,0,0.442512,0.353596,0.186048,1.693575,0.313568,1.188328,-0.253742,-0.848747,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
