## Code pour integration du modèle en production : 
* Etape 0 : Sert à générer le signal pour tester les fonctions (ne pas en tenir compte pour l'intégration du code)
* Etape 1 & 2 : Déclaration des fonctions preprocessing et classifier, puis exécution pour avoir en sortie la valeur de la prédiction : 1 -> non bruité, 2 -> moyennement bruité, 3 -> fortement bruité
* Fichier complémentaire : le dossier "CNN_model_all_patients" qui le modèle enregistré et que l'on charge à l'aide de Keras

## Etape 0 - Generation du jeu de données d'entrée afin de générer un signal d'entrée pour tester les fonctions (liste de 2000 valeurs)

In [3]:
# Read the file path we have in the current folder 

# Save image path of training data
import pathlib
import re
file_paths = pathlib.Path('../aura-data')

# Convert all paths into a string with comprehension list
file_paths = [str(file_path) for file_path in list(file_paths.glob("*/*"))] # list python files in the directory tree
print(file_paths[:5])
# Select dat_path_file for patients 111001, 100001, 105001
r = re.compile(".*(111001|100001|105001)_ECG\.dat$") # create regex to match ECG.dat files
ecg_file_dat_paths = list(filter(r.match, file_paths)) # Filter only paths for .dat files
print(ecg_file_dat_paths[:5])
# Select csv_path_file for patients 111001, 100001, 105001
r = re.compile(".*(111001|100001|105001)_ANN\.csv$") # create regex to match ECG.dat files
ecg_file_csv_paths = list(filter(r.match, file_paths)) # Filter only paths for .dat files
print(ecg_file_csv_paths[:5])
# Create list of patient ids for patients 111001, 100001, 105001
r = re.compile(".*(111001|100001|105001)")
folder_paths = pathlib.Path('../aura-data')
patient_ids = [str(folder_name) for folder_name in list(folder_paths.iterdir()) if folder_name.is_dir()] # extract folder paths in the current directory
patient_ids = list(filter(r.findall, patient_ids))
patient_ids = [re.findall('\d+', x )[0] for x in patient_ids]
patient_ids

['../aura-data/118001/118001_ECG.hea', '../aura-data/118001/118001_ACC.hea', '../aura-data/118001/118001_ANN.csv', '../aura-data/118001/118001_ACC.dat', '../aura-data/118001/118001_ECG.dat']
['../aura-data/100001/100001_ECG.dat', '../aura-data/111001/111001_ECG.dat', '../aura-data/105001/105001_ECG.dat']
['../aura-data/100001/100001_ANN.csv', '../aura-data/111001/111001_ANN.csv', '../aura-data/105001/105001_ANN.csv']


['100001', '111001', '105001']

In [4]:
def resampling_data(sample_size, dat_file_path, csv_file_path):
    # Import csv annotation file and remove lines with null start/ end/ class values
    header_list = ["start1", "end1", "class1","start2", "end2", "class2","start3", "end3", "class3","start", "end", "class"]
    df_csv = pd.read_csv(csv_file_path, names=header_list)
    df_csv = df_csv[df_csv["class"].notnull() & df_csv["start"].notnull() & df_csv["end"].notnull()]
    df_csv["start"] = df_csv["start"].astype(int)
    df_csv["end"] = df_csv["end"].astype(int)
    df_csv["class"] = df_csv["class"].astype(int)
    df_csv = df_csv[df_csv["class"]!=0].reset_index(drop=True)
    df_csv['signal_length']=df_csv['end'] - df_csv['start'] 
    #df_csv["sample"]=round(df_csv["signal_length"]/sample_size)+1
    display(df_csv.head())
    display(df_csv.tail())
    display(df_csv["class"].value_counts()/len(df_csv))

    # Import data and reshape it to array matrix with 1000 features 
    # Import all data in array
    arr_data = wfdb.rdrecord(dat_file_path.replace(".dat",""))
    arr_data = arr_data.adc()
    display(f"Input dat file size : {arr_data.shape}")

    # Create a second column to set label value
    arr_zero = np.zeros((arr_data.shape[0],arr_data.shape[1]), dtype=int)
    arr_data = np.append(arr_data, arr_zero, axis=1)

    # Create list with class label for each range of 1000 values
    arr_data[0,1] = df_csv["class"][0]
    for i in tqdm(range(0,len(df_csv))):
        label = df_csv["class"][i]
        start = int(df_csv.start[i])
        end = int(df_csv.end[i])
        for ii in range(start, end):
            arr_data[ii,1] = label

    # Display the number of row with null label
    display(f"Number of element with label == 0 : {arr_data[arr_data[:,1]==0].shape[0]}")

    # Remove row with null label
    # arr_data = arr_data[arr_data[:,1]!=0]
    # need to have number of rows which is a multiple of sample_size
    arr_data_size = (arr_data.shape[0]//sample_size)*sample_size

    # Reshape vector data into 2 Matrix (values, labels)
    arr_data_values = arr_data[:,0]
    arr_data_labels = arr_data[:,1]
    arr_data_values = arr_data_values[:arr_data_size].reshape(arr_data[:arr_data_size].shape[0]//sample_size,sample_size)
    arr_data_labels = arr_data_labels[:arr_data_size].reshape(arr_data[:arr_data_size].shape[0]//sample_size,sample_size)

    # Choose the mode as final label for each sample
    arr_data_label_results = np.zeros((arr_data_labels.shape[0],1), dtype=int)
    for i in range(len(arr_data_labels)):
        arr_data_label_results[i]=stats.mode(arr_data_labels[i])[0]

    # Concatenate data array matrix with label column vector array
    arr_data_resampled = np.concatenate((arr_data_values,arr_data_label_results), axis=1)

    # Remove rows with null element in label column
    arr_data_resampled = arr_data_resampled[arr_data_resampled[:,sample_size]!=0]
    display(f"Final array size : {arr_data_resampled.shape}")
    return arr_data_resampled

In [5]:
def prepare_class(sample_size, arr_data_all, resample_scale):
    arr_class_1 = arr_data_all[arr_data_all[:,sample_size]==1][:,:sample_size:resample_scale]
    arr_class_1 = arr_class_1#/1000
    display(arr_class_1.shape)
    np.random.shuffle(arr_class_1)
    display(arr_class_1)
    arr_class_2 = arr_data_all[arr_data_all[:,sample_size]==2][:,:sample_size:resample_scale]
    arr_class_2 = arr_class_2#/1000
    #display(arr_class_2.shape)
    np.random.shuffle(arr_class_2)
    #display(arr_class_2)
    arr_class_3 = arr_data_all[arr_data_all[:,sample_size]==3][:,:sample_size:resample_scale]
    arr_class_3 = arr_class_3#/1000
    #display(arr_class_3.shape)
    np.random.shuffle(arr_class_3)
    #display(arr_class_3)
    arr_class_0 = np.concatenate((arr_class_2, arr_class_3), axis=0) 
    display(arr_class_0.shape)
    np.random.shuffle(arr_class_0)
    display(arr_class_0)
    return arr_class_0, arr_class_1

In [6]:
# Use function for all patients 
import numpy as np
import pandas as pd
import wfdb
from tqdm.notebook import tqdm
from scipy import stats
sample_size = 2000
arr_data_all = np.zeros((0,sample_size+1), dtype=int)
for patient_id, dat_file_path, csv_file_path in zip(patient_ids, ecg_file_dat_paths,ecg_file_csv_paths):
    display(f"patient id : {patient_id}")
    arr_data_labelized = resampling_data(sample_size, dat_file_path, csv_file_path)
    arr_data_all = np.concatenate((arr_data_all,arr_data_labelized), axis=0)
arr_data_all

'patient id : 100001'

Unnamed: 0,start1,end1,class1,start2,end2,class2,start3,end3,class3,start,end,class,signal_length
0,1.0,198867.0,2.0,1.0,19525.0,1.0,1,7047,2,1,7047,2,7046
1,198868.0,320282.0,1.0,19526.0,28694.0,2.0,7048,17209,1,7048,17209,1,10161
2,320283.0,373109.0,2.0,28695.0,32739.0,1.0,17210,28390,2,17210,28694,2,11484
3,373110.0,2197974.0,1.0,32740.0,96699.0,2.0,28391,32653,1,28695,32653,1,3958
4,2197975.0,2582746.0,2.0,96700.0,110564.0,1.0,32654,71061,2,32654,112474,2,79820


Unnamed: 0,start1,end1,class1,start2,end2,class2,start3,end3,class3,start,end,class,signal_length
758,,,,,,,59818365,59824575,1,86486096,86580752,2,94656
759,,,,,,,59824576,59826028,2,86580753,86588415,1,7662
760,,,,,,,59826029,59830915,1,86588416,86757975,2,169559
761,,,,,,,59830916,59832492,2,86757976,86827127,1,69151
762,,,,,,,59832493,59858995,1,86827128,87087000,2,259872


2    0.500655
1    0.496723
3    0.002621
Name: class, dtype: float64

'Input dat file size : (87087000, 1)'

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=763.0), HTML(value='')))




'Number of element with label == 0 : 762'

'Final array size : (43543, 2001)'

'patient id : 111001'

Unnamed: 0,start1,end1,class1,start2,end2,class2,start3,end3,class3,start,end,class,signal_length
0,1.0,4315694.0,2.0,1.0,411254.0,2.0,1,7523,1,1,415143,2,415142
1,4315695.0,4334794.0,1.0,411255.0,421291.0,1.0,7524,16977,2,415144,421291,1,6147
2,4334795.0,7908996.0,2.0,421292.0,433271.0,2.0,16978,18848,1,421292,433515,2,12223
3,7908997.0,7915643.0,1.0,433272.0,447315.0,1.0,18849,105566,2,433516,447315,1,13799
4,7915644.0,13719093.0,2.0,447316.0,451027.0,2.0,105567,108563,1,447316,451027,2,3711


Unnamed: 0,start1,end1,class1,start2,end2,class2,start3,end3,class3,start,end,class,signal_length
2734,,,,,,,35206925,35212255,1,88835329,88857959,1,22630
2735,,,,,,,35212256,35217108,2,88857960,89860280,2,1002320
2736,,,,,,,35217109,35222483,1,89860281,89872911,1,12630
2737,,,,,,,35222484,35270820,2,89872912,90622551,2,749639
2738,,,,,,,35270821,35273851,1,90622552,90645000,3,22448


2    0.495071
1    0.396130
3    0.108799
Name: class, dtype: float64

'Input dat file size : (90645000, 1)'

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2739.0), HTML(value='')))




'Number of element with label == 0 : 2738'

'Final array size : (45322, 2001)'

'patient id : 105001'

Unnamed: 0,start1,end1,class1,start2,end2,class2,start3,end3,class3,start,end,class,signal_length
0,1.0,47323379.0,3.0,1.0,47323871.0,3.0,1,47323500,3,1,47323500,3,47323499
1,47323380.0,47388207.0,2.0,47323872.0,47329547.0,1.0,47323501,47323539,1,47323501,47326788,2,3287
2,47388208.0,47404719.0,1.0,47329548.0,47333899.0,2.0,47323540,47326788,2,47326789,47329547,1,2758
3,47404720.0,47449035.0,2.0,47333900.0,47370615.0,1.0,47326789,47329715,1,47329548,47333904,2,4356
4,47449036.0,47492819.0,1.0,47370616.0,47377747.0,2.0,47329716,47333904,2,47333905,47370615,1,36710


Unnamed: 0,start1,end1,class1,start2,end2,class2,start3,end3,class3,start,end,class,signal_length
1349,,,,,,,79063217,79158543,1,138907088,138909568,2,2480
1350,,,,,,,79158544,79159920,2,138909569,138989087,1,79518
1351,,,,,,,79159921,79163967,1,138989088,138997600,2,8512
1352,,,,,,,79163968,79166160,2,138997601,139120079,1,122478
1353,,,,,,,79166161,79174599,1,139120080,139147000,3,26920


2    0.499261
1    0.499261
3    0.001477
Name: class, dtype: float64

'Input dat file size : (139147000, 1)'

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1354.0), HTML(value='')))




'Number of element with label == 0 : 1353'

'Final array size : (69573, 2001)'

array([[32292, 32270, 32272, ..., 15422, 15422,     2],
       [15450, 15486, 15498, ...,  6051,  6037,     2],
       [ 6019,  5995,  5957, ..., -1206, -1200,     2],
       ...,
       [  -77,  -323,  -441, ...,  -293,  -516,     3],
       [ -530,  -615,  -661, ...,  -529,  -540,     3],
       [ -698,  -817,  -845, ...,   133,   -72,     3]])

In [7]:
arr_data_all.shape

(158438, 2001)

In [8]:
resample_scale = 1
arr_class_0, arr_class_1 = prepare_class(sample_size, arr_data_all, resample_scale)

(78077, 2000)

array([[  -114,   -116,   -116, ...,   -183,   -183,   -188],
       [  -102,   -101,   -101, ...,    -88,    -92,   -106],
       [-12446, -12461, -12455, ..., -11805, -11819, -11815],
       ...,
       [    90,     88,     90, ...,    -66,    -59,    -54],
       [    13,      7,     10, ...,    144,    198,    255],
       [   198,    197,    192, ...,   -133,   -137,   -131]])

(80361, 2000)

array([[-120, -126, -131, ...,  106,  105,  108],
       [ -97, -103, -107, ...,  289,  294,  290],
       [ -33,  -25,  -47, ...,  -36,  -26,   -6],
       ...,
       [-185, -189, -179, ...,  351,  334,  324],
       [ 124,  130,  136, ..., -336, -336, -329],
       [ 365,  394,  379, ..., -184, -181, -174]])

In [9]:
list_ = list(arr_class_1[0]) # On charge le signal (liste de 2000 valeurs dans la variable list_)

## Etape 1 - Declaration de la fonction de preprocessing : Transformation de la liste en Scaleogramme pour tester le modèle -> X_prod

In [17]:
from sklearn.preprocessing import StandardScaler
import pywt
def preprocessing(list_):
    signal_length = 2000 # nous prenons des portions de 2000 valeurs du signal brut
    waveletname = 'morl'
    signal = np.asarray(list_[:signal_length:8]) # we reduce the number of values by 8 : signal of 2000 values to 250 values
    scaler = StandardScaler()
    signal = scaler.fit_transform(signal.reshape(-1, 1)).reshape(250,) # apply standard scalar in the signal
    signal = list(signal)
    size_dataset = 1 # 1 seul signal est donnée en entrée
    fs = len(signal)
    scales = range(1, fs)
    X_prod = np.ndarray(shape=(size_dataset, fs-1, fs-1, 3))
    for j in tqdm(range(0, 3)):
        coeff, freq = pywt.cwt(signal, scales, waveletname, 1)
        X_prod[0, :, :, j] = coeff[:,:fs-1]
    return X_prod

## Etape 2 - Declaration fonction de classification (chargement du modèle, X_prod en input)

In [18]:
from tensorflow import keras
def classifier(X_prod):
    model = keras.models.load_model("CNN_model_all_patients_v2")
    return int(model.predict_classes(X_prod)) # La fonction renvoie en sortie la valeur de la prédiction : 1 -> non bruité, 2 -> moyennement bruité, 3 -> fortement bruité

## Etape 3 - Execution des fonctions

In [19]:
X_prod = preprocessing(list_)  # On applique la fonction de preprocessing avec en input list_ (liste de 2000 valeurs dans la variable list_)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [20]:
result = classifier(X_prod) # On execute la fonction classifier avec X_prod en input (valeurs en sortie : 1 -> non bruité, 2 -> moyennement bruité, 3 -> fortement bruité)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [21]:
print(result)

1
