In [26]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import xgboost
import sklearn
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error,median_absolute_error

In [27]:
df = pd.read_csv('/home/junio/Desktop/Ju/mywaytohealth2020/jupyter/data/00.MARS_VISITES.csv',sep=',')

**Drop incomplete examns (less than 60 minutes)**

In [29]:
print(df.shape)
print(df.exploration_sommeil_type.value_counts())
df = df[df['exploration_sommeil_type'] != 1]
df.exploration_sommeil_type.value_counts()
print(df.shape)
df = df[(df['polysomnographie_time_st1']+df['polysomnographie_time_st2']+
         df['polysomnographie_time_st3_4']+df['polysomnographie_time_st5']) > 60]
print(df.shape)
df.exploration_sommeil_type.value_counts()

(3063, 59)
2.0    1099
1.0     155
3.0       1
Name: exploration_sommeil_type, dtype: int64
(2908, 59)
(1056, 59)


2.0    1054
Name: exploration_sommeil_type, dtype: int64

**Exclude some fields**

In [30]:
df = df[['patient_id','age','sexe','egfdrs001','alcool',
         'nycturie_nb','mrc','nyha','score_epworth','score_asthenie','score_depression','score_sjsr',
         'imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst','padiast','fc','polysomnographie_iah',
         'iah_class','tabagisme_nb_paquets_annee', 'excercice_physique_nb_min_semaine']]

In [31]:
df.shape

(1056, 23)

**Percentage of null values in each field**

In [32]:
df.isnull().sum()/len(df)*100

patient_id                            0.000000
age                                   0.189394
sexe                                  0.000000
egfdrs001                             0.000000
alcool                                0.473485
nycturie_nb                          45.549242
mrc                                   1.988636
nyha                                 21.685606
score_epworth                        12.310606
score_asthenie                       17.803030
score_depression                     18.087121
score_sjsr                           22.916667
imc                                   1.231061
perimetre_cervical                   31.439394
perimetre_abdominulll                32.765152
tour_de_hanches                      37.689394
pasyst                               13.257576
padiast                              13.257576
fc                                   26.325758
polysomnographie_iah                  0.189394
iah_class                             0.189394
tabagisme_nb_

NOTE: attributes tabagisme_nb_paquets_annee and excercice_physique_nb_min_semaine excluded because too many null values;
NOTE: attribute mrc excluded because it did not demonstrated importance on naive-bayes feature selection and neither on xgb feature importance.

In [9]:
#removing fields with over 50% missing values: excercice_physique_nb_min_semaine and tabagisme_nb_paquets_annee
df = df[['patient_id','age','sexe','egfdrs001','alcool','nycturie_nb','mrc','nyha','score_epworth','score_asthenie',
         'score_depression','score_sjsr','imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst',
         'padiast','fc','polysomnographie_iah','iah_class']]

**Detecting outliers:**
https://statisticsbyjim.com/basics/outliers/
https://statisticsbyjim.com/basics/remove-outliers/

In [10]:
if True:
    %matplotlib inline
    #note: excercice_physique_nb_min_semaine and imc are out of the plot, as their ranges distort axis y
    df.boxplot(figsize=(30,10),column=['age','sexe','egfdrs001','alcool',
             'nycturie_nb','mrc','nyha','score_epworth','score_asthenie','score_depression','score_sjsr',
             'perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst','padiast','fc','polysomnographie_iah'])

**Removal of spurious data**
(negative values for strict positive fields, values dozens of times bigger than the average, and so on)

In [11]:
#Discard the rows in which the field to be predicted is absent(null).
df = df[df['polysomnographie_iah'].notna()] #cannot work with null
print('After polysomnographie_iah',df.shape[0])
print('Original',df.shape[0])
#df = df[((df['excercice_physique_nb_min_semaine'] >= 0) & (df['excercice_physique_nb_min_semaine'] < 9000)) | (df['excercice_physique_nb_min_semaine'].isna())]
#print('After excercice_physique_nb_min_semaine',df.shape[0])
#df = df[(df['tabagisme_nb_paquets_annee'] >= 0) | (df['tabagisme_nb_paquets_annee'].isna())] # positive or null accepted
#print('After tabagisme_nb_paquets_annee',df.shape[0])
df = df[df['age'] >= 0] #no negative ages
print('After age',df.shape[0])
df = df[df['alcool'] != 0] #should be -1 ou 1
print('After alcool',df.shape[0])
df = df[((df['nycturie_nb'] >= 0) & (df['nycturie_nb'] < 30)) | (df['nycturie_nb'].isna())] #not discarding null values
print('After nycturie_nb',df.shape[0])
df = df[((df['imc'] >= 0)&(df['imc'] < 80)) | (df['nycturie_nb'].isna())] #not discarding null values
print('After imc',df.shape[0])
df = df[(df['tour_de_hanches'] > 40) | (df['tour_de_hanches'].isna())] #not discarding null values
print('After tour_de_hanches',df.shape[0])
df = df[((df['pasyst'] > 12)&(df['pasyst'] < 230)) | (df['pasyst'].isna())] #not discarding null values
print('After pasyst',df.shape[0])
df = df[(df['padiast'] >= 20) | (df['padiast'].isna())]
print('After padiast',df.shape[0])

After polysomnographie_iah 1054
Original 1054
After age 1052
After alcool 1052
After nycturie_nb 1049
After imc 1045
After tour_de_hanches 1045
After pasyst 1042
After padiast 1042


In [12]:
if True:
    df.boxplot(figsize=(30,10),column=['age','sexe','egfdrs001','alcool',
             'nycturie_nb','mrc','nyha','score_epworth','score_asthenie','score_depression','score_sjsr',
             'imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst','padiast','fc','polysomnographie_iah'])

In [13]:
df = df.astype({"iah_class": int})

In [14]:
df.to_csv('/home/junio/Desktop/Ju/mywaytohealth2020/jupyter/data/01.PATIENTS_VISITES-no_control_no_spurious_data.csv',sep=',')
#profile=pp.ProfileReport(df)
#profile.to_file("/home/junio/Desktop/Ju/mywaytohealth2020/jupyter/data/01.PATIENTS_VISITES-no_control_no_spurious_data-REPORT.html")

**Analysis: positive vs negative**

In [15]:
if True:
    df_positive = df[df['polysomnographie_iah'] >= 5]
    df_negative = df[df['polysomnographie_iah'] < 5]
    df_positive.boxplot(figsize=(30,10),column=['age','sexe','egfdrs001','alcool',
             'nycturie_nb','mrc','nyha','score_epworth','score_asthenie','score_depression','score_sjsr',
             'imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst','padiast','fc','polysomnographie_iah'])

    df_negative.boxplot(figsize=(30,10),column=['age','sexe','egfdrs001','alcool',
             'nycturie_nb','mrc','nyha','score_epworth','score_asthenie','score_depression','score_sjsr',
             'imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst','padiast','fc','polysomnographie_iah'])

**Substitute null values by the median (not the mean) of each column** (no improvements)

In [16]:
#all fields but patient_id_hashed
numerical_cols = ['age','sexe','egfdrs001','alcool','nycturie_nb','mrc','nyha','score_epworth','score_asthenie',
                  'score_depression','score_sjsr','imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches',
                  'pasyst','padiast','fc', 'tabagisme_nb_paquets_annee', 'excercice_physique_nb_min_semaine']

In [17]:
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

In [18]:
df.to_csv('/home/junio/Desktop/Ju/mywaytohealth2020/jupyter/data/01.PATIENTS_VISITES-no_control_no_spurious_data-median-imputation.csv',sep=',')

**Normalization**

In [22]:
#from sklearn.preprocessing import MinMaxScaler #too many outliers to use this one
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA
#all fields but patient_id_hashed
df[numerical_cols] = PowerTransformer().fit_transform(df[numerical_cols])

In [23]:
df.to_csv('/home/junio/Desktop/Ju/mywaytohealth2020/jupyter/data/01.PATIENTS_VISITES-no_control_no_spurious_data-median-imputation.csv',sep=',')

In [24]:
df=df.reset_index()

In [25]:
df.isnull().sum()/len(df)*100

index                                0.0
patient_id                           0.0
age                                  0.0
sexe                                 0.0
egfdrs001                            0.0
alcool                               0.0
nycturie_nb                          0.0
mrc                                  0.0
nyha                                 0.0
score_epworth                        0.0
score_asthenie                       0.0
score_depression                     0.0
score_sjsr                           0.0
imc                                  0.0
perimetre_cervical                   0.0
perimetre_abdominulll                0.0
tour_de_hanches                      0.0
pasyst                               0.0
padiast                              0.0
fc                                   0.0
polysomnographie_iah                 0.0
iah_class                            0.0
tabagisme_nb_paquets_annee           0.0
excercice_physique_nb_min_semaine    0.0
dtype: float64

**PCA visual analysis**

In [133]:
if True:
    #all fields but patient_id_hashed and polysomnographie_iah/iah_class
    numerical_cols_minus_iah = ['age','sexe','egfdrs001','alcool',
                                'nycturie_nb','mrc','nyha','score_epworth','score_asthenie','score_depression','score_sjsr',
                                'imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst','padiast','fc']

    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(df[numerical_cols_minus_iah])
    principalDf = pd.DataFrame(data = principalComponents, columns = ['comp1', 'comp2', 'comp3'])

    finalDf = pd.concat([principalDf, df[['iah_class']]], axis=1)
    fig = px.scatter_3d(finalDf, x='comp1', y='comp2', z='comp3',
                        color=(finalDf['iah_class']+1).astype(str),
                        symbol='iah_class',
                        size=(1/(finalDf['iah_class']+1)*2),
                        size_max=10,
                        opacity=0.6,
                        width=600, height=600,
                        title='All fields')
    fig.show()

In [134]:
if True:
    numerical_cols_minus_iah = ['padiast','pasyst','fc']
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(df[numerical_cols_minus_iah])
    principalDf = pd.DataFrame(data = principalComponents, columns = ['comp1', 'comp2', 'comp3'])

    principalDf=principalDf.reset_index()
    finalDf = pd.concat([principalDf, df[['iah_class']]], axis=1)
    fig = px.scatter_3d(finalDf, x='comp1', y='comp2', z='comp3',
                        color=(finalDf['iah_class']+1).astype(str),
                        symbol='iah_class',
                        size=(1/(finalDf['iah_class']+1)*2),
                        size_max=10,
                        opacity=0.6,
                        width=600, height=600,
                        title='Blood pressure fields')
    fig.show()

In [135]:
if True:
    numerical_cols_minus_iah = ['perimetre_cervical', 'perimetre_abdominulll', 'imc', 'fc']    
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(df[numerical_cols_minus_iah])
    principalDf = pd.DataFrame(data = principalComponents, columns = ['comp1', 'comp2', 'comp3'])

    principalDf=principalDf.reset_index()
    finalDf = pd.concat([principalDf, df[['iah_class']]], axis=1)
    fig = px.scatter_3d(finalDf, x='comp1', y='comp2', z='comp3',
                        color=(finalDf['iah_class']+1).astype(str),
                        symbol='iah_class',
                        size=(1/(finalDf['iah_class']+1)*2),
                        size_max=10,
                        opacity=0.6,
                        width=600, height=600,
                        title='Anatomic features')
    fig.show()

In [137]:
if True:
    numerical_cols_minus_iah = ['nyha','score_epworth','score_asthenie','score_depression','score_sjsr']  
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(df[numerical_cols_minus_iah])
    principalDf = pd.DataFrame(data = principalComponents, columns = ['comp1', 'comp2', 'comp3'])

    principalDf=principalDf.reset_index()
    finalDf = pd.concat([principalDf, df[['iah_class']]], axis=1)
    fig = px.scatter_3d(finalDf, x='comp1', y='comp2', z='comp3',
                        color=(finalDf['iah_class']+1).astype(str),
                        symbol='iah_class',
                        size=(1/(finalDf['iah_class']+1)*2),
                        size_max=10,
                        opacity=0.6,
                        width=600, height=600,
                        title='Scores features')
    fig.show()

In [138]:
if True:
    numerical_cols_minus_iah = ['age', 'egfdrs001', 'nycturie_nb', 'score_depression', 'imc', 'perimetre_cervical', 'tour_de_hanches', 'padiast']  
   
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(df[numerical_cols_minus_iah])  
    principalDf = pd.DataFrame(data = principalComponents, columns = ['comp1', 'comp2'])
    finalDf = pd.concat([principalDf, df[['iah_class']]], axis=1)
    fig = px.scatter(finalDf, x='comp1', y='comp2',
                        color=(finalDf['iah_class']+1).astype(str),
                        symbol='iah_class',
                        size=(1/(finalDf['iah_class']+3)*2),
                        size_max=10,
                        opacity=0.6,
                        width=300, height=300,
                        title='Final selection of features (Bayes)')
    fig.show()

In [139]:
if True:
    fig = px.parallel_coordinates(df[numerical_cols_minus_iah+['iah_class']], color="iah_class", labels={"patient_id": "patient_id" },
                    color_continuous_scale=px.colors.diverging.Tealrose, color_continuous_midpoint=2)
    fig.show()

In [140]:
if True:
    numerical_cols_minus_iah = ['pasyst','padiast','fc']
    fig = px.parallel_coordinates(df[numerical_cols_minus_iah+['iah_class']], color="iah_class", labels={"patient_id": "patient_id" },
                    color_continuous_scale=px.colors.diverging.Tealrose, color_continuous_midpoint=2)
    fig.show()

**Oversampling using ADASYN**

In [142]:
#all fields but patient_id_hashed
numerical_cols = ['age','sexe','egfdrs001','alcool',
                  'nycturie_nb','mrc','nyha','score_epworth','score_asthenie','score_depression','score_sjsr',
                  'imc','perimetre_cervical','perimetre_abdominulll','tour_de_hanches','pasyst','padiast','fc','polysomnographie_iah'
                 ,'tabagisme_nb_paquets_annee', 'excercice_physique_nb_min_semaine']

In [143]:
from imblearn.over_sampling import ADASYN
y = df[['iah_class']]
y = y.astype({"iah_class": int})
x = df[numerical_cols]
#print(y.iah_class.value_counts())
x_resampled, y_resampled = ADASYN().fit_resample(x, y)
df_resampled = pd.concat([x_resampled, y_resampled], axis=1)

In [144]:
df_resampled.to_csv('/home/junio/Desktop/Ju/mywaytohealth2020/jupyter/data/01.PATIENTS_VISITES-no_control_no_spurious_data-median-imputation_ADASYN.csv',sep=',')

In [145]:
print(str(df_resampled.shape[0]/df.shape[0]*100)+'%','data augmentation with ADASYN')

218.80998080614202% data augmentation with ADASYN


In [146]:
df_resampled.isnull().sum()/len(df)*100

age                                  0.0
sexe                                 0.0
egfdrs001                            0.0
alcool                               0.0
nycturie_nb                          0.0
mrc                                  0.0
nyha                                 0.0
score_epworth                        0.0
score_asthenie                       0.0
score_depression                     0.0
score_sjsr                           0.0
imc                                  0.0
perimetre_cervical                   0.0
perimetre_abdominulll                0.0
tour_de_hanches                      0.0
pasyst                               0.0
padiast                              0.0
fc                                   0.0
polysomnographie_iah                 0.0
tabagisme_nb_paquets_annee           0.0
excercice_physique_nb_min_semaine    0.0
iah_class                            0.0
dtype: float64

**Visualization of oversampled data**

In [148]:
if True:
    numerical_cols_minus_iah = ['age', 'egfdrs001', 'nycturie_nb', 'score_depression', 'imc', 'perimetre_cervical', 'tour_de_hanches', 'padiast']  
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(df_resampled[numerical_cols_minus_iah])
    principalDf = pd.DataFrame(data = principalComponents, columns = ['comp1', 'comp2', 'comp3'])

    principalDf=principalDf.reset_index()
    finalDf = pd.concat([principalDf, df_resampled[['iah_class']]], axis=1)
    fig = px.scatter_3d(finalDf, x='comp1', y='comp2', z='comp3',
                        color=(finalDf['iah_class']+1).astype(str),
                        symbol='iah_class',
                        size=(1/(finalDf['iah_class']+1)*2),
                        size_max=10,
                        opacity=0.6,
                        width=600, height=600,
                        title='Final selection of features (Bayes) with ADASYN')
    fig.show()

In [150]:
if True:
    numerical_cols_minus_iah = ['age', 'egfdrs001', 'nycturie_nb', 'score_depression', 'imc', 'perimetre_cervical', 'tour_de_hanches', 'padiast']  
   
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(df_resampled[numerical_cols_minus_iah])  
    principalDf = pd.DataFrame(data = principalComponents, columns = ['comp1', 'comp2'])
    finalDf = pd.concat([principalDf, df_resampled[['iah_class']]], axis=1)
    fig = px.scatter(finalDf, x='comp1', y='comp2',
                        color=(finalDf['iah_class']+1).astype(str),
                        symbol='iah_class',
                        size=(1/(finalDf['iah_class']+2)*2),
                        size_max=10,
                        opacity=0.6,
                        width=300, height=300,
                        title='Final selection of features (Bayes)')
    fig.show()