**Author:** Simon Richard
Random Forest model using CGAS and Bio-Impedance scores to predict PCIAT and (by extension) SII.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [4]:
cols = list(filter(lambda x: x.startswith('BIA') or x.startswith('CGAS'), df.columns))
cols

['CGAS-Season',
 'CGAS-CGAS_Score',
 'BIA-Season',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_BMC',
 'BIA-BIA_BMI',
 'BIA-BIA_BMR',
 'BIA-BIA_DEE',
 'BIA-BIA_ECW',
 'BIA-BIA_FFM',
 'BIA-BIA_FFMI',
 'BIA-BIA_FMI',
 'BIA-BIA_Fat',
 'BIA-BIA_Frame_num',
 'BIA-BIA_ICW',
 'BIA-BIA_LDM',
 'BIA-BIA_LST',
 'BIA-BIA_SMM',
 'BIA-BIA_TBW']

In [8]:
# Check data types
dtypes = {col: df[col].dtype for col in cols}
dtypes

{'CGAS-Season': dtype('O'),
 'CGAS-CGAS_Score': dtype('float64'),
 'BIA-Season': dtype('O'),
 'BIA-BIA_Activity_Level_num': dtype('float64'),
 'BIA-BIA_BMC': dtype('float64'),
 'BIA-BIA_BMI': dtype('float64'),
 'BIA-BIA_BMR': dtype('float64'),
 'BIA-BIA_DEE': dtype('float64'),
 'BIA-BIA_ECW': dtype('float64'),
 'BIA-BIA_FFM': dtype('float64'),
 'BIA-BIA_FFMI': dtype('float64'),
 'BIA-BIA_FMI': dtype('float64'),
 'BIA-BIA_Fat': dtype('float64'),
 'BIA-BIA_Frame_num': dtype('float64'),
 'BIA-BIA_ICW': dtype('float64'),
 'BIA-BIA_LDM': dtype('float64'),
 'BIA-BIA_LST': dtype('float64'),
 'BIA-BIA_SMM': dtype('float64'),
 'BIA-BIA_TBW': dtype('float64')}

In [None]:
seasons = df[filter(lambda col: dtypes[col] == np.dtype('O'), cols)]
seasons

Unnamed: 0,CGAS-Season,BIA-Season
0,Winter,Fall
1,,Winter
2,Fall,
3,Fall,Summer
4,Summer,
...,...,...
3955,Spring,Fall
3956,,Spring
3957,Spring,Winter
3958,Spring,Summer


In [19]:
X = df[filter(lambda col: dtypes[col] == np.dtype('float64'), cols)]
X

Unnamed: 0,CGAS-CGAS_Score,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW
0,51.0,2.0,2.66855,16.8792,932.498,1492.00,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909
1,,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.97400,39.4497,15.4107,27.0552
2,71.0,,,,,,,,,,,,,,,,
3,71.0,3.0,3.84191,18.2943,1131.430,1923.44,15.59250,62.7757,14.0740,4.22033,18.82430,2.0,30.4041,16.77900,58.9338,26.4798,45.9966
4,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,60.0,3.0,4.52277,16.3642,1206.880,2051.70,19.46110,70.8117,14.0629,2.30138,11.58830,1.0,33.3709,17.97970,66.2889,29.7790,52.8320
3956,,,,,,,,,,,,,,,,,
3957,68.0,2.0,4.41305,21.4438,1253.740,2005.99,20.48250,75.8033,14.8043,6.63952,33.99670,2.0,33.9805,21.34030,71.3903,28.7792,54.4630
3958,70.0,4.0,6.66168,12.2372,1414.340,2970.12,26.53230,92.9092,13.0684,-0.83117,-5.90917,2.0,41.3715,25.00540,86.2475,45.4340,67.9038


In [46]:
y_cols = list(filter(lambda x: x.startswith('PCIAT') and not (x.endswith('Season') or x.endswith('Total')), df.columns))
y_cols

['PCIAT-PCIAT_01',
 'PCIAT-PCIAT_02',
 'PCIAT-PCIAT_03',
 'PCIAT-PCIAT_04',
 'PCIAT-PCIAT_05',
 'PCIAT-PCIAT_06',
 'PCIAT-PCIAT_07',
 'PCIAT-PCIAT_08',
 'PCIAT-PCIAT_09',
 'PCIAT-PCIAT_10',
 'PCIAT-PCIAT_11',
 'PCIAT-PCIAT_12',
 'PCIAT-PCIAT_13',
 'PCIAT-PCIAT_14',
 'PCIAT-PCIAT_15',
 'PCIAT-PCIAT_16',
 'PCIAT-PCIAT_17',
 'PCIAT-PCIAT_18',
 'PCIAT-PCIAT_19',
 'PCIAT-PCIAT_20']

In [47]:
Y = df[y_cols]
Y

Unnamed: 0,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20
0,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0
3,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0
4,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0
3956,,,,,,,,,,,,,,,,,,,,
3957,5.0,5.0,3.0,0.0,5.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0
3958,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0


In [48]:
maskX = ~X.isna().any(axis=1)
maskX

0        True
1       False
2       False
3        True
4       False
        ...  
3955     True
3956    False
3957     True
3958     True
3959    False
Length: 3960, dtype: bool

In [49]:
maskY = ~Y.isna().any(axis=1)
maskY

0        True
1        True
2        True
3        True
4       False
        ...  
3955     True
3956    False
3957     True
3958     True
3959    False
Length: 3960, dtype: bool

In [50]:
mask = maskX & maskY
mask.sum()

np.int64(1549)

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X[mask], Y[mask], test_size=0.2, random_state=0)

rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, Y_train)
rfc

In [67]:
from sklearn.metrics import cohen_kappa_score

def sii(y):
    """
    0-30=None; 31-49=Mild; 50-79=Moderate; 80-100=Severe
    """
    # y = y[y_cols]
    return np.digitize(y.sum(axis=1), bins=[30, 50, 80], right=True)

def compare_sii(y1, y2):
    return cohen_kappa_score(sii(y1), sii(y2), weights='quadratic')

In [70]:
Y_pred = rfc.predict(X_test)
compare_sii(Y_test, Y_pred)

np.float64(0.16890080428954435)