In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/conversion-predictors-of-cis-to-multiple-sclerosis/conversion_predictors_of_clinically_isolated_syndrome_to_multiple_sclerosis.csv


In [2]:
fpath = r"/kaggle/input/conversion-predictors-of-cis-to-multiple-sclerosis/conversion_predictors_of_clinically_isolated_syndrome_to_multiple_sclerosis.csv"
df = pd.read_csv(fpath)
df

Unnamed: 0.1,Unnamed: 0,Gender,Age,Schooling,Breastfeeding,Varicella,Initial_Symptom,Mono_or_Polysymptomatic,Oligoclonal_Bands,LLSSEP,ULSSEP,VEP,BAEP,Periventricular_MRI,Cortical_MRI,Infratentorial_MRI,Spinal_Cord_MRI,Initial_EDSS,Final_EDSS,group
0,0,1,34,20.0,1,1,2.0,1,0,1,1,0,0,0,1,0,1,1.0,1.0,1
1,1,1,61,25.0,3,2,10.0,2,1,1,0,1,0,0,0,0,1,2.0,2.0,1
2,2,1,22,20.0,3,1,3.0,1,1,0,0,0,0,0,1,0,0,1.0,1.0,1
3,3,2,41,15.0,1,1,7.0,2,1,0,1,1,0,1,1,0,0,1.0,1.0,1
4,4,2,34,20.0,2,1,6.0,2,0,1,0,0,0,1,0,0,0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,268,2,31,8.0,3,1,9.0,2,0,0,0,0,0,0,0,0,0,,,2
269,269,1,21,15.0,3,3,5.0,2,1,0,0,0,0,0,0,0,1,,,2
270,270,2,19,12.0,3,3,13.0,2,0,1,1,1,0,0,0,0,1,,,2
271,271,2,32,15.0,3,3,15.0,2,1,1,1,1,0,1,1,1,0,,,2


Remove index, Initial_EDSS, and Final_EDSS
Initial and Final EDSS has N/A values for all of group 2, while group 1 has values for both these columns. 
This will inadvertantly bias our data.

In [3]:
df.pop("Unnamed: 0")

0        0
1        1
2        2
3        3
4        4
      ... 
268    268
269    269
270    270
271    271
272    272
Name: Unnamed: 0, Length: 273, dtype: int64

In [4]:
df.pop("Initial_EDSS")
df.pop("Final_EDSS")

0      1.0
1      2.0
2      1.0
3      1.0
4      1.0
      ... 
268    NaN
269    NaN
270    NaN
271    NaN
272    NaN
Name: Final_EDSS, Length: 273, dtype: float64

Two rows are missing data, will just drop them as this dataset doesn't appear to missing much data.

In [5]:
df = df.dropna()

One Hot Encoding Categorical Data
This data is already numeric, but I don't want to accidentally introduce ordinality to these values as 3 is listed as unknown.

In [6]:
cat_cols = ["Gender","Breastfeeding","Varicella","Initial_Symptom","Mono_or_Polysymptomatic","Oligoclonal_Bands"]

In [7]:
df = pd.get_dummies(df, columns=cat_cols,dtype=int)

In [8]:
y = df.pop("group")
X = df

The populations are fairly close in size, if they were different we may consider upsampling.

In [9]:
y.value_counts()

2    146
1    125
Name: group, dtype: int64

In [10]:
X.columns

Index(['Age', 'Schooling', 'LLSSEP', 'ULSSEP', 'VEP', 'BAEP',
       'Periventricular_MRI', 'Cortical_MRI', 'Infratentorial_MRI',
       'Spinal_Cord_MRI', 'Gender_1', 'Gender_2', 'Breastfeeding_1',
       'Breastfeeding_2', 'Breastfeeding_3', 'Varicella_1', 'Varicella_2',
       'Varicella_3', 'Initial_Symptom_1.0', 'Initial_Symptom_2.0',
       'Initial_Symptom_3.0', 'Initial_Symptom_4.0', 'Initial_Symptom_5.0',
       'Initial_Symptom_6.0', 'Initial_Symptom_7.0', 'Initial_Symptom_8.0',
       'Initial_Symptom_9.0', 'Initial_Symptom_10.0', 'Initial_Symptom_11.0',
       'Initial_Symptom_12.0', 'Initial_Symptom_13.0', 'Initial_Symptom_14.0',
       'Initial_Symptom_15.0', 'Mono_or_Polysymptomatic_1',
       'Mono_or_Polysymptomatic_2', 'Mono_or_Polysymptomatic_3',
       'Oligoclonal_Bands_0', 'Oligoclonal_Bands_1', 'Oligoclonal_Bands_2'],
      dtype='object')

In [11]:
X, X_test, y, y_test = train_test_split(X, y, random_state=13)

Initial attempt with RandomForestClassifier, default params

In [12]:
regr = RandomForestClassifier(random_state=13)
regr.fit(X, y)
score = cross_val_score(regr, X, y, cv=10)
score

array([0.71428571, 0.76190476, 0.80952381, 0.9       , 0.8       ,
       0.75      , 0.85      , 0.75      , 0.75      , 0.85      ])

Large variation in results from cross validation, unsure on why.

Trying some other parameters to see if performance increase can be found.

In [13]:
parameters={'min_samples_split':[2,4,8,16],"criterion":["gini", "entropy", 'log_loss']}

In [14]:
clf = GridSearchCV(regr, parameters)
clf.fit(X, y)

In [15]:
clf.cv_results_["mean_test_score"]

array([0.80231707, 0.78304878, 0.78780488, 0.76317073, 0.78756098,
       0.78768293, 0.78268293, 0.77292683, 0.78756098, 0.78768293,
       0.78268293, 0.77292683])

In [16]:
parameters={"n_estimators":[100,200,400]}

In [17]:
clf = GridSearchCV(regr, parameters)
clf.fit(X, y)

In [18]:
clf.cv_results_["mean_test_score"]

array([0.80231707, 0.76804878, 0.78768293])

In [19]:
y_pred = regr.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       0.77      0.86      0.81        28
           2       0.89      0.82      0.86        40

    accuracy                           0.84        68
   macro avg       0.83      0.84      0.84        68
weighted avg       0.84      0.84      0.84        68

