#Import library & dataset

In [42]:
#Import all required library 
import pandas as pd
import numpy as np
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

In [21]:
#Import the dataset
#You can change the path of the testing dataset 
path = "data/train.csv"
df_train = pd.read_csv(path,header = 0)
path = "data/test.csv"
df_test = pd.read_csv(path, header = 0)

In [22]:
print(f'Shape of training dataset:{df_train.shape}')
print(f'Shape of testing dataset:{df_test.shape}')

Shape of training dataset:(87, 13)
Shape of testing dataset:(59, 12)


#Data exploration

In [23]:
#Check the summary of the data 
df_train.describe()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
count,87.0,86.0,86.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,86.0,87.0
mean,43.0,1264.244186,290.383721,982.570115,479.34092,494.904023,212.732874,118.78092,1325.096437,40.218391,0.482759,2066.534884,0.333333
std,25.258662,765.452376,490.283499,617.332545,344.326452,311.836604,173.553264,96.218344,791.602538,10.461919,0.502599,1198.401364,0.474137
min,0.0,112.0,30.0,74.4,36.61,39.59,0.0,4.2,209.25,19.0,0.0,72.0,0.0
25%,21.5,685.5,77.5,549.39,237.92,272.745,78.815,52.425,780.615,33.0,0.0,1461.25,0.0
50%,43.0,1108.5,124.5,871.71,423.27,459.72,188.78,89.79,1179.27,41.0,0.0,1757.5,0.0
75%,64.5,1602.25,244.5,1268.085,624.45,624.36,262.845,155.45,1617.725,49.5,1.0,2238.25,1.0
max,86.0,4145.0,3124.0,3791.23,2548.1,1517.81,878.04,485.86,4757.28,60.0,1.0,7515.0,1.0


In [24]:
#Check the first 5 rows of data 
df_train.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.8,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.3,188.78,130.77,1427.97,28,1,1542.0,0


In [25]:
#Check the Label distribution 
df_train['label'].value_counts(normalize = True)

0    0.666667
1    0.333333
Name: label, dtype: float64

In [26]:
#Check the correlation of the attribute with the label before data preprocessing 
df_train.corr()['label']

id                         -0.008738
MO HLADR+ MFI (cells/ul)   -0.279773
Neu CD64+MFI (cells/ul)     0.534729
CD3+T (cells/ul)           -0.421029
CD8+T (cells/ul)           -0.375862
CD4+T (cells/ul)           -0.384415
NK (cells/ul)              -0.466103
CD19+ (cells/ul)           -0.369889
CD45+ (cells/ul)           -0.478815
Age                         0.041413
Sex 0M1F                    0.048795
Mono CD64+MFI (cells/ul)    0.380527
label                       1.000000
Name: label, dtype: float64

#Data preprocessing for training dataset 

In [27]:
#Replace the NaN value with mean 
df_train.fillna(df_train.mean(), inplace=True)

In [28]:
#Separate the patient age into different bin
df_train['Age'] = pd.cut(df_train['Age'],bins=[10,20,30,40,50,60,1000],\
                                       labels=['Age: 10-20', 'Age: 20-30', 'Age: 30-40', 'Age: 40-50', 'Age: 50-60', 'Age: 60 - Above'])
lbl = preprocessing.LabelEncoder()
df_train['Age'] = lbl.fit_transform(df_train['Age'].astype(str))

In [29]:
#Separate data into X & Y 
X_train = df_train.iloc[:,1:-1]
Y_train = df_train['label']

In [30]:
#Resampling with the SMOTE
X_train, Y_train = SMOTE(random_state=42).fit_resample(X_train, Y_train)

#Data preprocessing for testing dataset

In [31]:
#Replace the NaN value with mean 
df_test.fillna(df_test.mean(), inplace=True)

In [32]:
#Separate the patient age into different bin
df_test['Age'] = pd.cut(df_test['Age'],bins=[10,20,30,40,50,60,1000],\
                                       labels=['Age: 10-20', 'Age: 20-30', 'Age: 30-40', 'Age: 40-50', 'Age: 50-60', 'Age: 60 - Above'])
lbl = preprocessing.LabelEncoder()
df_test['Age'] = lbl.fit_transform(df_test['Age'].astype(str))

In [33]:
#Drop the ID column 
df_test = df_test.iloc[:,1:]

#Model Training: Gride search for the Randome forest 

In [34]:
# Create the param grid
param = [{'n_estimators': range(10,150,20),
               'max_features' : ["auto","log2"],
               'max_depth': range(3,12,2),
               'min_samples_split': [2,3,4,5],
               'min_samples_leaf': [1, 2,3]}]
print(param)

[{'n_estimators': range(10, 150, 20), 'max_features': ['auto', 'log2'], 'max_depth': range(3, 12, 2), 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3]}]


In [35]:
#Define the model
model = GridSearchCV(RandomForestClassifier(), param, cv=5,
                   scoring='accuracy',n_jobs=-1)

In [36]:
#Start the gride search and print the best parameter and the best score 
model.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid=[{'max_depth': range(3, 12, 2),
                          'max_features': ['auto', 'log2'],
                          'min_samples_leaf': [1, 2, 3],
                          'min_samples_split': [2, 3, 4, 5],
                          'n_estimators': range(10, 150, 20)}],
             scoring='accuracy')

In [37]:
#Print the best paramter and the best score 
print("best parameters: ", model.best_params_)
print("best score:", model.best_score_ )

best parameters:  {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 50}
best score: 0.9228260869565219


In [38]:
#With reference to the best set of parameter to train the model 
model = RandomForestClassifier(n_estimators = 50, max_features = 'auto', max_depth = 9, min_samples_leaf = 2, min_samples_split = 4)
model.fit(X_train, Y_train)

RandomForestClassifier(max_depth=9, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=50)

#Save the model

In [43]:
#Save the model
#You can declare your own path 
path = 'Final_Model.joblib'
dump(model, path) 

['Final_Model.joblib']

#Generate the prediction 

In [39]:
#Generate the submission result 
predictions = model.predict(df_test)

In [40]:
#Output to CSV with header of ID & Label 
#Your can declare your own path 
path = "submission.csv"
x_np = predictions.astype(int)
df = pd.DataFrame(x_np)
df.rename_axis('id', inplace=True)
df.to_csv(path,index = True, header=['label'])