In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/msbd5001-spring-2022/sample_submission.csv
/kaggle/input/msbd5001-spring-2022/train.csv
/kaggle/input/msbd5001-spring-2022/test.csv


In [2]:
## Data

# Read the training and test tables; the first CSV column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/kaggle/input/msbd5001-spring-2022/train.csv",
        "/kaggle/input/msbd5001-spring-2022/test.csv",
    )
)

# Show the training frame as the cell output.
train

Unnamed: 0_level_0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3556.0,2489.0,265.19,77.53,176.55,0.00,4.20,307.91,52,0,7515.0,1
1,1906.0,134.0,1442.61,551.90,876.07,112.10,168.15,1735.48,20,1,1756.0,0
2,1586.0,71.0,1332.74,684.20,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,683.0,94.0,419.23,255.80,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,1032.0,71.0,1102.72,480.27,625.30,188.78,130.77,1427.97,28,1,1542.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
82,626.0,68.0,1771.57,666.99,1117.48,360.21,118.84,2306.82,42,1,1521.0,0
83,1237.0,71.0,1348.53,428.09,924.69,120.02,48.67,1524.78,56,0,1345.0,0
84,634.0,1002.0,1300.00,558.00,724.00,67.00,105.00,1484.26,34,0,2926.0,1
85,112.0,884.0,942.83,378.49,567.06,116.77,31.81,1104.59,33,1,2352.0,1


In [3]:
## Data Preprocessing
# Discard every training row that contains a missing value
# (per the original note, this affects a single row).
train = train.dropna()

In [4]:
# Separate the target column from the predictor columns.
y = train["label"]
X = train.drop(columns="label")

In [5]:
## K-fold Cross Validation

from sklearn.model_selection import KFold

# Number of folds; the model cells divide the summed fold accuracies by k.
k = 5

# Shuffled splitter with a fixed seed, so the split is reproducible.
kf = KFold(n_splits=k, shuffle=True, random_state=5001)

# Materialise the split once as [train_positions, test_positions] pairs so that
# every model below is evaluated on exactly the same folds.
cv_folds = [[train_index, test_index] for train_index, test_index in kf.split(X)]

In [6]:
## XGBoost

# Preprocessing pipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import xgboost as xgb

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# NOTE(review): `sparse=False` is kept exactly as written; newer scikit-learn
# releases spell this option `sparse_output` - confirm against the installed version.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")),
           ("scale", StandardScaler())]
)

cat_cols = ["Sex 0M1F"]
num_cols = [col for col in X.columns if col not in cat_cols]

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

# Fit the preprocessor on the complete training set. `full_processor` and
# `X_processed` are what the XGBoost submission cell trains and predicts with;
# they are deliberately NOT used inside the cross-validation loop below.
X_processed = full_processor.fit_transform(X)

# XGBoost model
params = {'objective': 'binary:logistic',
    'colsample_bytree': 0.5,
     'gamma': 1,
     'learning_rate': 0.1,
     'max_depth': 3,
     'reg_lambda': 10,
     'scale_pos_weight': 1,
     'subsample': 0.8}

cv_acc = 0
cv_probs_xgb = []  # per-fold positive-class probabilities, reused by the ensemble cell
for train_index, test_index in cv_folds:
    # Fit a fresh, unfitted copy of the preprocessor on the training part of the
    # fold only, so imputation/scaling statistics never see the held-out rows.
    fold_processor = clone(full_processor)
    X_train = fold_processor.fit_transform(X.iloc[train_index])
    X_test = fold_processor.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    xgb_cl = xgb.XGBClassifier(use_label_encoder=False, verbosity=0, random_state=5001, **params)

    xgb_cl.fit(X_train, y_train)
    y_pred = xgb_cl.predict(X_test)
    y_pred_proba = xgb_cl.predict_proba(X_test)
    cv_probs_xgb.append(y_pred_proba[:,1])
    cv_acc += accuracy_score(y_test, y_pred)

# Average the per-fold accuracies.
cv_acc = cv_acc/k
print("CV accuracy:", cv_acc)

CV accuracy: 0.8705882352941178


In [7]:
# y_pred_proba

In [8]:
# y_pred_proba[:,1]

In [9]:
## Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Per-fold accuracies and positive-class probabilities for the random forest,
# evaluated on the same folds as the other models.
rf_fold_scores = []
cv_probs_rf = []
for train_index, test_index in cv_folds:
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    clf = RandomForestClassifier(max_depth=3, random_state=5001)
    clf.fit(X_train, y_train)

    # Keep the probability of class 1 for the ensemble cell, score the hard labels.
    cv_probs_rf.append(clf.predict_proba(X_test)[:, 1])
    rf_fold_scores.append(accuracy_score(y_test, clf.predict(X_test)))

# Mean accuracy over the k folds.
cv_acc = sum(rf_fold_scores) / k
print("CV accuracy:", cv_acc)

CV accuracy: 0.9058823529411765


In [10]:
## CatBoost

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Positional index of the categorical column in X ("Sex 0M1F" is column 9).
cat_features = [9]

# Per-fold accuracies and positive-class probabilities for CatBoost,
# evaluated on the same folds as the other models.
cb_fold_scores = []
cv_probs_cb = []
for train_index, test_index in cv_folds:
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    cb_clf = CatBoostClassifier(
        iterations=15,
        learning_rate=0.25,
        depth=5,
        verbose=0,
        random_seed=5001,
    )
    cb_clf.fit(X_train, y_train, cat_features)

    # Keep the probability of class 1 for the ensemble cell, score the hard labels.
    cv_probs_cb.append(cb_clf.predict_proba(X_test)[:, 1])
    cb_fold_scores.append(accuracy_score(y_test, cb_clf.predict(X_test)))

# Mean accuracy over the k folds.
cv_acc = sum(cb_fold_scores) / k
print("CV accuracy:", cv_acc)

CV accuracy: 0.8823529411764707


In [11]:
## Ensemble: cross-validated accuracy of the averaged XGBoost / Random Forest / CatBoost probabilities

cv_acc = 0
for fold_num, fold in enumerate(cv_folds):
    # Only the held-out positions are needed here; the models were already
    # fitted in the cells above, so no training indices are read.
    test_index = fold[1]
    y_test = y.iloc[test_index]

    # Unweighted mean of the three models' positive-class probabilities for this fold.
    y_pred_avg = (cv_probs_xgb[fold_num]+cv_probs_rf[fold_num]+cv_probs_cb[fold_num])/3
    # Predict class 1 when the averaged probability reaches 0.5.
    cv_acc += accuracy_score(y_test, y_pred_avg>=0.50)

# Average over the folds actually iterated.
cv_acc = cv_acc/len(cv_folds)
print("CV accuracy:", cv_acc)
    

CV accuracy: 0.8823529411764707


In [12]:
## Submission

# Random Forest

from sklearn.ensemble import RandomForestClassifier

# Refit on all training rows with the same settings used in cross-validation.
clf = RandomForestClassifier(max_depth=3, random_state=5001)
clf.fit(X, y)

# Class probabilities are kept for the averaged submission; hard labels go to the CSV.
pred_prob_rf = clf.predict_proba(test)
preds = clf.predict(test)

# Use the sample submission as a template and overwrite its label column.
result = pd.read_csv(
    "/kaggle/input/msbd5001-spring-2022/sample_submission.csv", index_col=0
).assign(label=preds)
result.to_csv("submission_rf.csv")
result.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [13]:
## Submission

# XGBoost

xgb_cl = xgb.XGBClassifier(use_label_encoder=False, verbosity=0, random_state=5001, **params)

# Fit on the full preprocessed training set
xgb_cl.fit(X_processed, y)

# Predict
# Apply the preprocessing that was fitted on the training data. Calling
# fit_transform here would re-estimate the imputation/scaling/encoding from the
# test set, giving features inconsistent with those the model was trained on.
test_processed = full_processor.transform(test)
preds = xgb_cl.predict(test_processed)
pred_prob_xgb = xgb_cl.predict_proba(test_processed)  # kept for the averaged submission

# Use the sample submission as a template and overwrite its label column.
result = pd.read_csv("/kaggle/input/msbd5001-spring-2022/sample_submission.csv", index_col=0)
result['label'] = preds
result.to_csv("submission_xgb.csv")
result.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [14]:
## Submission

# CatBoost

# Refit on all training rows with the same settings used in cross-validation.
cb_clf = CatBoostClassifier(
    iterations=15,
    learning_rate=0.25,
    depth=5,
    verbose=0,
    random_seed=5001,
)
cb_clf.fit(X, y, cat_features)

# Class probabilities are kept for the averaged submission; hard labels go to the CSV.
pred_prob_cb = cb_clf.predict_proba(test)
preds = cb_clf.predict(test)

# Use the sample submission as a template and overwrite its label column.
result = pd.read_csv(
    "/kaggle/input/msbd5001-spring-2022/sample_submission.csv", index_col=0
).assign(label=preds)
result.to_csv("submission_cb.csv")
result.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [15]:
# Averaged submission: unweighted mean of the three models' class-1 probabilities.
avg_pred_prob = (pred_prob_rf[:,1] + pred_prob_xgb[:,1] + pred_prob_cb[:,1])/3
# Predict class 1 when the averaged probability reaches 0.5.
avg_preds = avg_pred_prob >= 0.5

# Use the sample submission as a template; store the boolean votes as 0/1 integers.
result = pd.read_csv("/kaggle/input/msbd5001-spring-2022/sample_submission.csv", index_col=0)
result['label'] = avg_preds.astype(int)
result.to_csv("submission_avg.csv")
result.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
