In [1]:
##### general libraries import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

import warnings 
warnings.filterwarnings("ignore")

from tqdm import tqdm

import umap

##### scikit learn import
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from xgboost import XGBClassifier

data_version = "240319"
output_version = "focus_v17_20240425"

outdir = "/media/hieunguyen/HNSD_mini/data/outdir"
PROJECT = "UMP_oral_cancer"

path_to_main_input = "/media/hieunguyen/HNSD_mini/data/UMP_Oral_cancer/input"
path_to_main_output = os.path.join(outdir, PROJECT, output_version)

cluster_score = pd.read_csv("/media/hieunguyen/HNSD_mini/data/UMP_Oral_cancer/input/240319/cluster_score.csv", sep = ";")
code_version = "v17"
path_to_01_output = os.path.join(path_to_main_output, "01_output", data_version, code_version)
path_to_04_output = os.path.join(path_to_main_output, "04_output", data_version, code_version)
path_to_05_output = os.path.join(path_to_main_output, "05_output", data_version, code_version)
path_to_06_output = os.path.join(path_to_main_output, "06_output", data_version, code_version)
path_to_07_output = os.path.join(path_to_main_output, "07_output", data_version, code_version)
os.system("mkdir -p {}".format(path_to_07_output))

all_cluster_labels = [ 'RNA.consensus.cluster', 
                      'kmean.cluster',
                      'kmean.2clusters.DrNam', 
                      'kmean.3clusters.DrNam', 
                      'merged.cluster12',
                      'merged.cluster13', 
                      'merged.cluster23']
all_cv_scores = dict()
all_best_params = dict()
all_models = dict()


sample_orders = ['230215_143', '230215_32', '230215_33', '230215_34', '230720_11',
       '230720_15', '230720_2', '230720_3', '230720_46', '230720_6',
       '230720_7', '230825_145', '230825_146', '230825_147', '230825_148',
       '230825_150', '230831_153', '230831_154', '230831_156',
       '230831_157', '230831_235', '230831_238', '230831_25', '230914_48',
       '230914_51', '230914_52', '230914_69', '230914_72', '230914_9',
       '230921_35', '230921_41', '230921_54', '230921_68', '230921_70',
       '230922_158', '230922_159', '230922_163', '230922_178',
       '230922_179', '230922_60', '231003_152', '231003_160',
       '231003_162', '231003_167', '231003_56', '231003_71', '231003_74',
       '231130_186', '231130_187', '231130_189', '231130_190',
       '231130_193', '231130_198', '231130_200', '231213_166',
       '231213_169', '231213_170', '231213_172', '231213_174',
       '231213_176', '231213_207', '231219_161', '231219_180',
       '231219_183', '231219_203', '231219_205', '231219_206',
       '231219_212', '231219_214', '231220_177', '231220_181',
       '231220_192', '231220_195', '231220_204', '231220_213',
       '231220_223', '231228_209', '231228_210', '231228_211',
       '231228_216', '231228_218', '231228_224', '240110_225',
       '240110_228', '240110_230', '240110_234', '240110_256',
       '240110_258', '240110_265', '240202_217', '240202_219',
       '240202_229', '240202_240', '240202_245', '240202_254',
       '240202_259', '240202_260', '240202_262', '240202_264',
       '240202_271', '240202_368']

for selected_cluster_label in tqdm(all_cluster_labels):
    umapdf = pd.read_csv(os.path.join(path_to_01_output, "umap_RNAseq.csv"), index_col = [0])
    featuredf = pd.read_csv(os.path.join(path_to_06_output, "featuredf.final.csv"), index_col = [0]).set_index("SampleID")
    featuredf = featuredf.loc[sample_orders, ].reset_index()
    # featuredf = pd.read_csv("/media/hieunguyen/HNSD_mini/data/outdir/UMP_Oral_cancer/output_20240411/02_output/240319/v17/featuredf.csv")
    featuredf = featuredf.merge(umapdf, right_on = "SampleID", left_on = "SampleID").drop(["V1", "V2"], axis = 1)
    
    selected_features = [item for item in featuredf.columns if "feature" in item]
    
    X = featuredf[selected_features].to_numpy()
    y = featuredf[selected_cluster_label].to_numpy()
    y = [item-1 for item in y]
                        
    if len(set(y)) == 2:
        input_obj_func = "binary:logistic"
    else:
        input_obj_func = "multi:softmax"
        
    # param_grid = {"max_depth": [10, 20, 50, 100], 
    #               "n_estimators" : [10, 20, 50, 100],
    #               "min_child_weight" : range(1,6,2),  
    #               "gamma" : [i/10.0 for i in range(0,5)],
    #               "objective": [input_obj_func],
    #               "tree_method": ["gpu_hist"],
    #               "gpu_id": [-1]}
                
    # grid = GridSearchCV(XGBClassifier(), param_grid, refit = True, verbose = False) 
    # grid.fit(X, y) 
    
    # best_params = grid.best_params_
    best_params = { 'gamma': 0.4,
                   'gpu_id': -1,
                   'max_depth': 10,
                   'min_child_weight': 1,
                   'n_estimators': 10,
                   'objective': 'binary:logistic',
                   'tree_method': 'gpu_hist'}
    clf = XGBClassifier(params = best_params, random_state = 42)
    cv_scores = cross_val_score(clf, X, y, cv = 10)

    all_cv_scores[selected_cluster_label] = cv_scores
    all_best_params[selected_cluster_label] = best_params
    all_models[selected_cluster_label] = clf

X = featuredf[selected_features].to_numpy()
y = featuredf[selected_cluster_label].to_numpy()
y = [item-1 for item in y]
final_model = XGBClassifier(params = best_params, random_state = 42)
final_model.fit(X, y)


100%|█████████████████████████████████████████████| 7/7 [00:12<00:00,  1.85s/it]


In [2]:
for key in all_cv_scores.keys():
    print("Cluster: {}, mean ACC: {}".format(key, np.mean(all_cv_scores[key])))

Cluster: RNA.consensus.cluster, mean ACC: 0.4845454545454545
Cluster: kmean.cluster, mean ACC: 0.5945454545454545
Cluster: kmean.2clusters.DrNam, mean ACC: 0.6345454545454545
Cluster: kmean.3clusters.DrNam, mean ACC: 0.43636363636363634
Cluster: merged.cluster12, mean ACC: 0.6836363636363636
Cluster: merged.cluster13, mean ACC: 0.6936363636363636
Cluster: merged.cluster23, mean ACC: 0.6718181818181816


In [3]:
featuredf_old = pd.read_csv(os.path.join(path_to_05_output, "featuredf.{}.csv".format("raw")), index_col = [0]).set_index("SampleID")

In [7]:
testdf = pd.read_csv(os.path.join(path_to_06_output, "test_featuredf.final.csv"), index_col = [0]).set_index("No.")

In [8]:
final_model.predict(testdf.to_numpy())

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1])