In [1]:
##### general libraries import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

import warnings 
warnings.filterwarnings("ignore")

from tqdm import tqdm

import umap

##### scikit learn import
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import xgboost as xgb


from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC 

data_version = "240319"
output_version = "focus_v17_20240425"

outdir = "/media/hieunguyen/HNSD_mini/data/outdir"
PROJECT = "UMP_oral_cancer"

path_to_main_input = "/media/hieunguyen/HNSD_mini/data/UMP_Oral_cancer/input"
path_to_main_output = os.path.join(outdir, PROJECT, output_version)

cluster_score = pd.read_csv("/media/hieunguyen/HNSD_mini/data/UMP_Oral_cancer/input/240319/cluster_score.csv", sep = ";")
code_version = "v17"
path_to_01_output = os.path.join(path_to_main_output, "01_output", data_version, code_version)
path_to_04_output = os.path.join(path_to_main_output, "04_output", data_version, code_version)
path_to_05_output = os.path.join(path_to_main_output, "05_output", data_version, code_version)
path_to_06_output = os.path.join(path_to_main_output, "06_output", data_version, code_version)
path_to_08_output = os.path.join(path_to_main_output, "08_output", data_version, code_version)
os.system("mkdir -p {}".format(path_to_08_output))

all_cluster_labels = [ 'RNA.consensus.cluster', 
                      'kmean.cluster',
                      'kmean.2clusters.DrNam', 
                      'kmean.3clusters.DrNam', 
                      'merged.cluster12',
                      'merged.cluster13', 
                      'merged.cluster23']
all_cv_scores = dict()
all_best_params = dict()


sample_orders = ['230215_143', '230215_32', '230215_33', '230215_34', '230720_11',
       '230720_15', '230720_2', '230720_3', '230720_46', '230720_6',
       '230720_7', '230825_145', '230825_146', '230825_147', '230825_148',
       '230825_150', '230831_153', '230831_154', '230831_156',
       '230831_157', '230831_235', '230831_238', '230831_25', '230914_48',
       '230914_51', '230914_52', '230914_69', '230914_72', '230914_9',
       '230921_35', '230921_41', '230921_54', '230921_68', '230921_70',
       '230922_158', '230922_159', '230922_163', '230922_178',
       '230922_179', '230922_60', '231003_152', '231003_160',
       '231003_162', '231003_167', '231003_56', '231003_71', '231003_74',
       '231130_186', '231130_187', '231130_189', '231130_190',
       '231130_193', '231130_198', '231130_200', '231213_166',
       '231213_169', '231213_170', '231213_172', '231213_174',
       '231213_176', '231213_207', '231219_161', '231219_180',
       '231219_183', '231219_203', '231219_205', '231219_206',
       '231219_212', '231219_214', '231220_177', '231220_181',
       '231220_192', '231220_195', '231220_204', '231220_213',
       '231220_223', '231228_209', '231228_210', '231228_211',
       '231228_216', '231228_218', '231228_224', '240110_225',
       '240110_228', '240110_230', '240110_234', '240110_256',
       '240110_258', '240110_265', '240202_217', '240202_219',
       '240202_229', '240202_240', '240202_245', '240202_254',
       '240202_259', '240202_260', '240202_262', '240202_264',
       '240202_271', '240202_368']


umapdf = pd.read_csv(os.path.join(path_to_01_output, "umap_RNAseq.csv"), index_col = [0])

featuredf = pd.read_csv(os.path.join(path_to_06_output, "featuredf.final.csv"), index_col = [0]).set_index("SampleID")

featuredf = featuredf.loc[sample_orders, ].reset_index()
featuredf = featuredf.merge(umapdf, right_on = "SampleID", left_on = "SampleID").drop(["V1", "V2"], axis = 1)
selected_cluster_label = 'merged.cluster13'
selected_features = [item for item in featuredf.columns if "feature" in item]
    
X = featuredf[selected_features].to_numpy()
y = featuredf[selected_cluster_label].to_numpy()
y = [item-1 for item in y] 

all_best_params = dict()
#####----------------------------------------------------------------#####
##### XGBoost model
#####----------------------------------------------------------------#####
model_name = "XGBoost"
param_grid = {    "max_depth": [10, 20, 50, 100], 
                  "n_estimators" : [10, 20, 50, 100],
                  "min_child_weight" : range(1,6,2),  
                  "gamma" : [i/10.0 for i in range(0,5)],
                  "objective": ["binary:logistic"],
                  "tree_method": ["gpu_hist"],
                  "gpu_id": [-1]
             }
                
grid = GridSearchCV(XGBClassifier(), param_grid, refit = True, verbose = True) 
grid.fit(X, y) 
best_params = grid.best_params_
all_best_params[model_name] = best_params

#####----------------------------------------------------------------#####
##### Logistic regression
#####----------------------------------------------------------------#####
model_name = "LR"
param_grid = {
    "random_state" : [411],
    "solver" : ["newton-cg", "lbfgs", "liblinear"],
    "penalty": ["l2"]    
}
grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = True) 
grid.fit(X, y) 
best_params = grid.best_params_
all_best_params[model_name] = best_params

#####----------------------------------------------------------------#####
##### SVM
#####----------------------------------------------------------------#####
model_name = "SVM"

param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf', "linear", "poly", "sigmoid"]} 

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = True) 

grid.fit(X, y) 
best_params = grid.best_params_
all_best_params[model_name] = best_params

#####----------------------------------------------------------------#####
##### FINAL FIT
#####----------------------------------------------------------------#####
cv_scores = dict()
models = dict()

clf = GaussianNB()
cv_scores["GaussianNB"] = cross_val_score(clf, X, y, cv = 10)
models["GaussianNB"] = clf.fit(X, y)

clf = MultinomialNB()
cv_scores["MultinomialNB"] = cross_val_score(clf, X, y, cv = 10)
models["MultinomialNB"] = clf.fit(X, y)

clf = GaussianNB()
cv_scores["ComplementNB"] = cross_val_score(clf, X, y, cv = 10)
models["ComplementNB"] = clf.fit(X, y)

clf = BernoulliNB()
cv_scores["BernoulliNB"] = cross_val_score(clf, X, y, cv = 10)
models["BernoulliNB"] = clf.fit(X, y)

clf = XGBClassifier(params = all_best_params["XGBoost"], random_state = 42)
cv_scores["XGBoost"] = cross_val_score(clf, X, y, cv = 10)
models["XGBoost"] = clf.fit(X, y)

clf = SVC(**all_best_params["SVM"])
cv_scores["SVM"] = cross_val_score(clf, X, y, cv = 10)
models["SVM"] = clf.fit(X, y)

clf = LogisticRegression(**all_best_params["LR"])
cv_scores["LR"] = cross_val_score(clf, X, y, cv = 10)
models["LR"] = clf.fit(X, y)

all_cv_scoredf = pd.DataFrame.from_dict(cv_scores, orient="index").T
all_cv_scoredf.to_excel(os.path.join(path_to_08_output, "all_CV_scores_final.xlsx"))


Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [2]:
all_cv_scoredf.mean()

GaussianNB       0.594545
MultinomialNB    0.694545
ComplementNB     0.594545
BernoulliNB      0.693636
XGBoost          0.673636
SVM              0.723636
LR               0.683636
dtype: float64

In [3]:
##### save models
os.system("mkdir -p {}".format(os.path.join(path_to_08_output, "models")))
for model_name in models.keys():
    filename = os.path.join(path_to_08_output, "models", '{}.sav'.format(model_name))
    pickle.dump(models[model_name], open(filename, 'wb'))

In [4]:
##### general libraries import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

import warnings 
warnings.filterwarnings("ignore")

from tqdm import tqdm

import umap

##### scikit learn import
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import xgboost as xgb


from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC 

data_version = "240319"
output_version = "focus_v17_20240425"

outdir = "/media/hieunguyen/HNSD_mini/data/outdir"
PROJECT = "UMP_oral_cancer"

path_to_main_input = "/media/hieunguyen/HNSD_mini/data/UMP_Oral_cancer/input"
path_to_main_output = os.path.join(outdir, PROJECT, output_version)

cluster_score = pd.read_csv("/media/hieunguyen/HNSD_mini/data/UMP_Oral_cancer/input/240319/cluster_score.csv", sep = ";")
code_version = "v17"
path_to_01_output = os.path.join(path_to_main_output, "01_output", data_version, code_version)
path_to_04_output = os.path.join(path_to_main_output, "04_output", data_version, code_version)
path_to_05_output = os.path.join(path_to_main_output, "05_output", data_version, code_version)
path_to_06_output = os.path.join(path_to_main_output, "06_output", data_version, code_version)
path_to_08_output = os.path.join(path_to_main_output, "08_output", data_version, code_version)
os.system("mkdir -p {}".format(path_to_08_output))

all_cluster_labels = [ 'RNA.consensus.cluster', 
                      'kmean.cluster',
                      'kmean.2clusters.DrNam', 
                      'kmean.3clusters.DrNam', 
                      'merged.cluster12',
                      'merged.cluster13', 
                      'merged.cluster23']
all_cv_scores = dict()
all_best_params = dict()

sample_orders = ['230215_143', '230215_32', '230215_33', '230215_34', '230720_11',
       '230720_15', '230720_2', '230720_3', '230720_46', '230720_6',
       '230720_7', '230825_145', '230825_146', '230825_147', '230825_148',
       '230825_150', '230831_153', '230831_154', '230831_156',
       '230831_157', '230831_235', '230831_238', '230831_25', '230914_48',
       '230914_51', '230914_52', '230914_69', '230914_72', '230914_9',
       '230921_35', '230921_41', '230921_54', '230921_68', '230921_70',
       '230922_158', '230922_159', '230922_163', '230922_178',
       '230922_179', '230922_60', '231003_152', '231003_160',
       '231003_162', '231003_167', '231003_56', '231003_71', '231003_74',
       '231130_186', '231130_187', '231130_189', '231130_190',
       '231130_193', '231130_198', '231130_200', '231213_166',
       '231213_169', '231213_170', '231213_172', '231213_174',
       '231213_176', '231213_207', '231219_161', '231219_180',
       '231219_183', '231219_203', '231219_205', '231219_206',
       '231219_212', '231219_214', '231220_177', '231220_181',
       '231220_192', '231220_195', '231220_204', '231220_213',
       '231220_223', '231228_209', '231228_210', '231228_211',
       '231228_216', '231228_218', '231228_224', '240110_225',
       '240110_228', '240110_230', '240110_234', '240110_256',
       '240110_258', '240110_265', '240202_217', '240202_219',
       '240202_229', '240202_240', '240202_245', '240202_254',
       '240202_259', '240202_260', '240202_262', '240202_264',
       '240202_271', '240202_368']


umapdf = pd.read_csv(os.path.join(path_to_01_output, "umap_RNAseq.csv"), index_col = [0])

featuredf = pd.read_csv(os.path.join(path_to_06_output, "featuredf.final_with_Gender.csv"), index_col = [0]).set_index("SampleID")

featuredf = featuredf.loc[sample_orders, ].reset_index()
featuredf = featuredf.merge(umapdf, right_on = "SampleID", left_on = "SampleID").drop(["V1", "V2"], axis = 1)
selected_cluster_label = 'merged.cluster13'
selected_features = [item for item in featuredf.columns if "feature" in item]
    
X = featuredf[selected_features].to_numpy()
y = featuredf[selected_cluster_label].to_numpy()
y = [item-1 for item in y] 

all_best_params = dict()
#####----------------------------------------------------------------#####
##### XGBoost model
#####----------------------------------------------------------------#####
model_name = "XGBoost"
param_grid = {    "max_depth": [10, 20, 50, 100], 
                  "n_estimators" : [10, 20, 50, 100],
                  "min_child_weight" : range(1,6,2),  
                  "gamma" : [i/10.0 for i in range(0,5)],
                  "objective": ["binary:logistic"],
                  "tree_method": ["gpu_hist"],
                  "gpu_id": [-1]
             }
                
grid = GridSearchCV(XGBClassifier(), param_grid, refit = True, verbose = True) 
grid.fit(X, y) 
best_params = grid.best_params_
all_best_params[model_name] = best_params

#####----------------------------------------------------------------#####
##### Logistic regression
#####----------------------------------------------------------------#####
model_name = "LR"
param_grid = {
    "random_state" : [411],
    "solver" : ["newton-cg", "lbfgs", "liblinear"],
    "penalty": ["l2"]    
}
grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = True) 
grid.fit(X, y) 
best_params = grid.best_params_
all_best_params[model_name] = best_params

#####----------------------------------------------------------------#####
##### SVM
#####----------------------------------------------------------------#####
model_name = "SVM"

param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf', "linear", "poly", "sigmoid"]} 

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = True) 

grid.fit(X, y) 
best_params = grid.best_params_
all_best_params[model_name] = best_params

#####----------------------------------------------------------------#####
##### FINAL FIT
#####----------------------------------------------------------------#####
cv_scores = dict()
models_with_gender = dict()

clf = GaussianNB()
cv_scores["GaussianNB"] = cross_val_score(clf, X, y, cv = 10)
models_with_gender["GaussianNB"] = clf.fit(X, y)

clf = MultinomialNB()
cv_scores["MultinomialNB"] = cross_val_score(clf, X, y, cv = 10)
models_with_gender["MultinomialNB"] = clf.fit(X, y)

clf = GaussianNB()
cv_scores["ComplementNB"] = cross_val_score(clf, X, y, cv = 10)
models_with_gender["ComplementNB"] = clf.fit(X, y)

clf = BernoulliNB()
cv_scores["BernoulliNB"] = cross_val_score(clf, X, y, cv = 10)
models_with_gender["BernoulliNB"] = clf.fit(X, y)

clf = XGBClassifier(params = all_best_params["XGBoost"], random_state = 42)
cv_scores["XGBoost"] = cross_val_score(clf, X, y, cv = 10)
models_with_gender["XGBoost"] = clf.fit(X, y)

clf = SVC(**all_best_params["SVM"])
cv_scores["SVM"] = cross_val_score(clf, X, y, cv = 10)
models_with_gender["SVM"] = clf.fit(X, y)

clf = LogisticRegression(**all_best_params["LR"])
cv_scores["LR"] = cross_val_score(clf, X, y, cv = 10)
models_with_gender["LR"] = clf.fit(X, y)

all_cv_scoredf = pd.DataFrame.from_dict(cv_scores, orient="index").T
all_cv_scoredf.to_excel(os.path.join(path_to_08_output, "all_CV_scores_final_with_Gender.xlsx"))



Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [5]:
all_cv_scoredf.mean()

GaussianNB       0.594545
MultinomialNB    0.694545
ComplementNB     0.594545
BernoulliNB      0.693636
XGBoost          0.693636
SVM              0.743636
LR               0.683636
dtype: float64

In [6]:
##### save models
os.system("mkdir -p {}".format(os.path.join(path_to_08_output, "models_with_gender")))
for model_name in models_with_gender.keys():
    filename = os.path.join(path_to_08_output, "models_with_gender", '{}.sav'.format(model_name))
    pickle.dump(models_with_gender[model_name], open(filename, 'wb'))

In [7]:
Xtest = pd.read_csv(os.path.join(path_to_06_output, "test_featuredf.final.csv"), index_col = [0]).dropna()
Xtest_gender = pd.read_csv(os.path.join(path_to_06_output, "test_featuredf.final_with_Gender.csv"), index_col = [0]).dropna()

for input_model in models.keys():
    testres = pd.DataFrame(data = models[input_model].predict(Xtest.set_index("No.").to_numpy()), columns = ["prediction"])
    testres["SampleID"] = Xtest["No."].values
    
    testres_with_gender = pd.DataFrame(data = models_with_gender[input_model].predict(Xtest_gender.set_index("No.").to_numpy()), columns = ["prediction"])
    testres_with_gender["SampleID"] = Xtest_gender["No."].values
    
    testres.to_excel(os.path.join(path_to_08_output, "test_results_noRNAseq.model_{}.xlsx".format(input_model)))
    testres_with_gender.to_excel(os.path.join(path_to_08_output, "test_results_noRNAseq.model_{}_with_gender.xlsx".format(input_model)))