In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron, SGDClassifier
from sklearn.model_selection import GridSearchCV

import warnings
# Blanket filter: with no category/module argument this suppresses every warning
# raised for the rest of the session.
# NOTE(review): this presumably also hides library warnings about non-converged
# or failed fits during the grid searches below — confirm that is intended.
warnings.filterwarnings("ignore")

def impute(df, label_column='GROUP', n_neighbors=5):
    """Fill missing measurements with a KNN imputer fitted separately per label group.

    The first two columns of ``df`` are passed through unchanged; every
    remaining column is treated as numeric and imputed using only the rows
    that share the same value in ``label_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Columns 0-1 are kept as-is, columns 2+ are imputed.
    label_column : str, default 'GROUP'
        Column whose distinct values define the imputation groups.
    n_neighbors : int, default 5
        Number of neighbours handed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Rows ordered group by group (groups in order of first appearance),
        with a fresh 0..n-1 index. An empty input is returned unchanged
        apart from the index reset.

    Raises
    ------
    ValueError
        If ``label_column`` contains missing values, because such rows cannot
        be assigned to any group.

    Notes
    -----
    Imputation relies on the label of every row. Calling this on a data set
    before it is split into train and test partitions therefore lets label
    information (and neighbouring rows of the other partition) influence the
    imputed values.
    """
    labels = df[label_column]
    if labels.isna().any():
        raise ValueError(
            f"Column {label_column!r} contains missing labels; cannot impute per group."
        )

    imputed_dfs = []
    for group_value in labels.unique().tolist():
        # Filter once and split into pass-through columns and columns to impute.
        group_rows = df[labels == group_value].reset_index(drop=True)
        leading_columns = group_rows.iloc[:, :2]
        numeric_data = group_rows.iloc[:, 2:]

        # KNNImputer discards features that have no observed value at fit time,
        # which would make its output narrower than `numeric_data`. Impute only
        # the columns with at least one observation and leave the rest as NaN.
        observed = numeric_data.notna().any(axis=0).to_numpy()
        filled = np.full(numeric_data.shape, np.nan, dtype=float)
        if observed.any():
            knn_imputer = KNNImputer(n_neighbors=n_neighbors)
            filled[:, observed] = knn_imputer.fit_transform(numeric_data.loc[:, observed])

        imputed_numeric = pd.DataFrame(filled, columns=numeric_data.columns)
        imputed_dfs.append(pd.concat([leading_columns, imputed_numeric], axis=1))

    if not imputed_dfs:
        # Nothing to impute (no rows): pd.concat would raise on an empty list.
        return df.reset_index(drop=True)

    return pd.concat(imputed_dfs).reset_index(drop=True)

# Read the tab-separated metadata and measurement tables of both cohorts.
df_Turkish_meta = pd.read_csv('../src/Turkish meta.txt', sep='\t')
df_Swedish_meta = pd.read_csv('../src/Swedish meta.txt', sep='\t')
df_Turkish_data = pd.read_csv('../src/Turkish data.txt', sep='\t')
df_Swedish_data = pd.read_csv('../src/Swedish data.txt', sep='\t')

# Attach the measurements to their metadata via the shared sample id.
df_Turkish = pd.merge(df_Turkish_meta, df_Turkish_data, on='DAid')
df_Swedish = pd.merge(df_Swedish_meta, df_Swedish_data, on='DAid')

# Stack both cohorts row-wise under a fresh 0..n-1 index.
df_combined = pd.concat([df_Turkish, df_Swedish], ignore_index=True)

# NOTE(review): imputation is grouped by 'GROUP' and runs on the full data set
# before the train/test split below, so test rows are imputed with knowledge of
# their label and of training rows — confirm this is acceptable.
df_combined_imputed = impute(df_combined)

# Classes kept for modelling; a class id is the position in this list.
Disease_list = [
    'LIVD-Chronic Liver Disease (CLD)', 'LIVD-ARLD',
    'LIVD-Hepatocellular Carcinoma (HCC)', 'LIVD-VIRAL', 'LIVD-MASLD',
    'PANC-pancreas cancer', 'PSYC-bipolar', 'PSYC-schizophrenia', 'THEL- ', 'COLC- ',
    'AML', 'BRC', 'CLL', 'CVX', 'ENDC', 'GLIOM', 'LUNGC', 'LYMPH', 'MENI', 'MYEL', 'OVC', 'PIT-NET',
    'PRC', 'SI-NET',
]
disease_mapping = dict(zip(Disease_list, range(len(Disease_list))))

# Keep only the rows whose group is one of the modelled classes.
in_cohort = df_combined_imputed['GROUP'].isin(Disease_list)
df_train_test_cohort = df_combined_imputed.loc[in_cohort].reset_index(drop=True)

# Features are every column after the first two; targets are integer class ids.
x = df_train_test_cohort.iloc[:, 2:].values
y = np.fromiter(
    (disease_mapping[group] for group in df_train_test_cohort['GROUP']),
    dtype=int,
)

# Stratified 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=3, stratify=y
)

In [2]:
# Class ids (positions in Disease_list) that make up each first-layer category.
liver_id = list(range(5))  # the five 'LIVD-*' classes
psych_id = [6, 7]  # 'PSYC-bipolar', 'PSYC-schizophrenia'
blood_id = [
    8,   # 'THEL- '
    10,  # 'AML'
    12,  # 'CLL'
    17,  # 'LYMPH'
    19,  # 'MYEL'
]
cancer_id = [
    5,   # 'PANC-pancreas cancer'
    9,   # 'COLC- '
    11,  # 'BRC'
    13,  # 'CVX'
    14,  # 'ENDC'
    15,  # 'GLIOM'
    16,  # 'LUNGC'
    18,  # 'MENI'
    20,  # 'OVC'
    21,  # 'PIT-NET'
    22,  # 'PRC'
    23,  # 'SI-NET'
]

In [3]:
def _create_first_layer_labels(y):
    """Translate fine-grained class ids into first-layer category labels.

    Categories are numbered 0 = blood, 1 = liver, 2 = psych, 3 = cancer,
    using the module-level ``*_id`` lists. Any id that appears in none of
    those lists keeps the placeholder label -1.

    Returns an integer array shaped like ``y``.
    """
    category_members = (blood_id, liver_id, psych_id, cancer_id)

    first_layer = np.full_like(y, -1, dtype=int)
    # Assign in category order so a later category wins if id lists overlap.
    for category, member_ids in enumerate(category_members):
        first_layer[np.isin(y, member_ids)] = category
    return first_layer

# First layer: predict the broad category (blood / liver / psych / cancer).
y_first_layer = _create_first_layer_labels(y_train)

# Balance the categories on the training partition only.
# NOTE(review): oversampling happens before GridSearchCV splits the data, so
# synthetic points can share origins with validation-fold samples and the CV
# scores may be optimistic.
sm = BorderlineSMOTE(k_neighbors=6, m_neighbors=8, n_jobs=-1, random_state=3)
x_res, y_res = sm.fit_resample(x_train, y_first_layer)

multi_classifier = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', Perceptron(random_state=3, n_jobs=-1)),
])

param_grid = {
    'clf__penalty': [None, 'l1', 'l2'],
    'clf__alpha': [1e-7, 1e-8, 1e-9],
}

tuned_multi_classifier = GridSearchCV(
    multi_classifier,
    param_grid,
    scoring='f1_weighted',
    cv=5,
)
tuned_multi_classifier.fit(x_res, y_res)

print(tuned_multi_classifier.best_estimator_)

# Score on the untouched test partition, mapped to the same category labels.
y_test_first_layer = _create_first_layer_labels(y_test)
first_layer_pred = tuned_multi_classifier.predict(x_test)
print("F1:", f1_score(y_test_first_layer, first_layer_pred, average='weighted'))

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf',
                 Perceptron(alpha=1e-07, n_jobs=-1, penalty='l1',
                            random_state=3))])
F1: 0.9841709976143244


In [4]:
# Second layer, blood branch: tell the blood classes apart from one another.
mask = np.isin(y_train, blood_id)

# NOTE(review): oversampling precedes the CV split inside GridSearchCV, so the
# cross-validated scores may be optimistic.
sm = BorderlineSMOTE(k_neighbors=4, m_neighbors=13, n_jobs=-1, random_state=3)
x_res, y_res = sm.fit_resample(x_train[mask], y_train[mask])

blood_classifier = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', ExtraTreesClassifier(random_state=3, n_jobs=-1)),
])

param_grid = {
    'clf__n_estimators': [75, 100, 125],
    'clf__max_depth': [None, 5, 7],
}

tuned_blood_classifier = GridSearchCV(
    blood_classifier,
    param_grid,
    scoring='f1_weighted',
    cv=5,
)
tuned_blood_classifier.fit(x_res, y_res)

print(tuned_blood_classifier.best_estimator_)

# Evaluate only on test samples that truly belong to the blood classes.
blood_test_mask = np.isin(y_test, blood_id)
blood_test_pred = tuned_blood_classifier.predict(x_test[blood_test_mask])
print("F1:", f1_score(y_test[blood_test_mask], blood_test_pred, average='weighted'))

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', ExtraTreesClassifier(n_jobs=-1, random_state=3))])
F1: 1.0


In [5]:
# Second layer, liver branch: tell the liver classes apart from one another.
mask = np.isin(y_train, liver_id)

# NOTE(review): oversampling precedes the CV split inside GridSearchCV, so the
# cross-validated scores may be optimistic.
sm = BorderlineSMOTE(k_neighbors=9, m_neighbors=7, n_jobs=-1, random_state=3)
x_res, y_res = sm.fit_resample(x_train[mask], y_train[mask])

liver_classifier = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', RidgeClassifier(random_state=3)),
])

param_grid = {
    'clf__alpha': [0.1, 1, 10],
    'clf__solver': ['svd', 'cholesky', 'lsqr'],
}

tuned_liver_classifier = GridSearchCV(
    liver_classifier,
    param_grid,
    scoring='f1_weighted',
    cv=5,
)
tuned_liver_classifier.fit(x_res, y_res)

print(tuned_liver_classifier.best_estimator_)

# Evaluate only on test samples that truly belong to the liver classes.
liver_test_mask = np.isin(y_test, liver_id)
liver_test_pred = tuned_liver_classifier.predict(x_test[liver_test_mask])
print("F1:", f1_score(y_test[liver_test_mask], liver_test_pred, average='weighted'))

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf',
                 RidgeClassifier(alpha=0.1, random_state=3, solver='svd'))])
F1: 0.7522884233821734


In [6]:
# Second layer, psych branch: tell the two psychiatric classes apart.
mask = np.isin(y_train, psych_id)

# NOTE(review): oversampling precedes the CV split inside GridSearchCV, so the
# cross-validated scores may be optimistic.
sm = BorderlineSMOTE(k_neighbors=4, m_neighbors=3, n_jobs=-1, random_state=3)
x_res, y_res = sm.fit_resample(x_train[mask], y_train[mask])

psych_classifier = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', SGDClassifier(random_state=3, n_jobs=-1)),
])

param_grid = {
    'clf__loss': ['hinge', 'squared_error', 'perceptron'],
    'clf__penalty': ['l2', 'l1', None],
    'clf__alpha': [5e-5, 1e-4, 2e-4],
}

tuned_psych_classifier = GridSearchCV(
    psych_classifier,
    param_grid,
    scoring='f1_weighted',
    cv=5,
)
tuned_psych_classifier.fit(x_res, y_res)

print(tuned_psych_classifier.best_estimator_)

# Evaluate only on test samples that truly belong to the psych classes.
psych_test_mask = np.isin(y_test, psych_id)
psych_test_pred = tuned_psych_classifier.predict(x_test[psych_test_mask])
print("F1:", f1_score(y_test[psych_test_mask], psych_test_pred, average='weighted'))

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', SGDClassifier(alpha=5e-05, n_jobs=-1, random_state=3))])
F1: 0.8666666666666667


In [7]:
# Second layer, cancer branch: tell the solid-tumour classes apart from one another.
cancer_classifier = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=3, n_jobs=-1)),
])

# Search only solver/penalty pairs that LogisticRegression accepts. A single
# crossed grid would also try 'l1' with 'lbfgs' and None with 'liblinear';
# those fits raise, and the failure is invisible because warnings are silenced.
# The unpenalised candidate carries no C values because C has no effect there.
c_values = [0.001, 0.005, 0.01]
param_grid = [
    {'clf__solver': ['lbfgs'], 'clf__penalty': ['l2'], 'clf__C': c_values},
    {'clf__solver': ['lbfgs'], 'clf__penalty': [None]},
    {'clf__solver': ['liblinear'], 'clf__penalty': ['l1', 'l2'], 'clf__C': c_values},
]

mask = np.isin(y_train, cancer_id)
# NOTE(review): oversampling precedes the CV split inside GridSearchCV, so the
# cross-validated scores may be optimistic.
sm = BorderlineSMOTE(k_neighbors=1, m_neighbors=10, n_jobs=-1, random_state=3)
x_res, y_res = sm.fit_resample(x_train[mask], y_train[mask])

tuned_cancer_classifier = GridSearchCV(cancer_classifier, param_grid, scoring='f1_weighted', cv=5).fit(x_res, y_res)

print(tuned_cancer_classifier.best_estimator_)

# Evaluate only on test samples that truly belong to the cancer classes.
cancer_test_mask = np.isin(y_test, cancer_id)
cancer_test_pred = tuned_cancer_classifier.predict(x_test[cancer_test_mask])
print("F1:", f1_score(y_test[cancer_test_mask], cancer_test_pred, average='weighted'))

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf',
                 LogisticRegression(C=0.01, n_jobs=-1, random_state=3,
                                    solver='liblinear'))])
F1: 0.8051238454657911
