In [89]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [90]:
data1 = pd.read_excel('U-GENE_data_RU_PL_fixed (1).xlsx')
data1 = data1[0:-1].drop(columns={'Unnamed: 0'})

In [91]:
data1 = data1.drop(columns=['ID', 'Impared fasting glucose (prediabetes)', 'Bilirubin bound, mkmol/L', 
                           'Uric acid, mmol/L', 'Glucose, mmol/L', 'α - lipoproteins, mg/dL (HDL)', 
                           ' β - lipoproteins, mmol/L (LDL)', 'Total cholesterol, mmol/L', 
                            'Triglycerides, mmol/L', 'Max life weight, kg.1', 'Birth weight, kg.1', 
                            'Weight one year ago, kg', 'Desired Weight, kg'])

data1 = data1.replace('<25', 25)
data1 = data1.replace('bd', 31)
data1 = data1.replace('ХИЛЕЗ', 4.12)
data1 = data1.replace('>350', 352)
data1 = data1.replace('`1', 1)
data1 = data1[~data1['Ethnicity'].isin([1, 2, 3, 4])].reset_index(drop=True)
data1['Ethnicity'] = data1['Ethnicity'].replace(5,1)

In [92]:
data1['BMI'] = data1['BMI'].astype(float)

In [93]:
data1['BMI'] = data1['BMI'].apply(lambda x: 1 if x > 29 else 0)

In [94]:
data1['Diabetes'] = data1['Diabetes'].replace(2,1)

In [95]:
data1['Increased fasting glucose'] = data1['Increased fasting glucose'].replace(2,1)

In [96]:
def change_target_dom(x):
    if x == 3:
        return 1
    elif x == 2:
        return 1
    else:
        return 0 

In [97]:
targets = ['Condition', 'Vascular diseases ','Diabetes',
       'Heart attack, chest pain, or other heart condition', 'Thyroid disease',
       'Dyslipidemia', 'Increased triglycerides', 'High blood pressure',
       'Increased fasting glucose', 'Metabolic Syndrome', 'BMI']

genetics = ['UCP1 rs45539933 (+1068GA)', 'UCP1 rs1800592  (-3826AG)',
       'UCP1 rs3811791 (-1766AG)', 'UCP1 rs10011540 (-112AC)',
       'UCP1 rs2270565 (Met229Leu)', 'UCP2 rs659366 (-866GA)', 'UCP2 rs660339 (Ala55Val)',
       'UCP3 rs1800849', 'FTO rs9939609']

physical = ['Sex', 'Age', 'Ethnicity']



In [98]:
#for el in genetics:
#    data1[el] = data1[el].apply(lambda x: change_target_dom(x))

In [99]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score, average_precision_score

from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot  as plt
import numpy as np


from sklearn.metrics import classification_report

In [100]:
def get_prediction_log_reg(X, Y):

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X, Y, test_size=0.25, random_state = 45,  shuffle=True
    )

    grid = {'C':np.logspace(-3,3,7), 
        'penalty':['none', 'l2']}

    model = LogisticRegression()

    logreg_cv = GridSearchCV(model, grid, cv=5, scoring='f1_macro')
    model.fit(Xtrain, Ytrain)

    #print('Best parameters: ', logreg_cv.best_params_)
    #print('Best score: ', logreg_cv.best_score_)

    preds = model.predict(Xtest)
    
    pred_metrics = {'accuracy':round(accuracy_score(Ytest, preds),3), 
                    'recall':round(recall_score(Ytest, preds), 3), 
                    'precision':round(precision_score(Ytest, preds), 3), 
                    'roc_auc_score':round(roc_auc_score(Ytest, preds), 3), 
                   'specificity':round(recall_score(np.logical_not(Ytest) , np.logical_not(preds)),3)}
    return pred_metrics

In [101]:
def get_prediction_dec_tree(X, Y):

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X, Y, test_size=0.25, random_state = 45, shuffle=True
    )

    
    
    dec_tree = DecisionTreeClassifier(random_state=0)
    param_grid = {"max_depth": [40, 60, 80],
                  "min_samples_leaf": [1, 2],
                  "max_leaf_nodes": [180, 200, 220, 240],
                  }

    grid_cv_dtm = GridSearchCV(dec_tree, param_grid, cv=7, scoring='f1_macro')

    dec_tree.fit(Xtrain, Ytrain)
    #print('Best params: ', grid_cv_dtm.best_params_)
    #print('Best score: ', grid_cv_dtm.best_score_)

    preds_dec_grid = dec_tree.predict(Xtest)

    
    pred_metrics = {'accuracy':round(accuracy_score(Ytest, preds_dec_grid),3), 
                    'recall':round(recall_score(Ytest, preds_dec_grid), 3), 
                    'precision':round(precision_score(Ytest, preds_dec_grid), 3), 
                    'roc_auc_score':round(roc_auc_score(Ytest, preds_dec_grid), 3), 
                    'specificity':round(recall_score(np.logical_not(Ytest) , np.logical_not(preds_dec_grid)),3)}
    return pred_metrics

In [102]:
def get_prediction_rf(X, Y):

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X, Y, test_size=0.25, random_state = 45,  shuffle=True
    )

    

    param_grid = {
        'bootstrap': [True],
        'max_depth': [100, 110],
        'max_features': [2, 3],
        'min_samples_leaf': [4, 5],
        'min_samples_split': [8, 10],
        'n_estimators': [10, 20]
    }

    rf = RandomForestClassifier()

    rf.fit(Xtrain, Ytrain)
    #print('Best params: ', grid_cv_rf.best_params_)
    #print('Best score: ', grid_cv_rf.best_score_)

    preds_rf_grid = rf.predict(Xtest)

    
    pred_metrics = {'accuracy':round(accuracy_score(Ytest, preds_rf_grid),3), 
                    'recall':round(recall_score(Ytest, preds_rf_grid), 3), 
                    'precision':round(precision_score(Ytest, preds_rf_grid), 3), 
                    'roc_auc_score':round(roc_auc_score(Ytest, preds_rf_grid), 3), 
                   'specificity':round(recall_score(np.logical_not(Ytest) , np.logical_not(preds_rf_grid)),3)}
    
    return pred_metrics

In [103]:
def get_prediction_svm(X, Y):
    
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X, Y, test_size=0.25, random_state = 45, shuffle=True
    )

    
    grid = {'kernel':['rbf', 'poly']}

    model = svm.SVC()
    # define the ovr strategy

    svm_cv = GridSearchCV(model, grid, cv=2, scoring='f1_macro') 
    model.fit(Xtrain, Ytrain)

    #print('Best parameters: ', svm_cv.best_params_)
    #print('Best score: ', svm_cv.best_score_)

    preds = model.predict(Xtest)
    
    pred_metrics = {'accuracy':round(accuracy_score(Ytest, preds),3), 
                    'recall':round(recall_score(Ytest, preds), 3), 
                    'precision':round(precision_score(Ytest, preds), 3), 
                    'roc_auc_score':round(roc_auc_score(Ytest, preds), 3), 
                   'specificity':round(recall_score(np.logical_not(Ytest) , np.logical_not(preds)),3)}

    
    return pred_metrics

In [104]:
import itertools

def get_combinations(input_list):
    result = []
    for r in range(1, len(input_list) + 1):
        combinations = list(itertools.combinations(input_list, r))
        result.extend(combinations)
    return result


items = [genetics, physical]
combinations = get_combinations(items)

# Преобразуем кортежи в списки
combinations_as_lists = [list(comb) for comb in combinations]

print(len(combinations_as_lists))

3


In [105]:
final_combs = []
for el in combinations_as_lists:
    final_list = []
    for el2 in el:
        for k in el2:
            final_list.append(k)
    final_combs.append(final_list)

In [106]:
def main_prediction(df, features, target):
    
    final_features = []
    for el in features:
        for k in el.split('__'):
            final_features.append(k)
    
        
    
    filter_ = final_features + [target]
    df_main = df[filter_].dropna().reset_index(drop=True)
    
    if df_main.empty:
        print('df empty')
        return None
    
    X = df_main[final_features]
    Y = df_main[target]
    
    log_reg_metrics = get_prediction_log_reg(X, Y)
    dec_tree_metrics = get_prediction_dec_tree(X, Y)
    rf_metrics = get_prediction_rf(X, Y)
    svm_metrics = get_prediction_svm(X, Y)
    
    
    all_metrics = {}
    
    
    all_metrics['logreg'] = log_reg_metrics['roc_auc_score']
    all_metrics['dec_tree'] = dec_tree_metrics['roc_auc_score']
    all_metrics['rf'] = rf_metrics['roc_auc_score']
    all_metrics['svm'] = svm_metrics['roc_auc_score']
    return all_metrics

In [107]:
all_metrics_final = {}
for comb in final_combs:
    
    for tar in targets:
        
        print('Combination:  ' + '; '.join(comb))
        print('Target:  ' + tar)
        all_metrics_final[tar+ '~' +'+'.join(comb)] = main_prediction(data1, comb, tar)
        #except:
        #    print('something went wrong')
        #    all_metrics_final[tar+ '~' +'+'.join(comb)] = None
        #print('\n')

Combination:  UCP1 rs45539933 (+1068GA); UCP1 rs1800592  (-3826AG); UCP1 rs3811791 (-1766AG); UCP1 rs10011540 (-112AC); UCP1 rs2270565 (Met229Leu); UCP2 rs659366 (-866GA); UCP2 rs660339 (Ala55Val); UCP3 rs1800849; FTO rs9939609
Target:  Condition
Combination:  UCP1 rs45539933 (+1068GA); UCP1 rs1800592  (-3826AG); UCP1 rs3811791 (-1766AG); UCP1 rs10011540 (-112AC); UCP1 rs2270565 (Met229Leu); UCP2 rs659366 (-866GA); UCP2 rs660339 (Ala55Val); UCP3 rs1800849; FTO rs9939609
Target:  Vascular diseases 
Combination:  UCP1 rs45539933 (+1068GA); UCP1 rs1800592  (-3826AG); UCP1 rs3811791 (-1766AG); UCP1 rs10011540 (-112AC); UCP1 rs2270565 (Met229Leu); UCP2 rs659366 (-866GA); UCP2 rs660339 (Ala55Val); UCP3 rs1800849; FTO rs9939609
Target:  Diabetes
Combination:  UCP1 rs45539933 (+1068GA); UCP1 rs1800592  (-3826AG); UCP1 rs3811791 (-1766AG); UCP1 rs10011540 (-112AC); UCP1 rs2270565 (Met229Leu); UCP2 rs659366 (-866GA); UCP2 rs660339 (Ala55Val); UCP3 rs1800849; FTO rs9939609
Target:  Heart attack, 

In [108]:
metrics = pd.DataFrame(all_metrics_final).T.dropna().reset_index()

In [109]:
metrics['Target'] = metrics['index'].apply(lambda x:x.split('~')[0])

In [110]:
metrics['Predictors'] = metrics['index'].apply(lambda x:x.split('~')[1])

In [111]:
metrics = metrics.drop(columns=['index'])

In [112]:
metrics['Predictors'].unique()

array(['UCP1 rs45539933 (+1068GA)+UCP1 rs1800592  (-3826AG)+UCP1 rs3811791 (-1766AG)+UCP1 rs10011540 (-112AC)+UCP1 rs2270565 (Met229Leu)+UCP2 rs659366 (-866GA)+UCP2 rs660339 (Ala55Val)+UCP3 rs1800849+FTO rs9939609',
       'Sex+Age+Ethnicity',
       'UCP1 rs45539933 (+1068GA)+UCP1 rs1800592  (-3826AG)+UCP1 rs3811791 (-1766AG)+UCP1 rs10011540 (-112AC)+UCP1 rs2270565 (Met229Leu)+UCP2 rs659366 (-866GA)+UCP2 rs660339 (Ala55Val)+UCP3 rs1800849+FTO rs9939609+Sex+Age+Ethnicity'],
      dtype=object)

In [113]:
metrics = metrics[metrics['Predictors'] != 'UCP1 rs45539933 (+1068GA)+UCP1 rs1800592  (-3826AG)+UCP1 rs3811791 (-1766AG)+UCP1 rs10011540 (-112AC)+UCP1 rs2270565 (Met229Leu)+UCP2 rs659366 (-866GA)+UCP2 rs660339 (Ala55Val)+UCP3 rs1800849+FTO rs9939609']

In [114]:
metrics

Unnamed: 0,logreg,dec_tree,rf,svm,Target,Predictors
11,0.724,0.699,0.706,0.745,Condition,Sex+Age+Ethnicity
12,0.695,0.679,0.729,0.5,Vascular diseases,Sex+Age+Ethnicity
13,0.5,0.56,0.553,0.5,Diabetes,Sex+Age+Ethnicity
14,0.602,0.607,0.636,0.5,"Heart attack, chest pain, or other heart condi...",Sex+Age+Ethnicity
15,0.579,0.527,0.582,0.5,Thyroid disease,Sex+Age+Ethnicity
16,0.57,0.592,0.594,0.5,Dyslipidemia,Sex+Age+Ethnicity
17,0.627,0.582,0.61,0.62,Increased triglycerides,Sex+Age+Ethnicity
18,0.653,0.622,0.607,0.658,High blood pressure,Sex+Age+Ethnicity
19,0.5,0.573,0.611,0.5,Increased fasting glucose,Sex+Age+Ethnicity
20,0.731,0.68,0.718,0.5,Metabolic Syndrome,Sex+Age+Ethnicity


In [116]:
metrics.pivot(index='Target', columns='Predictors', values=['logreg', 
                                                            'dec_tree', 
                                                            'rf', 
                                                            'svm'])

Unnamed: 0_level_0,logreg,logreg,dec_tree,dec_tree,rf,rf,svm,svm
Predictors,Sex+Age+Ethnicity,UCP1 rs45539933 (+1068GA)+UCP1 rs1800592 (-3826AG)+UCP1 rs3811791 (-1766AG)+UCP1 rs10011540 (-112AC)+UCP1 rs2270565 (Met229Leu)+UCP2 rs659366 (-866GA)+UCP2 rs660339 (Ala55Val)+UCP3 rs1800849+FTO rs9939609+Sex+Age+Ethnicity,Sex+Age+Ethnicity,UCP1 rs45539933 (+1068GA)+UCP1 rs1800592 (-3826AG)+UCP1 rs3811791 (-1766AG)+UCP1 rs10011540 (-112AC)+UCP1 rs2270565 (Met229Leu)+UCP2 rs659366 (-866GA)+UCP2 rs660339 (Ala55Val)+UCP3 rs1800849+FTO rs9939609+Sex+Age+Ethnicity,Sex+Age+Ethnicity,UCP1 rs45539933 (+1068GA)+UCP1 rs1800592 (-3826AG)+UCP1 rs3811791 (-1766AG)+UCP1 rs10011540 (-112AC)+UCP1 rs2270565 (Met229Leu)+UCP2 rs659366 (-866GA)+UCP2 rs660339 (Ala55Val)+UCP3 rs1800849+FTO rs9939609+Sex+Age+Ethnicity,Sex+Age+Ethnicity,UCP1 rs45539933 (+1068GA)+UCP1 rs1800592 (-3826AG)+UCP1 rs3811791 (-1766AG)+UCP1 rs10011540 (-112AC)+UCP1 rs2270565 (Met229Leu)+UCP2 rs659366 (-866GA)+UCP2 rs660339 (Ala55Val)+UCP3 rs1800849+FTO rs9939609+Sex+Age+Ethnicity
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
BMI,0.736,0.728,0.673,0.609,0.679,0.686,0.666,0.642
Condition,0.724,0.706,0.699,0.671,0.706,0.686,0.745,0.733
Diabetes,0.5,0.5,0.56,0.607,0.553,0.554,0.5,0.5
Dyslipidemia,0.57,0.593,0.592,0.589,0.594,0.569,0.5,0.5
"Heart attack, chest pain, or other heart condition",0.602,0.609,0.607,0.596,0.636,0.634,0.5,0.5
High blood pressure,0.653,0.658,0.622,0.582,0.607,0.644,0.658,0.683
Increased fasting glucose,0.5,0.534,0.573,0.6,0.611,0.567,0.5,0.5
Increased triglycerides,0.627,0.645,0.582,0.537,0.61,0.611,0.62,0.639
Metabolic Syndrome,0.731,0.766,0.68,0.702,0.718,0.732,0.5,0.5
Thyroid disease,0.579,0.567,0.527,0.568,0.582,0.558,0.5,0.5


In [115]:
metrics.pivot(index='Target', columns='Predictors', values=['logreg', 
                                                            'dec_tree', 
                                                            'rf', 
                                                            'svm']).to_excel('turn_off_add.xlsx')