In [2]:
"""
Evaluation of informative features from RLR models 

Will save the results to various .tsv/.csv files


"""

'\nEvaluation of informative features from RLR models \n\nWill save the results to various .tsv/.csv files\n\n\n'

In [82]:
import argparse
import numpy as np
import os.path
import pandas as pd
import pathlib
from pprint import pprint
import scipy
import sklearn
import sys
import traceback
import warnings
# from shutil import rmtree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [5]:
sklearn.__version__

'0.24.2'

In [7]:
input_file = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/results/RLR/whole/RGSCV/RepeatedGridSearchCV_results_24.03.2022_09-23-48.tsv"
results_RLR_whole = pd.read_csv(input_file, sep="\t", index_col=0)

In [8]:
results_RLR_whole

Unnamed: 0,time,scoring,best_params,best_score
0,III14,mcc,"{'logisticregression__C': 0.1, 'logisticregres...",0.652156
1,III14,precision_recall_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.890387
2,III14,roc_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.897096
3,C-1,mcc,"{'logisticregression__C': 0.1, 'logisticregres...",0.730569
4,C-1,precision_recall_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.894966
5,C-1,roc_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.896338
6,C28,mcc,"{'logisticregression__C': 1.0, 'logisticregres...",0.778867
7,C28,precision_recall_auc,"{'logisticregression__C': 1.0, 'logisticregres...",0.92005
8,C28,roc_auc,"{'logisticregression__C': 1.0, 'logisticregres...",0.92399


In [97]:
def get_kernel_paramter(kernel_parameter):
    """ Returns the combination of kernel parameters from the results of the
        multitask-SVM approach based on the highest mean AUC.

        see results of the Parser_multitask_SVM.py module

        Args: kernel_parameter: results of the multitask-SVM approach as .csv file

        Returns:
            pam (list): Combination of kernel parameter for the combination of kernel functions for the
            multitask-SVM classifier
            based on the highest mean AUC value
            best_AUC (float): highest mean AUC value
        """
    pam_roc_auc = kernel_parameter[kernel_parameter['scoring'].isin(['roc_auc'])]
    pam = pam_roc_auc['best_params']
    #print(pam)
    pam = pam.str.split(" ", expand=True).values
    #print(pam)
    return pam

In [10]:
def select_time_point(kernel_parameter, time_point):
    """
    Selection of the time point to run ESPY measurment
    
    Parameter:
    ---------
    kernel_parameter: dataframe
        performance results per time point
    time_point: str
        preferable time point 
        
    Returns:
    --------
    X: np.darray
        matrix of performance scores per time point
    
    """
    X = kernel_parameter[kernel_parameter['time'].isin([time_point])]
    #print(X)
    return X

In [11]:
time_point_III14 = select_time_point(results_RLR_whole, "III14")

In [98]:
kernel_parameter = get_kernel_paramter(time_point_III14)

In [102]:
kernel_parameter

'0.1,'

In [191]:
def RLR_model(data, kernel_parameter):
    
    X_data = data.iloc[:, 5:].to_numpy()
    #print(X_data)
    y_labels = data.loc[:, 'Protection'].to_numpy()
    #print(y_labels)
    
    c = pd.to_numeric(kernel_parameter[0][1].split(",")[0])
    print("C-value:" + str(c))
    print(type(c))
    l1_value = pd.to_numeric(kernel_parameter[0][3].split("}")[0])
    print("l1_value:" + str(l1_value))
    
    estimator = make_pipeline(
            StandardScaler(
                with_mean=True,
                with_std=True,
            ),
            LogisticRegression(
                penalty='elasticnet',
                C = c,
                solver='saga',
                l1_ratio = l1_value,
                max_iter=10000,
            ),
            # memory=cachedir,
        )
    estimator.fit(X_data, y_labels)
    print(estimator)
    model = estimator[1]
    print("Non Zero weights:",np.count_nonzero(model.coef_))
    cdf = pd.concat([pd.DataFrame(data.iloc[:,5:].columns),pd.DataFrame(np.transpose(model.coef_))], axis = 1)
    cdf.columns = ['Pf_antigen_ID', 'weight']
    cdf = cdf.sort_values(by = ['weight'], ascending = True)
    cdf_nonzeros = cdf[cdf['weight'] != 0]
    return model, cdf_nonzeros
    

## Informative features evaluation at III14

In [74]:
proteome_input_file = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/data/timepoint-wise/whole_data_III14.csv"
proteome_data_whole = pd.read_csv(proteome_input_file)

In [75]:
proteome_data_whole

Unnamed: 0,Patient,group,Protection,Dose,TimePointOrder,mal_mito_3.iso1.exon1.amp1_4066,mal_mito_2.iso1.exon1.amp1_1466,mal_mito_1.iso1.exon1.amp1_3777,PFL2430c_484,PFL2215w_4059,...,MAL8P1.320_1o1_4243,MAL8P1.156_187,MAL8P1.155_2o3_4890,MAL8P1.143_3o15_2092,MAL8P1.143_1o15_2416,MAL8P1.143_14o15_1450,MAL8P1.140_4049,MAL7P1.32_5037,MAL13P1.303e4s1_1829,MAL13P1.303e1s1_4421
0,T2-002 III 14,2,1,1,2,7.033423,6.357552,6.954196,3.70044,0.0,...,0.0,0.0,0.0,11.168672,9.628446,0.0,0.0,15.327447,0.0,0.0
1,T2-005 III 14,5,0,0,2,8.59619,8.481799,8.840778,9.076816,9.04576,...,8.663558,0.0,9.17742,9.353147,7.632995,0.0,8.290019,8.556506,8.873444,8.96145
2,T2-006 III 14,6,1,1,2,9.838416,9.589651,10.115694,10.676398,11.304922,...,10.879966,10.665336,10.838416,10.771489,10.681678,10.901621,10.989394,13.195757,11.017504,10.706064
3,T2-007 III 14,7,0,1,2,6.965784,7.451211,7.149747,10.499846,5.066089,...,0.0,7.396605,0.0,11.946724,8.33985,8.497852,0.0,15.761292,6.209453,0.0
4,T2-010 III 14,10,0,2,2,6.483816,7.97728,7.8009,10.444497,9.814582,...,8.147205,10.837628,10.098032,11.304351,10.863799,10.42574,10.063395,14.122383,10.745674,0.0
5,T2-011 III 14,11,0,0,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,T2-012 III 14,12,0,1,2,7.027906,7.499846,7.974415,9.063395,0.0,...,7.388017,8.445015,9.463524,10.251482,0.0,9.923327,0.0,12.406736,8.828136,8.560333
7,T2-015 III 14,15,1,1,2,0.0,0.0,0.0,0.0,0.0,...,0.0,7.409391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,T2-017 III 14,17,1,3,2,0.0,0.0,0.0,0.0,3.584963,...,7.751544,6.820179,8.197217,6.651052,4.459432,7.422065,7.434628,7.312883,5.870365,7.266787
9,T2-018 III 14,18,1,2,2,0.0,0.0,0.0,0.0,0.0,...,9.310613,0.0,0.0,0.0,0.0,0.0,8.618386,0.0,0.0,0.0


In [76]:
np.all(np.isin(proteome_data_whole.dtypes.to_list()[5:], ['float64']))

True

In [77]:
def rearagne_columns(data):
    df = data.copy()
    dose = df['Dose']
    df = df.drop(columns=['Dose'])
    df.insert(loc=4, column='Dose', value=dose)
    #print(df)
    
    return df
    

In [78]:
proteome_data_whole = rearagne_columns(proteome_data_whole)

In [79]:
proteome_data_whole

Unnamed: 0,Patient,group,Protection,TimePointOrder,Dose,mal_mito_3.iso1.exon1.amp1_4066,mal_mito_2.iso1.exon1.amp1_1466,mal_mito_1.iso1.exon1.amp1_3777,PFL2430c_484,PFL2215w_4059,...,MAL8P1.320_1o1_4243,MAL8P1.156_187,MAL8P1.155_2o3_4890,MAL8P1.143_3o15_2092,MAL8P1.143_1o15_2416,MAL8P1.143_14o15_1450,MAL8P1.140_4049,MAL7P1.32_5037,MAL13P1.303e4s1_1829,MAL13P1.303e1s1_4421
0,T2-002 III 14,2,1,2,1,7.033423,6.357552,6.954196,3.70044,0.0,...,0.0,0.0,0.0,11.168672,9.628446,0.0,0.0,15.327447,0.0,0.0
1,T2-005 III 14,5,0,2,0,8.59619,8.481799,8.840778,9.076816,9.04576,...,8.663558,0.0,9.17742,9.353147,7.632995,0.0,8.290019,8.556506,8.873444,8.96145
2,T2-006 III 14,6,1,2,1,9.838416,9.589651,10.115694,10.676398,11.304922,...,10.879966,10.665336,10.838416,10.771489,10.681678,10.901621,10.989394,13.195757,11.017504,10.706064
3,T2-007 III 14,7,0,2,1,6.965784,7.451211,7.149747,10.499846,5.066089,...,0.0,7.396605,0.0,11.946724,8.33985,8.497852,0.0,15.761292,6.209453,0.0
4,T2-010 III 14,10,0,2,2,6.483816,7.97728,7.8009,10.444497,9.814582,...,8.147205,10.837628,10.098032,11.304351,10.863799,10.42574,10.063395,14.122383,10.745674,0.0
5,T2-011 III 14,11,0,2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,T2-012 III 14,12,0,2,1,7.027906,7.499846,7.974415,9.063395,0.0,...,7.388017,8.445015,9.463524,10.251482,0.0,9.923327,0.0,12.406736,8.828136,8.560333
7,T2-015 III 14,15,1,2,1,0.0,0.0,0.0,0.0,0.0,...,0.0,7.409391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,T2-017 III 14,17,1,2,3,0.0,0.0,0.0,0.0,3.584963,...,7.751544,6.820179,8.197217,6.651052,4.459432,7.422065,7.434628,7.312883,5.870365,7.266787
9,T2-018 III 14,18,1,2,2,0.0,0.0,0.0,0.0,0.0,...,9.310613,0.0,0.0,0.0,0.0,0.0,8.618386,0.0,0.0,0.0


In [192]:
RLR_model_III14, coeffiecients_III14 = RLR_model(proteome_data_whole, kernel_parameter)

C-value:0.1
<class 'numpy.float64'>
l1_value:1.0
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.1, l1_ratio=1.0, max_iter=10000,
                                    penalty='elasticnet', solver='saga'))])
Non Zero weights: 2


In [193]:
coeffiecients_III14

Unnamed: 0,Pf_antigen_ID,weight
1997,PF3D7_1306600.3o3_2037,-0.116641
757,PF3D7_1432100.e6s1_2174,-0.043455


## Informative feature evaluation at C-1

In [173]:
proteome_input_file = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/data/timepoint-wise/whole_data_C-1.csv"
proteome_data_whole_C1 = pd.read_csv(proteome_input_file)

In [174]:
np.all(np.isin(proteome_data_whole_C1.dtypes.to_list()[5:], ['float64']))

True

In [175]:
proteome_data_whole_C1 = rearagne_columns(proteome_data_whole_C1)

In [176]:
time_point_C1 = select_time_point(results_RLR_whole, "C-1")

In [177]:
kernel_parameter_C1 = get_kernel_paramter(time_point_C1)

In [178]:
kernel_parameter_C1

array([["{'logisticregression__C':", '0.1,',
        "'logisticregression__l1_ratio':", '0.7}']], dtype=object)

In [194]:
RLR_model_C1, coeffiecients_C1 = RLR_model(proteome_data_whole_C1, kernel_parameter_C1)

C-value:0.1
<class 'numpy.float64'>
l1_value:0.7
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.1, l1_ratio=0.7, max_iter=10000,
                                    penalty='elasticnet', solver='saga'))])
Non Zero weights: 13


In [195]:
coeffiecients_C1

Unnamed: 0,Pf_antigen_ID,weight
40,PFC10_API0059.iso1.exon1.amp1_short_1180,-0.172482
5570,PF3D7_0618300.iso1.exon1.amp1_1184,-0.136246
6065,PF3D7_0511700.iso1.exon4.amp1_2340,-0.119547
1101,PF3D7_1404600.iso3.exon5.amp1_1181,-0.118711
1144,PF3D7_1401900.iso1.exon4.amp1_3493,-0.096907
2099,PF3D7_1254300.iso1.exon2.amp1_2431,-0.061047
812,PF3D7_1427700.iso1.exon1.amp1_1501,-0.058478
7176,PF3D7_0201800.iso1.exon3.amp1_2338,-0.056893
2428,PF3D7_1229000.iso1.exon1.amp2_2206,-0.03808
740,PF3D7_1432700.iso1.exon2.amp1_1183,-0.012493


# Informative feature evaluation C+28

In [196]:
proteome_input_file = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/data/timepoint-wise/whole_data_C28.csv"
proteome_data_whole_C28 = pd.read_csv(proteome_input_file)

In [197]:
np.all(np.isin(proteome_data_whole_C28.dtypes.to_list()[5:], ['float64']))

True

In [198]:
proteome_data_whole_C28 = rearagne_columns(proteome_data_whole_C28)

In [201]:
time_point_C28 = select_time_point(results_RLR_whole, "C28")

In [202]:
time_point_C28

Unnamed: 0,time,scoring,best_params,best_score
6,C28,mcc,"{'logisticregression__C': 1.0, 'logisticregres...",0.778867
7,C28,precision_recall_auc,"{'logisticregression__C': 1.0, 'logisticregres...",0.92005
8,C28,roc_auc,"{'logisticregression__C': 1.0, 'logisticregres...",0.92399


In [203]:
kernel_parameter_C28 = get_kernel_paramter(time_point_C28)

In [204]:
kernel_parameter_C28

array([["{'logisticregression__C':", '1.0,',
        "'logisticregression__l1_ratio':", '1.0}']], dtype=object)

In [205]:
RLR_model_C28, coeffiecients_C28 = RLR_model(proteome_data_whole_C28, kernel_parameter_C28)

C-value:1.0
<class 'numpy.float64'>
l1_value:1.0
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(l1_ratio=1.0, max_iter=10000,
                                    penalty='elasticnet', solver='saga'))])
Non Zero weights: 44


In [206]:
coeffiecients_C28

Unnamed: 0,Pf_antigen_ID,weight
3215,PF3D7_1121600.iso1.exon2.amp1_2679,-0.556014
6908,PF3D7_0223300.iso1.exon2.amp1_3611,-0.370055
21,PFE1590w_2119,-0.271777
3968,PF3D7_1002100.2o2_2704,-0.259108
2641,PF3D7_1213200.1o1_57,-0.244142
4483,PF3D7_0833300.iso1.exon1.amp1_1870,-0.217699
3541,PF3D7_1035200.1o1_117,-0.208249
4220,PF3D7_0920800.e1s1_251,-0.197429
3486,PF3D7_1041100.iso1.exon2.amp1_3013,-0.195328
67,PFB0300c_4064,-0.183568


# Evaluation of informatives features from selective set

In [234]:
input_file2 = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/results/RLR/selective/RGSCV/RepeatedGridSearchCV_results_24.03.2022_12-47-24.tsv"
results_RLR_selective = pd.read_csv(input_file2, sep="\t", index_col=0)

In [235]:
results_RLR_selective

Unnamed: 0,time,scoring,best_params,best_score
0,III14,mcc,"{'logisticregression__C': 0.1, 'logisticregres...",0.676579
1,III14,precision_recall_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.900958
2,III14,roc_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.909343
3,C-1,mcc,"{'logisticregression__C': 0.1, 'logisticregres...",0.698802
4,C-1,precision_recall_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.905956
5,C-1,roc_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.893687
6,C28,mcc,"{'logisticregression__C': 1.0, 'logisticregres...",0.77943
7,C28,precision_recall_auc,"{'logisticregression__C': 1.0, 'logisticregres...",0.944973
8,C28,roc_auc,"{'logisticregression__C': 1.0, 'logisticregres...",0.948232


# at III14

In [236]:
proteome_input_file1 = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/data/timepoint-wise/selective_data_III14.csv"
proteome_data_selective_III14 = pd.read_csv(proteome_input_file1)

In [237]:
np.all(np.isin(proteome_data_selective_III14.dtypes.to_list()[5:], ['float64']))

True

In [238]:
proteome_data_selective_III14 = rearagne_columns(proteome_data_selective_III14)

In [239]:
time_point_s_III14 = select_time_point(results_RLR_selective, "III14")

In [240]:
time_point_s_III14

Unnamed: 0,time,scoring,best_params,best_score
0,III14,mcc,"{'logisticregression__C': 0.1, 'logisticregres...",0.676579
1,III14,precision_recall_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.900958
2,III14,roc_auc,"{'logisticregression__C': 0.1, 'logisticregres...",0.909343


In [241]:
kernel_parameter_s_III14 = get_kernel_paramter(time_point_s_III14)

In [242]:
kernel_parameter_s_III14

array([["{'logisticregression__C':", '0.1,',
        "'logisticregression__l1_ratio':", '0.9}']], dtype=object)

In [216]:
RLR_model_s_III14, coeffiecients_s_III14 = RLR_model(proteome_data_selective_III14, kernel_parameter_s_III14)

C-value:0.1
<class 'numpy.float64'>
l1_value:1.0
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.1, l1_ratio=1.0, max_iter=10000,
                                    penalty='elasticnet', solver='saga'))])
Non Zero weights: 2


In [217]:
coeffiecients_s_III14

Unnamed: 0,Pf_antigen_ID,weight
277,PF3D7_1209300.e1s2_2212,-0.050442
315,PF3D7_1201000.e2s1_2210,-0.008373


# at C-1

In [218]:
proteome_input_file2 = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/data/timepoint-wise/selective_data_C-1.csv"
proteome_data_selective_C1 = pd.read_csv(proteome_input_file2)

In [219]:
np.all(np.isin(proteome_data_selective_C1.dtypes.to_list()[5:], ['float64']))

True

In [220]:
proteome_data_selective_C1 = rearagne_columns(proteome_data_selective_C1)

In [221]:
time_point_s_C1 = select_time_point(results_RLR_selective, "C-1")

In [222]:
kernel_parameter_s_C1 = get_kernel_paramter(time_point_s_C1)

In [223]:
kernel_parameter_s_C1

array([["{'logisticregression__C':", '0.1,',
        "'logisticregression__l1_ratio':", '0.7}']], dtype=object)

In [224]:
RLR_model_s_C1, coeffiecients_s_C1 = RLR_model(proteome_data_selective_C1, kernel_parameter_s_C1)

C-value:0.1
<class 'numpy.float64'>
l1_value:0.7
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.1, l1_ratio=0.7, max_iter=10000,
                                    penalty='elasticnet', solver='saga'))])
Non Zero weights: 7


In [225]:
coeffiecients_s_C1

Unnamed: 0,Pf_antigen_ID,weight
390,PF3D7_1115900.iso1.exon2.amp1_4360,-0.256785
260,PF3D7_1229000.iso1.exon1.amp2_2206,-0.222281
915,PF3D7_0514500.iso1.exon6.amp1_1487,-0.138857
1026,PF3D7_0400100.iso1.exon2.amp1_4192,-0.083968
826,PF3D7_0632800.iso1.exon1.amp1_4482,-0.025289
600,PF3D7_0832700.iso1.exon2.amp1_2380,-0.021632
1130,PF3D7_0200200.iso1.exon1.amp1_1284,-0.019726


# at C28

In [226]:
proteome_input_file3 = "/Users/schmidtj/Documents/GitHub/MalariaVaccineEfficacyPrediction/data/timepoint-wise/selective_data_C28.csv"
proteome_data_selective_C28 = pd.read_csv(proteome_input_file3)

In [227]:
np.all(np.isin(proteome_data_selective_C28.dtypes.to_list()[5:], ['float64']))

True

In [228]:
proteome_data_selective_C28 = rearagne_columns(proteome_data_selective_C28)

In [229]:
time_point_s_C28 = select_time_point(results_RLR_selective, "C28")

In [230]:
kernel_parameter_s_C28 = get_kernel_paramter(time_point_s_C28)

In [231]:
kernel_parameter_s_C28

array([["{'logisticregression__C':", '1.0,',
        "'logisticregression__l1_ratio':", '1.0}']], dtype=object)

In [232]:
RLR_model_s_C28, coeffiecients_s_C28 = RLR_model(proteome_data_selective_C28, kernel_parameter_s_C28)

C-value:1.0
<class 'numpy.float64'>
l1_value:1.0
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(l1_ratio=1.0, max_iter=10000,
                                    penalty='elasticnet', solver='saga'))])
Non Zero weights: 17


In [233]:
coeffiecients_s_C28

Unnamed: 0,Pf_antigen_ID,weight
383,PF3D7_1121600.iso1.exon2.amp1_2679,-1.079886
1073,PF3D7_0223300.iso1.exon2.amp1_3611,-0.579821
0,PFE1590w_2119,-0.478398
439,PF3D7_1035200.1o1_117,-0.350594
416,PF3D7_1041100.iso1.exon2.amp1_3013,-0.337831
3,PFB0300c_4064,-0.335427
594,PF3D7_0833300.iso1.exon1.amp1_1870,-0.16968
606,PF3D7_0831500.iso1.exon2.amp1_2951,-0.162559
593,PF3D7_0833400.iso1.exon3.amp1_4185,-0.144171
721,PF3D7_0726400.e5s1_1866,-0.096919
