In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score


def add_tcrspep(df, setting="onlyseq"):
    if setting=="noTCellType":
        for i, col in enumerate(["tcra","tcrb","peptide","va","ja","vb","ja","mhc"]):
            if i ==0: ser = df[col]
            else: ser += ":" + df[col]
        df["tcrs_pep"] = ser
    elif setting=="onlyseq":
        df["tcrs_pep"] = df["tcra"] + ":" + df["tcrb"] + "&"  + df["peptide"] 
    
    return df

def renamecolumns(df):
    dic = {'TRA':"tcra", "TRB":"tcrb", 
           "TRAV":"va", "TRAJ":"ja",
              "TRBV":"vb", "TRBJ":"jb",
                "T-Cell-Type":"t_cell_type",
                "Peptide":"peptide", "MHC":"mhc"}
    inverse_dic = {v:k for k,v in dic.items()}
           
    return df.rename(columns=dic)

def eval_df(df):
    print("len(df)", len(df))
    print("pos rate", df.sign.mean())
    yt = df.sign
    yp = df.Score
    print("roc", roc_auc_score(yt,yp))    
    print("avp", average_precision_score(yt,yp))

train_mcpas = pd.DataFrame(pd.read_pickle("./Samples/mcpas_train_samples.pickle")).query('tcra!="UNK"')
train_mcpas = add_tcrspep(train_mcpas)
train_vdjdb = pd.DataFrame(pd.read_pickle("./Samples/vdjdb_no10x_train_samples.pickle")).query('tcra!="UNK"')
train_vdjdb = add_tcrspep(train_vdjdb)


In [2]:
# df_mcpas = pd.read_parquet("./mcpas_results.parquet")
# df_vdjdb = pd.read_parquet("./vdjdb_results.parquet")

df_mcpas = pd.read_parquet("./mcpas_onlyseqresults.parquet")
df_vdjdb = pd.read_parquet("./vdjdb_onlyseqresults.parquet")



df_mcpas = renamecolumns(df_mcpas).query('tcra!="UNK"')
df_mcpas = add_tcrspep(df_mcpas, "onlyseq")

df_vdjdb = renamecolumns(df_vdjdb).query('tcra!="UNK"')
df_vdjdb = add_tcrspep(df_vdjdb, "onlyseq")

In [3]:
eval_df(df_mcpas[~df_mcpas.tcrs_pep.isin(train_mcpas.tcrs_pep)])

len(df) 4729
pos rate 0.1511947557623176
roc 0.6938775474649913
avp 0.27838590829185034


In [4]:
# eval_df(df_mcpas)
df_vdjdb = df_vdjdb[~df_vdjdb.tcrs_pep.isin(train_vdjdb.tcrs_pep)]
eval_df(df_vdjdb)


len(df) 4010
pos rate 0.15037406483790525
roc 0.6162972438463198
avp 0.17918598732717866


# No Tcell Type

In [5]:
# df_mcpas = pd.read_parquet("./mcpas_results.parquet")
# df_vdjdb = pd.read_parquet("./vdjdb_results.parquet")

df_mcpas2 = pd.read_parquet("./mcpas_noTCellTyperesults.parquet")
df_vdjdb2 = pd.read_parquet("./vdjdb_noTCellTyperesults.parquet")

df_mcpas2 = renamecolumns(df_mcpas2).query('tcra!="UNK"')
df_mcpas2 = add_tcrspep(df_mcpas2, "noTCellType")

df_vdjdb2 = renamecolumns(df_vdjdb2).query('tcra!="UNK"')
df_vdjdb2 = add_tcrspep(df_vdjdb2, "noTCellType")

In [6]:
eval_df(df_mcpas2[~df_mcpas2.tcrs_pep.isin(train_mcpas.tcrs_pep)])

len(df) 5611
pos rate 0.16717162716093387
roc 0.7239629555441892
avp 0.3095484825704622


In [7]:
a = df_vdjdb2[~df_vdjdb2.tcrs_pep.isin(train_vdjdb.tcrs_pep)].loc[df_vdjdb.index]
eval_df(a)


len(df) 4010
pos rate 0.15037406483790525
roc 0.44781717087198775
avp 0.12977735554536068


In [8]:
!ls -l *.parquet

/bin/bash: /home/kyohei/miniconda3/envs/cryoem/lib/libtinfo.so.6: no version information available (required by /bin/bash)
-rw-rw-r-- 1 kyohei kyohei 314733  6月 18 01:03 mcpas_noTCellTyperesults.parquet
-rw-rw-r-- 1 kyohei kyohei 310220  6月 18 01:04 mcpas_onlyseqresults.parquet
-rw-rw-r-- 1 kyohei kyohei 456221  6月 18 01:04 vdjdb_noTCellTyperesults.parquet
-rw-rw-r-- 1 kyohei kyohei 443035  6月 18 01:04 vdjdb_onlyseqresults.parquet


In [10]:

df_mcpas2 = pd.read_parquet("./mcpas_noTCellTyperesults.parquet")
df_vdjdb2 = pd.read_parquet("./vdjdb_noTCellTyperesults.parquet")

df_mcpas2 = renamecolumns(df_mcpas2).query('tcra!="UNK"')
df_mcpas2 = add_tcrspep(df_mcpas2, "onlyseq")

df_vdjdb2 = renamecolumns(df_vdjdb2).query('tcra!="UNK"')
df_vdjdb2 = add_tcrspep(df_vdjdb2, "onlyseq")


eval_df(df_mcpas2[~df_mcpas2.tcrs_pep.isin(train_mcpas.tcrs_pep)])

eval_df(df_vdjdb2[~df_vdjdb2.tcrs_pep.isin(train_vdjdb.tcrs_pep)])


len(df) 4729
pos rate 0.1511947557623176
roc 0.7306497189905262
avp 0.28110697086616077
len(df) 4010
pos rate 0.15037406483790525
roc 0.44781717087198775
avp 0.12977735554536068


In [11]:
!mv ./eval_ergo.ipynb ../TCRPrediction/analysis/2023-0616_eval_ergo.ipynb

/bin/bash: /home/kyohei/miniconda3/envs/cryoem/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/home/kyohei/workspace/ERGO-II
