In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from functools import reduce

In [2]:
# Plot styling constants (not referenced in the cells shown here):
# `fontsize` is the text size; `lw` is presumably the line width -- confirm at the plotting cells.
fontsize, lw = 16, 2

In [3]:
# Report directories: path1 is used for the DIA-NN reports, path2 for the Spectronaut reports.
path1 = '01_Data/00_Reports/DIANN180/'
path2 = '01_Data/00_Reports/Spec18/'

# Report labels. Every label except 'indiv' maps to a file '<label>_report.tsv';
# 'indiv' stands for the concatenation of the per-run reports 'r1' ... 'r7'.
names = ['indiv', 'mbr',
         '+1_5', '+5_5', '+10_5', '+100_5',
         '+1_10', '+5_10', '+10_10', '+100_10',
         '+1_20', '+5_20', '+10_20', '+100_20']

# Number of per-run report files ('r1_report.tsv' ... 'r7_report.tsv') in each directory.
N_INDIV_RUNS = 7


def _read_report(report_dir, label):
    """Read the tab-separated file '<report_dir><label>_report.tsv' into a DataFrame."""
    return pd.read_csv(report_dir + label + '_report.tsv', sep='\t')


def _load_reports(report_dir, labels, n_runs=N_INDIV_RUNS):
    """Load all reports found under one report directory.

    Parameters
    ----------
    report_dir : str
        Directory prefix (including the trailing slash) that holds the report files.
    labels : list of str
        Report labels; every label other than 'indiv' is read from
        '<label>_report.tsv', in the order given.
    n_runs : int
        Number of per-run files 'r1_report.tsv' ... 'r<n_runs>_report.tsv'.

    Returns
    -------
    list of pandas.DataFrame
        The concatenated per-run reports first, followed by one frame per
        non-'indiv' label. Positions line up with `labels` as long as 'indiv'
        is its first entry.
    """
    per_run = [_read_report(report_dir, 'r' + str(run)) for run in range(1, n_runs + 1)]
    pooled = [_read_report(report_dir, label) for label in labels if label != 'indiv']
    # The original row index of every per-run file is kept (no ignore_index),
    # so index values repeat across runs in the concatenated frame.
    return [pd.concat(per_run)] + pooled


# for diann
dnn_reports = _load_reports(path1, names)

# for spectronaut
spec_reports = _load_reports(path2, names)
        

In [42]:
def _organisms_from_names(protein_names):
    """Return the ';'-joined last '_'-separated token of each ';'-separated protein name.

    Example: 'ALBU_HUMAN;RS2_ECOLI' -> 'HUMAN;ECOLI'.
    """
    return ';'.join(name.split('_')[-1] for name in protein_names.split(';'))


def _organisms_from_accessions(protein_group, ecoli_entries):
    """Label each ';'-separated entry 'ECOLI' if it is in `ecoli_entries`, otherwise 'HUMAN'.

    Note that anything absent from `ecoli_entries` is labelled 'HUMAN'.
    """
    return ';'.join('ECOLI' if prot in ecoli_entries else 'HUMAN'
                    for prot in protein_group.split(';'))


def _single_organism_00perc(frame, organisms, run_col):
    """Annotate `frame` with an 'Organism' column and filter it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Report to annotate; it is not modified.
    organisms : list of str
        One organism string per row of `frame`, in row order.
    run_col : str
        Name of the column holding the run / file name.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the rows whose 'Organism' does not
        contain both 'HUMAN' and 'ECOLI' and whose `run_col` contains '00perc'.
    """
    annotated = frame.assign(Organism=organisms)
    has_human = annotated['Organism'].str.contains('HUMAN')
    has_ecoli = annotated['Organism'].str.contains('ECOLI')
    # NOTE(review): a substring match on '00perc' would also match a name containing
    # '100perc' -- confirm no such run exists in these reports.
    from_00perc = annotated[run_col].str.contains('00perc')
    # .copy() returns a standalone frame, so columns can be added to it later
    # without pandas raising SettingWithCopyWarning.
    return annotated[~(has_human & has_ecoli) & from_00perc].copy()


# for diann
for i, frame in enumerate(dnn_reports):
    organisms = [_organisms_from_names(entry) for entry in frame['Protein.Names']]
    dnn_reports[i] = _single_organism_00perc(frame, organisms, 'Run')

# for spectronaut
ec_anno = pd.read_csv('01_Data/Ecoli_K12_annotation_20230630.tsv', sep='\t')
ec_anno_set = set(ec_anno['Entry'])

for i, frame in enumerate(spec_reports):
    organisms = [_organisms_from_accessions(group, ec_anno_set)
                 for group in frame['PG.ProteinGroups']]
    spec_reports[i] = _single_organism_00perc(frame, organisms, 'R.FileName')

In [43]:
# Q-value columns selected from the DIA-NN and Spectronaut reports, respectively.
dnn_qvalues = ['Q.Value', 'Global.Q.Value', 'PG.Q.Value', 'Global.PG.Q.Value', 'Lib.Q.Value', 'Lib.PG.Q.Value']
spec_qvalues = ['EG.Qvalue', 'PG.Qvalue', 'PG.QValue (Run-Wise)']

# Set to True to write the q-value + ECOLI tables to disk (disabled by default,
# matching the previously commented-out export).
EXPORT_QVAL_TABLES = False


def _with_ecoli_flag(frame):
    """Return a new frame with an 'ECOLI' column added.

    The column holds '+' for rows whose 'Organism' string contains 'ECOLI'
    and '' otherwise. `frame` itself is not modified, which avoids assigning
    into a frame that was produced by filtering another one.
    """
    flags = ['+' if 'ECOLI' in organism else '' for organism in frame['Organism']]
    return frame.assign(ECOLI=flags)


# for diann
for i, frame in enumerate(dnn_reports):
    dnn_reports[i] = _with_ecoli_flag(frame)
    if EXPORT_QVAL_TABLES:
        dnn_reports[i][dnn_qvalues + ['ECOLI']].to_csv(
            '01_Data/05_ROC_data/Qvals+Ecoli/DIANN_' + names[i] + '_Qvals+Ecoli.tsv', sep='\t')

# for spectronaut
for i, frame in enumerate(spec_reports):
    spec_reports[i] = _with_ecoli_flag(frame)
    if EXPORT_QVAL_TABLES:
        spec_reports[i][spec_qvalues + ['ECOLI']].to_csv(
            '01_Data/05_ROC_data/Qvals+Ecoli/Spec_' + names[i] + '_Qvals+Ecoli.tsv', sep='\t')

In [47]:
# Display one processed Spectronaut report. Position 8 lines up with names[8] ('+10_10').
# Only the last expression of a cell is rendered, so a bare `names` line before it
# would show nothing and is left out.
inspected_idx = 8
spec_reports[inspected_idx]

Unnamed: 0,E.Errors,E.LFQMethod,E.Warnings,R.FileName,R.Fraction,R.Condition,R.Replicate,R.Label,PG.GroupLabel,PG.ProteinAccessions,...,EG.Cscore,EG.NormalizedCscore,FG.Charge,FG.IntMID,FG.LabeledSequence,FG.PrecMz,FG.PrecMzCalibrated,FG.Quantity,Organism,ECOLI
0,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866,,HE10_10,1,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866.d,A0A024RBG1;Q9NZJ9,A0A024RBG1;Q9NZJ9,...,28.480253,28.459311,2,_AAC[+57]LC[+57]FR_,_AAC[Carbamidomethyl (C)]LC[Carbamidomethyl (C...,449.207123,449.204491,11.518570,HUMAN;HUMAN,
1,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866,,HE10_10,1,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866.d,A0A0B4J2D5;P0DPI2,A0A0B4J2D5;P0DPI2,...,36.440941,36.425417,2,_NVLTESAR_,_NVLTESAR_,445.240509,445.237788,42.086121,HUMAN;HUMAN,
2,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866,,HE10_10,1,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866.d,A0A0B4J2D5;P0DPI2,A0A0B4J2D5;P0DPI2,...,32.367344,32.349047,2,_NLSTFAVDGK_,_NLSTFAVDGK_,526.274536,526.271671,28.853369,HUMAN;HUMAN,
3,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866,,HE10_10,1,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866.d,A0A0B4J2D5;P0DPI2,A0A0B4J2D5;P0DPI2,...,34.185150,34.168091,2,_EVVEAHVDQK_,_EVVEAHVDQK_,577.296021,577.292405,13.997160,HUMAN;HUMAN,
4,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866,,HE10_10,1,20220407_KK_HeLa_Ecoli-00perc_1ng_R1_A1_1_6866.d,A0A0B4J2D5;P0DPI2,A0A0B4J2D5;P0DPI2,...,39.652893,39.639555,2,_WPYAGTAEAIK_,_WPYAGTAEAIK_,603.811340,603.807532,19.445438,HUMAN;HUMAN,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275168,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914,,HE10_10,7,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914.d,Q9Y6Y8,Q9Y6Y8,...,26.582266,26.507861,2,_SIIEC[+57]VDDFR_,_SIIEC[Carbamidomethyl (C)]VDDFR_,627.295166,627.294178,7.954617,HUMAN,
275169,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914,,HE10_10,7,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914.d,Q9Y6Y8,Q9Y6Y8,...,26.450150,26.375443,2,_C[+57]PGPLAVANGVVK_,_C[Carbamidomethyl (C)]PGPLAVANGVVK_,641.352844,641.351905,4.236940,HUMAN,
275170,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914,,HE10_10,7,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914.d,Q9Y6Y8,Q9Y6Y8,...,12.515084,12.408642,2,_AHTSSTQLQEELEK_,_AHTSSTQLQEELEK_,800.894287,800.892328,1.000000,HUMAN,
275171,0,MaxLFQ,0,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914,,HE10_10,7,20220407_KK_HeLa_Ecoli-00perc_1ng_R7_A7_1_6914.d,Q9Y6Y8,Q9Y6Y8,...,-5.709745,-5.713743,2,_QGFISSLK_,_QGFISSLK_,440.250366,440.249716,1.000000,HUMAN,
