In [20]:
import pandas as pd
import re
import altair as alt

In [3]:
# Read the report (a Parquet file; a DIA-NN report, judging by its file name)
# into a DataFrame.
report_path = r"N:\BCaP_6Line_DIANN\BCaP_6Line_DIANN_report.parquet"
df = pd.read_parquet(report_path)

In [4]:
# Normalise the column labels: lower-case them and replace every literal "."
# with "_". Later cells rely on this to reach columns as attributes
# (df.run, df.protein_ids, ...).
df.columns = df.columns.str.lower().str.replace(".", "_", regex=False)

In [11]:
def grab_sample_info(s):
    """Extract sample name, biological replicate and technical replicate from a run name.

    The run name is searched for ``BCaP_<SAMPLE>-<bio_rep>_run<tech_rep>``,
    where ``<SAMPLE>`` is made of upper-case letters and digits and both
    replicate fields are integers, e.g. ``"BCaP_BPH1-1_run1_1_1_908"``.

    Parameters
    ----------
    s : str
        Run name to parse.

    Returns
    -------
    str
        The three fields joined with ``"-"``, e.g. ``"BPH1-1-1"``.

    Raises
    ------
    ValueError
        If ``s`` does not contain the expected pattern.
    """
    # \d+ rather than \d: a single-digit group would truncate "run10" to "1"
    # and would not match a two-digit biological replicate at all.
    match = re.search(r"BCaP_([A-Z0-9]+)-(\d+)_run(\d+)", s)
    if match is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError(
            f"Run name {s!r} does not match 'BCaP_<sample>-<bio_rep>_run<tech_rep>'"
        )

    return "-".join(match.groups())
        

# Parse every run name into a "<sample>-<bio_rep>-<tech_rep>" string; the
# result is a Series that shares df's index.
info = df["run"].map(grab_sample_info)

In [14]:
# Split "<sample>-<bio_rep>-<tech_rep>" into three columns (the values stay
# strings) and attach them to df by matching index labels.
# NOTE(review): `info` is rebound from a Series to a DataFrame here, so this
# cell cannot be re-run without first re-running the cell that creates `info`.
info = info.str.split('-', expand=True).set_axis(
    ["sample_name", "bio_rep", "tech_rep"], axis=1
)

df = pd.merge(df, info, left_index=True, right_index=True)
df.head()

Unnamed: 0,file_name,run,protein_group,protein_ids,protein_names,genes,pg_quantity,pg_normalised,pg_maxlfq,genes_quantity,...,ptm_q_value,ptm_site_confidence,lib_ptm_site_confidence,im,iim,predicted_im,predicted_iim,sample_name,bio_rep,tech_rep
0,D:\Data\DIA-NN\GD_20230404_BCaP\BCaP_BPH1-1_ru...,BCaP_BPH1-1_run1_1_1_908,P55011,P55011,S12A2_HUMAN,SLC12A2,9598.63,10854.8,12006.9,9598.63,...,0.0,0.995124,0.999917,1.06292,1.06542,1.07996,1.04615,BPH1,1,1
1,D:\Data\DIA-NN\GD_20230404_BCaP\BCaP_BPH1-1_ru...,BCaP_BPH1-1_run2_1_1_909,P55011,P55011,S12A2_HUMAN,SLC12A2,9068.59,10110.2,10812.6,9068.59,...,0.0,0.999772,0.999917,1.06591,1.06542,1.08115,1.04861,BPH1,1,2
2,D:\Data\DIA-NN\GD_20230404_BCaP\BCaP_BPH1-3_ru...,BCaP_BPH1-3_run1_1_1_914,P55011,P55011,S12A2_HUMAN,SLC12A2,10028.5,11783.6,12627.2,10028.5,...,0.0,0.0,0.999917,1.07455,1.06542,1.08134,1.05721,BPH1,3,1
3,D:\Data\DIA-NN\GD_20230404_BCaP\BCaP_M1-1_run1...,BCaP_M1-1_run1_1_1_953,P55011,P55011,S12A2_HUMAN,SLC12A2,17645.4,17007.8,15965.9,17645.4,...,0.0,0.999772,0.999917,1.03542,1.06542,1.05679,1.04328,M1,1,1
4,D:\Data\DIA-NN\GD_20230404_BCaP\BCaP_M1-1_run2...,BCaP_M1-1_run2_1_1_954,P55011,P55011,S12A2_HUMAN,SLC12A2,14797.9,14952.7,16692.0,14797.9,...,0.0,0.995467,0.999917,1.045,1.06542,1.06361,1.04517,M1,1,2


In [25]:
# Average every numeric column per modified sequence within each
# sample / biological replicate. numeric_only=True is needed because df also
# holds text columns (file_name, run, ...): pandas >= 2.0 raises TypeError on
# those instead of silently dropping them.
grouped = (
    df.groupby(["sample_name", "bio_rep", "modified_sequence"])
    .mean(numeric_only=True)
    .reset_index("modified_sequence")
)
samples = df.sample_name.unique()
reps = df.bio_rep.unique()

# One sub-frame per (sample_name, bio_rep) pair that actually occurs. Looking
# pairs up here, rather than with grouped.loc[(s, r), :], avoids a KeyError for
# combinations that do not exist and always yields a DataFrame.
peptides_by_pair = dict(iter(grouped.groupby(level=["sample_name", "bio_rep"])))

# Collect plain records and build the frame once instead of concatenating
# inside the loop.
peptide_records = []
for s in samples:
    for r in reps:
        data = peptides_by_pair.get((s, r))
        if data is None:
            continue  # this sample has no such biological replicate
        label = f"{s}-{r}"
        peptide_records.append(
            {"sample": label, "values": len(data), "kind": "Total"}
        )
        peptide_records.append(
            {"sample": label, "values": len(data[data.proteotypic == 1]), "kind": "Unique"}
        )

peptide_data = pd.DataFrame(peptide_records, columns=["sample", "values", "kind"])

# One small bar panel per sample: total vs. proteotypic ("Unique") peptide counts.
alt.Chart(peptide_data).mark_bar().encode(
    x=alt.X("kind:N", title="",
                axis=alt.Axis(labelAngle=-45)),
    y=alt.Y("values:Q", title="Peptide Count"),
    column="sample:N",
    color="kind:N"
)




In [27]:
# Turn ";"-separated protein ID strings into one row per protein ID.
# explode() only expands list-like cells, so the split lists have to be stored
# in the very column that is exploded ("protein_ids"); writing them to any
# other column leaves the strings untouched and explode() becomes a no-op.
# Exploded rows keep their original index label, so df's index is no longer
# unique after this cell.
df["protein_ids"] = df["protein_ids"].str.split(";")
df = df.explode("protein_ids")

In [30]:
# Average every numeric column per (modified sequence, protein ID) within each
# sample / biological replicate. numeric_only=True is needed because df also
# holds text columns: pandas >= 2.0 raises TypeError on those instead of
# silently dropping them.
grouped = (
    df.groupby(["sample_name", "bio_rep", "modified_sequence", "protein_ids"])
    .mean(numeric_only=True)
    .reset_index(["modified_sequence", "protein_ids"])
)

# One sub-frame per (sample_name, bio_rep) pair that actually occurs. Looking
# pairs up here, rather than with grouped.loc[(s, r), :], avoids a KeyError for
# combinations that do not exist and always yields a DataFrame.
proteins_by_pair = dict(iter(grouped.groupby(level=["sample_name", "bio_rep"])))

# Collect plain records and build the frame once instead of concatenating
# inside the loop. `samples` and `reps` come from the peptide-count cell.
protein_records = []
for s in samples:
    for r in reps:
        data = proteins_by_pair.get((s, r))
        if data is None:
            continue  # this sample has no such biological replicate
        label = f"{s}-{r}"
        protein_records.append(
            {"sample": label, "values": data.protein_ids.nunique(), "kind": "Total"}
        )
        protein_records.append(
            {
                "sample": label,
                "values": data.loc[data.proteotypic == 1, "protein_ids"].nunique(),
                "kind": "Unique",
            }
        )

protein_data = pd.DataFrame(protein_records, columns=["sample", "values", "kind"])

# Render explicitly: another chart follows in this cell, and only a cell's
# last expression is shown automatically. `display` is the built-in that the
# IPython/Jupyter kernel provides.
display(
    alt.Chart(protein_data).mark_bar().encode(
        x=alt.X("kind:N", title="",
                    axis=alt.Axis(labelAngle=-45)),
        y=alt.Y("values:Q", title="Protein Count"),
        column="sample:N",
        color="kind:N"
    )
)

# Protein counts drawn as one line per kind ("Total" / "Unique") across samples.
sample_axis = alt.X("sample:N", title="", axis=alt.Axis(labelAngle=-45))
count_axis = alt.Y("values:Q", title="Protein Count")

alt.Chart(protein_data).mark_line().encode(
    x=sample_axis,
    y=count_axis,
    color="kind:N",
)