# A complete case study 

In the previous tutorials we have introduced the building blocks of IPTK, and how it can be used to analyize different aspects of IP experiments. In the current tutorials we are going to put it all togther and reproduce some of the results described in the paper (ElAbd, 2021, manuscript)).

## Import the modules

In [1]:
## First, we import the library modules 
from IPTK.IO.InFunctions import parse_xml_based_format_to_identification_table, download_pdb_entry
from IPTK.IO.OutFunctions import write_auto_named_peptide_to_fasta 
from IPTK.Utils.UtilityFunction import get_experiment_summary, combine_summary
from IPTK.Classes.Experiment import Experiment
from IPTK.Classes.ExperimentSet import ExperimentSet
from IPTK.Classes.Database import SeqDB, GeneExpressionDB, CellularLocationDB,OrganismDB
from IPTK.Classes.Tissue import Tissue
from IPTK.Classes.HLASet import HLASet
from IPTK.Classes.Proband import Proband
from IPTK.Classes.Peptide import Peptide 
from IPTK.Visualization.vizTools import (plot_parent_protein_expression_in_tissue,plotly_parent_protein_expression_in_tissue, plot_gene_expression_vs_num_peptides, 
plot_num_protein_per_location, plot_num_peptide_per_go_term, plot_overlap_heatmap,plot_protein_coverage,plot_coverage_and_annotation, plot_paired_represention, 
plot_MDS_from_ic_coverage)
from IPTK.Analysis.AnalysisFunction import compute_binary_distance, compute_ic_distance_experiments
## second we import common python modules 
import matplotlib.pyplot as plt
from typing import List,Set,Dict 
import pandas as pd 
import numpy as np
import random 
import seaborn as sns 
from tqdm import tqdm 



## Define the experiment-invariants

In [None]:
proband: Proband = Proband(name='Tutorial_3') # a placeholder value for the name of the proband, here just a dummy placeholder value   
hla_set: HLASet = HLASet(hlas=['HLA-DRB1*15:01','HLA-DRB1*13:01']) # define the HLA alleles, here just a dummy placeholder value 
seqs: SeqDB = SeqDB(path2fasta="data/human_proteome.fasta") # load the sequences data
expresson_profile: GeneExpressionDB= GeneExpressionDB() # use the default expression database 
protein_locations: CellularLocationDB= CellularLocationDB() # use the default cellular location.  
tissue: Tissue = Tissue(name='total PBMC', # define the targe tissue --> PBMC 
                        main_exp_value=expresson_profile,  
                        main_location=protein_locations)

## Load the idxXML Files

#### Load the input idXML files

In [None]:
# load rep1 with 5e7 cells 
rep1_5e7: pd.DataFrame = parse_xml_based_format_to_identification_table(
    path2XML_file="idXML/0810202_0.5_all_ids_merged_psm_perc_filtered.idXML",
    path2fastaDB="database/human_proteome.fasta",
    is_idXML= True) 
# load rep1 with 1e8 cells 
rep1_1e8: pd.DataFrame = parse_xml_based_format_to_identification_table(
    path2XML_file="idXML/0810202_1_all_ids_merged_psm_perc_filtered.idXML",
    path2fastaDB="database/human_proteome.fasta",
    is_idXML= True) 
# load rep2 with 5e7 cells 
rep2_5e7: pd.DataFrame = parse_xml_based_format_to_identification_table(
    path2XML_file="idXML/27112020_0.5_all_ids_merged_psm_perc_filtered.idXML",
    path2fastaDB="database/human_proteome.fasta",
    is_idXML= True) 
# load rep2 with 1e8 cells 
rep2_1e8: pd.DataFrame = parse_xml_based_format_to_identification_table(
    path2XML_file="idXML/27112020_1_all_ids_merged_psm_perc_filtered.idXML",
    path2fastaDB="database/human_proteome.fasta",
    is_idXML= True) 

#### Create the experiments 

In [None]:
# define experiment for rep1 5e7 cells 
exp_rep1_5e7= Experiment(proband=proband,hla_set=hla_set,tissue=tissue,database=seqs, 
                 ident_table=rep1_5e7)
# define experiment for rep1 1e8 cells 
exp_rep1_1e8= Experiment(proband=proband,hla_set=hla_set,tissue=tissue,database=seqs, 
                 ident_table=rep1_1e8)
# define experiment for rep2 5e7 cells 
exp_rep2_5e7= Experiment(proband=proband,hla_set=hla_set,tissue=tissue,database=seqs, 
                 ident_table=rep2_5e7)
# define experiment for rep2 1e8 cells 
exp_rep2_1e8= Experiment(proband=proband,hla_set=hla_set,tissue=tissue,database=seqs, 
                 ident_table=rep2_1e8)

### Create an ExperimentSet object 

In [None]:
exps: ExperimentSet = ExperimentSet(**{'rep1_5e7':exp_rep1_5e7, 
                                    'rep1_1e8':exp_rep1_1e8, 
                                    'rep2_5e7':exp_rep2_5e7,
                                    'rep2_1e8':exp_rep2_1e8})

## Computing N-representation using HLA-atlas data

First let's define some help functions to automate parsing and loading the data

In [2]:
def load_and_fill_exps_list(path2files:str,res:Dict[str,Experiment], path2seqDB: str):
    """Load the CSV files the generated file object 

    :param path2files: The path to load the CSV files 
    :type path2files: str
    :param res: a list of experimental objects 
    :type res: Dict[str,Experiment]
    :param path2seqDB: the path to the sequence database
    :type  path2seqDB: str 
    """
    # get a list of the CSV file
    file_names: List[str]=get_level_two_subdir(path2files)
    for fname in tqdm(file_names): 
        # generate the tissue name 
        tissue_name: str =fname.split('/')[-1].split('.')[0].split('_')[-1]
        # load the tables as an identification table 
        ident_table: pd.DataFrame = parse_text_table(
            path2file=fname,
            path2fastaDB=path2seqDB,
            seq_column="peptide_seq",
            accession_column="protein_names") 
        # create a proband instance 
        donor: Proband = Proband(name='CLEAVAGE_MOTIF_'+tissue_name) 
        # a place holder for the current usage as we are only interested in the coverage 
        hlas: HLASet = HLASet(hlas=['HLA-DRB1*15:01','HLA-DRB1*13:01']) 
        # generate a sequence database 
        seqs: SeqDB = SeqDB(path2fasta=path2seqDB)
        expresson_profile: GeneExpressionDB= GeneExpressionDB()
        protein_locations: CellularLocationDB= CellularLocationDB()
        # if the tissue is not supported we skip it, i.e. not defined in the expression database 
        try: 
            tissue: Tissue = Tissue(name=tissue_name.lower(),
                        main_exp_value=expresson_profile, 
                        main_location=protein_locations)
        except KeyError:
            print(f'Tissue: {tissue_name} is not on the database')
            continue
        # otherwise we create an experimental instance for this tissue 
        res[tissue_name]=Experiment(
            proband=donor, hla_set=hlas, 
            tissue=tissue, database=seqs, ident_table=ident_table)
        # print progress statment 
        print(f'Tissue: {tissue_name} has been parsed')
    return    
def get_level_two_subdir(path2check: str)->List[str]:
    """
    """
    results: List[str]= []
    level_one_dirs: List[str] = os.listdir(path2check)
    for dir_ in level_one_dirs: 
        path_=os.path.join(path2check,dir_)
        if os.path.isdir(path_):
            level_two_dirs: List[str] = os.listdir(path_)
            for dir__ in level_two_dirs: 
                results.append(os.path.join(path_,dir__))
    return results

In [4]:
## load exps dict by filling the dict of experiments 
exps: Dict[str,Experiment] = dict()
load_and_fill_exps_list("AUT01-DN08" ,exps,'')

NameError: name 'get_level_two_subdir' is not defined