In [1]:
import pandas as pd

In [2]:
adenocarcinoma_path = "/mnt/beegfs/userdata/j_wang/Algorithm_Resources/dig/downloads/mutation_files/PCAWG/ICGC_only/Adenocarcinoma_tumors_SNV_MNV_INDEL.ICGC.annot.txt.gz"

In [3]:
adenocarcinoma_df = pd.read_table(adenocarcinoma_path, sep='\t', names=['CHROM', 'START', 'END', 'REF', 'ALT', 'SAMPLE', 'GENE', 'ANNOT', 'MUT', 'CONTEXT'])

In [4]:
adenocarcinoma_df.head()

Unnamed: 0,CHROM,START,END,REF,ALT,SAMPLE,GENE,ANNOT,MUT,CONTEXT
0,1,10506,10507,C,A,ee5d5e7d-78cf-4a29-a9ee-56aa3da877dd,.,Noncoding,C>A,CCT
1,1,10508,10509,G,C,097a7d36-905b-72be-e050-11ac0d482c9a,.,Noncoding,G>C,TGA
2,1,10513,10514,G,A,aa4a868a-df23-4eef-a618-e945aa2ce98a,.,Noncoding,G>A,AGA
3,1,10527,10528,C,G,efc39172-083c-4297-b922-3f58df781332,.,Noncoding,C>G,CCT
4,1,10595,10596,G,C,dcc938da-3e45-4c2f-ae0f-47817be04518,.,Noncoding,G>C,TGT


#### chromosome X are dropped

In [5]:
set(adenocarcinoma_df['CHROM'])

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}

#### 1096 samples in adenocarcinoma

In [6]:
len(set(adenocarcinoma_df['SAMPLE']))

1096

#### 7 types of annotation for mutations
Single nucleotide variant (SNV): 'Essential_Splice', 'Missense', 'Noncoding', 'Nonsense', 'Stop_loss', 'Synonymous' <br>
Multi-nucleotide variants (MNVs): 'INDEL'

In [7]:
set(adenocarcinoma_df['ANNOT'])

{'Essential_Splice',
 'INDEL',
 'Missense',
 'Noncoding',
 'Nonsense',
 'Stop_loss',
 'Synonymous'}

#### MNVs REF and ALT lengths

In [8]:
adenocarcinoma_indel_df = adenocarcinoma_df[adenocarcinoma_df['ANNOT'] == 'INDEL']

In [9]:
adenocarcinoma_indel_df[adenocarcinoma_indel_df['REF'] != '-'].apply(lambda x: len(x['REF']), axis=1).describe()

count    610216.000000
mean          3.924963
std           8.072942
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          99.000000
dtype: float64

In [10]:
adenocarcinoma_indel_df[adenocarcinoma_indel_df['ALT'] != '-'].apply(lambda x: len(x['ALT']), axis=1).describe()

count    369068.000000
mean          1.541597
std           2.175199
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          82.000000
dtype: float64

In [11]:
import os 
import glob
from pathlib import Path

#### number of samples in each cohort 

In [12]:
cohort_stat_table = []

for annot_path in glob.glob("/mnt/beegfs/userdata/j_wang/Algorithm_Resources/dig/downloads/mutation_files/PCAWG/ICGC_only/*gz"):
    annot_path = Path(annot_path)
    cohort_name= annot_path.name.split('_')[0]
    cohort_df  = pd.read_table(annot_path, sep='\t', header=None, usecols=[5])
    cohort_stat_table.append([cohort_name, len(set(cohort_df[5]))])

In [13]:
cohort_stat_df = pd.DataFrame(cohort_stat_table, columns=['cohort', 'size'])

In [14]:
cohort_stat_df

Unnamed: 0,cohort,size
0,Eso-AdenoCa,97
1,CNS-PiloAstro,89
2,Breast-LobularCa,7
3,Lymph-BNHL,98
4,Ovary-AdenoCA,69
5,Kidney-RCC,74
6,Skin-Melanoma,70
7,Breast,117
8,Adenocarcinoma,1096
9,Myeloid-MDS,2


#### total number of samples in PCAWG

In [15]:
cohort_stat_df['size'].sum()

7472