In [None]:
# Scripts to do:
# - Refresher
# - Mapper

In [1]:
# Module import
import pandas as pd
import os

# Columns kept from every MAF file: sample/case identifiers together with
# the fields describing each variant.
# MAF format specification:
# https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/
relevant_columns = [
    'Tumor_Sample_UUID',
    'Matched_Norm_Sample_UUID',
    'case_id',
    'NCBI_Build',
    'Chromosome',
    'Start_Position',
    'End_Position',
    'Variant_Classification',
    'Variant_Type',
    'Reference_Allele',
    'Tumor_Seq_Allele2',
    'HGVSc',
    'HGVSp',
    'HGVSp_Short',
    'Tumor_Sample_Barcode',
    'all_effects',
    'Transcript_ID',
    'Gene',
    'Feature',
    'Feature_type',
    'HGNC_ID',
    'ENSP',
    'RefSeq',
]

# Start from an empty table and register each wanted column on it,
# in the order given by relevant_columns
data = pd.DataFrame()
for column_name in relevant_columns:
    data[column_name] = []

# The downloaded MAF files are loaded into 'data' in a later step;
# this message only marks the start of that process
print('Starting data extraction...')

Starting data extraction...


In [9]:
import time
from tqdm import tqdm

# Small tqdm demo: five three-letter placeholder items ('aaa' ... 'eee')
my_list = [letter * 3 for letter in 'abcde']

for entry in tqdm(my_list):
    time.sleep(2)  # pause so the progress bar advances visibly between items
    print(entry)

 20%|█████████                                    | 1/5 [00:02<00:08,  2.01s/it]

aaa


 40%|██████████████████                           | 2/5 [00:04<00:06,  2.01s/it]

bbb


 60%|███████████████████████████                  | 3/5 [00:06<00:04,  2.01s/it]

ccc


 80%|████████████████████████████████████         | 4/5 [00:08<00:02,  2.01s/it]

ddd


100%|█████████████████████████████████████████████| 5/5 [00:10<00:00,  2.01s/it]

eee





In [10]:
import time
from tqdm import tqdm

# Directory holding the downloaded MAF files
maf_dir = "../data/maf_files"

# Keep only directory entries whose name contains '.maf'. The names are sorted
# so the row order of the combined table (and of the CSV written below) does
# not depend on the arbitrary order returned by os.listdir.
maf_names = sorted(name for name in os.listdir(maf_dir) if '.maf' in str(name))

# Gather the per-file frames in a list and concatenate once at the end.
# Calling pd.concat inside the loop copies every row read so far on each
# iteration, which makes the total cost quadratic in the number of files.
maf_frames = []
for maf_name in tqdm(maf_names):
    maf_frame = pd.read_csv(
        os.path.join(maf_dir, maf_name),
        sep='\t',
        skiprows=7,                # the first 7 lines are skipped ...
        header=0,                  # ... and the next line is used as the header row
        low_memory=False,
        usecols=relevant_columns,  # parse only the columns that are kept
    )
    # usecols keeps the file's own column order; re-select so the columns
    # follow the order of relevant_columns
    maf_frames.append(maf_frame[relevant_columns])

# Build 'data' from scratch (with a fresh 0..n-1 index) instead of appending to
# a previous value, so re-running this cell does not duplicate rows.
# pd.concat raises on an empty list, hence the explicit empty-table fallback.
if maf_frames:
    data = pd.concat(maf_frames, ignore_index=True)
else:
    data = pd.DataFrame(columns=relevant_columns)

# Tumor Barcode shortening to get the sample barcode instead of the
# aliquot barcode (first 16 characters = sample barcode)
data['Tumor_Sample_Barcode'] = data['Tumor_Sample_Barcode'].str.slice(stop=16)

# Make sure the output directory exists
os.makedirs('../temp/', exist_ok=True)

# and write the combined table there as a .csv file (without the index column)
data.to_csv('../temp/maf_data.csv', index=False)

print('Data extraction completed.')

  4%|█▍                                     | 386/10631 [01:10<32:27,  5.26it/s]Exception ignored in: <function tqdm.__del__ at 0x1274969e0>
Traceback (most recent call last):
  File "/opt/homebrew/mambaforge/lib/python3.10/site-packages/tqdm/std.py", line 1162, in __del__
    self.close()
  File "/opt/homebrew/mambaforge/lib/python3.10/site-packages/tqdm/gui.py", line 91, in close
    if self.disable:
AttributeError: 'tqdm_gui' object has no attribute 'disable'
Exception ignored in: <function tqdm.__del__ at 0x1274969e0>
Traceback (most recent call last):
  File "/opt/homebrew/mambaforge/lib/python3.10/site-packages/tqdm/std.py", line 1162, in __del__
    self.close()
  File "/opt/homebrew/mambaforge/lib/python3.10/site-packages/tqdm/gui.py", line 91, in close
    if self.disable:
AttributeError: 'tqdm_gui' object has no attribute 'disable'
100%|███████████████████████████████████| 10631/10631 [1:00:59<00:00,  2.90it/s]


Data extraction completed.


In [2]:
import os
from pymongo import MongoClient
import bycon
import pandas as pd
import time, base36

# Create a MongoDB client with the driver's default settings and keep handles
# to the 'progenetix' database and its 'biosamples' collection
client = MongoClient()
db = client['progenetix']
bs = db['biosamples']

In [3]:
# Read the tab-separated mapping file into 'df'
mapping_path = '../temp/mappingfile.tsv'
df = pd.read_csv(mapping_path, sep='\t')

In [16]:
# List the column labels of the mapping table
df.columns

Index(['biosample_id', 'variant_id', 'callset_id', 'chromosome', 'start',
       'end', 'strand', 'reference_bases', 'alternate_bases', 'hgvsc',
       'variant_classification', 'variant_type', 'hgvsp', 'hgvsp_short',
       'aliquot_id', 'reference_id', 'case_id', 'sample_id'],
      dtype='object')

In [14]:
# Row selectors for the two variant types handled here
is_snp = df['variant_type'] == 'SNP'
is_ins = df['variant_type'] == 'INS'

# Insertions: positional and allele columns only
INS = df.loc[is_ins, ['start', 'end', 'reference_bases', 'alternate_bases']].copy()

# SNPs: keep the HGVSc string with the positional/allele columns, discard rows
# that have no HGVSc value, and renumber the remaining rows from 0
snp_columns = ['hgvsc', 'reference_bases', 'alternate_bases', 'start', 'end']
dd_filtered = (
    df.loc[is_snp, snp_columns]
    .dropna(subset=['hgvsc'])
    .reset_index(drop=True)
)

# Split each HGVSc string at '>' once:
#   - the last character of the part before '>' becomes 'HGVSC_ref_base'
#   - the part after '>' becomes 'HGVSC_alt_base'
hgvsc_parts = dd_filtered['hgvsc'].str.split('>')
dd_filtered['HGVSC_ref_base'] = hgvsc_parts.str[0].str[-1]
dd_filtered['HGVSC_alt_base'] = hgvsc_parts.str[1]

# Put each HGVSc-derived base next to the corresponding table column
compare_columns = ['hgvsc', 'start', 'end',
                   'HGVSC_ref_base', 'reference_bases',
                   'HGVSC_alt_base', 'alternate_bases']
compare = dd_filtered[compare_columns]

# Partition on the reference base only: 'issues' holds the rows where the two
# values differ, 'correct' the rows where they are equal
ref_differs = compare['reference_bases'] != compare['HGVSC_ref_base']
ref_matches = compare['reference_bases'] == compare['HGVSC_ref_base']
issues = compare[ref_differs]
correct = compare[ref_matches]

In [15]:
# Show the SNP rows whose 'reference_bases' value equals the base taken from 'hgvsc'
correct

Unnamed: 0,hgvsc,start,end,HGVSC_ref_base,reference_bases,HGVSC_alt_base,alternate_bases
1,c.103G>C,226064454,226064454,G,G,C,C
2,c.658A>T,26279162,26279162,A,A,T,T
3,c.1380C>A,31266476,31266476,C,C,A,A
4,c.3484C>A,98303765,98303765,C,C,A,A
5,c.2510T>A,124764127,124764127,T,T,A,A
...,...,...,...,...,...,...,...
2424419,c.315G>T,106612395,106612395,G,G,T,T
2424420,c.813G>A,114906851,114906851,G,G,A,A
2424422,c.165T>A,139537086,139537086,T,T,A,A
2424423,c.56G>T,153536303,153536303,G,G,T,T
