In [None]:
###############################################################################################################
##################################### Diagnostics #############################################################
###############################################################################################################

In [None]:
# Findings!
# The sample ids have hierarchy:
# Case UUID: becedbfd-b2aa-4dde-b7f4-29e6f59ec32c in data bank
# Sample UUID: f6ee8148-9fb7-404e-b951-d1af8d06974f not in data bank
# Belongs to sample TCGA-E2-A1IK-01A
# Sample UUID: 8577ac01-1274-4bd5-ab04-380eaa78d95b in data bank
# Belongs to aliquot TCGA-E2-A1IK-01A-11D-A17G-09
# TCGA-E2-A1IK-01A
#     Portions
#     TCGA-E2-A1IK-01A-11
#         Analytes
#         TCGA-E2-A1IK-01A-11D
#             Aliquots
#             TCGA-E2-A1IK-01A-11D-A141-01
#             TCGA-E2-A1IK-01A-11D-A17G-09
#             TCGA-E2-A1IK-01A-11D-A140-02
#             TCGA-E2-A1IK-01A-11D-A142-09
#             TCGA-E2-A1IK-01A-11D-A145-05

# Solution: find sample UUID of aliquot sample UUID -> R package TCGAutils

# Sample and case id counts
# Case hits
#  Single hits: 9775 
# Multiple hits: 125 
# No hit: 208 
# Total: 10108 

# Sample Hits:
#  Single hits: 10003 
# Multiple hits: 0 
# No hit: 243 
# Total: 10108

# COSMIC is in TCGA
# 118 - COSMIC	Overlapping COSMIC variants


In [1]:
###############################################################################################################
############################################### Setup #########################################################
###############################################################################################################

import pandas as pd
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient()
db = client.progenetix
bs = db.biosamples

# Read in dataframe from intermediate and mapping file
df = pd.read_csv('../temp/mappingfile.tsv', sep = '\t') 
cases = list(set(df['case_id']))
samples = list(set(df['sample_id']))

In [2]:
###############################################################################################################
##################################### Sample and case counts ##################################################
###############################################################################################################


hits_c = []

for ids in cases:
    n_hits_c = bs.count_documents({"external_references.id": {'$regex': ids} ,"biosample_status.id":"EFO:0009656"})
    hits_c.append(n_hits_c)
    
c_single_hit = 0
c_multiple_hits = 0
c_no_hit = 0

for i in hits_c:
    if i == 1:
        c_single_hit += 1
    if i == 0:
        c_no_hit += 1
    if i != 0 and i != 1:
        c_multiple_hits += 1

print("Case hits",  "\nSingle hits:", c_single_hit, "\nMultiple hits:", c_multiple_hits,
      "\nNo hit:", c_no_hit, "\nTotal:", len(hits_c), "\n")

hits_s = []

for ids in samples:
    n_hits_s = bs.count_documents({"external_references.id": {'$regex': ids} ,"biosample_status.id":"EFO:0009656"})
    hits_s.append(n_hits_s)

s_single_hit = 0
s_multiple_hits = 0
s_no_hit = 0

for i in hits_s:
    if i == 1:
        s_single_hit += 1
    if i == 0:
        s_no_hit += 1
    if i != 0 and i != 1:
        s_multiple_hits += 1

print("Sample Hits:", "\nSingle hits:", s_single_hit, "\nMultiple hits:", s_multiple_hits,
      "\nNo hit:", s_no_hit, "\nTotal:", len(hits_c))

# Case hits
#  Single hits: 9775 
# Multiple hits: 125 
# No hit: 208 
# Total: 10108 

# Sample Hits:
#  Single hits: 10003 
# Multiple hits: 0 
# No hit: 243 
# Total: 10108

Case hits 
Single hits: 9775 
Multiple hits: 125 
No hit: 208 
Total: 10108 

Sample Hits: 
Single hits: 10003 
Multiple hits: 0 
No hit: 243 
Total: 10108


In [4]:
###############################################################################################################
##################################### HGVSc Problem ###########################################################
###############################################################################################################


# Filter rows with variant_type == 'SNP'
dd_filtered = df.loc[df['variant_type'] == 'SNP', ['hgvsc', 'reference_bases', 'alternate_bases']].copy()

# Drop rows with missing values in HGVSC
dd_filtered.dropna(subset=['hgvsc'], inplace=True)

# Extract the last character from HGVSC and assign it to a new column 'HGVSC_ref_base'
dd_filtered['HGVSC_ref_base'] = dd_filtered['hgvsc'].str.split('>').str[0].str[-1]
dd_filtered['HGVSC_alt_base'] = dd_filtered['hgvsc'].str.split('>').str[1]


# Reset the index if necessary
dd_filtered.reset_index(drop=True, inplace=True)

compare = dd_filtered[['hgvsc', 'HGVSC_ref_base', 'HGVSC_alt_base', 'reference_bases', 'alternate_bases']]
issues = compare[compare['reference_bases'] != compare['HGVSC_ref_base']]
compare

Unnamed: 0,hgvsc,HGVSC_ref_base,HGVSC_alt_base,reference_bases,alternate_bases
0,c.1760C>G,C,G,G,C
1,c.103G>C,G,C,G,C
2,c.658A>T,A,T,A,T
3,c.1380C>A,C,A,C,A
4,c.3484C>A,C,A,C,A
...,...,...,...,...,...
2424420,c.813G>A,G,A,G,A
2424421,c.1022T>G,T,G,A,C
2424422,c.165T>A,T,A,T,A
2424423,c.56G>T,G,T,G,T


In [None]:
print(
    'Different alt base =\t', sum(issues['HGVSC_ref_base'] != issues['alternate_bases_2']),
    '\nSame alt base =\t\t', sum(issues['HGVSC_ref_base'] == issues['alternate_bases_2']),
    '\nTotal =\t\t\t', len(issues)
)

In [None]:
print('Number of matching alt bases:',
      sum(issues['HGVSC_alt_base'] == issues['alternate_bases_2']) # alt base is never alt base
     )

print('Number of alt bases that are not the ref base:',  
      sum(issues['HGVSC_alt_base'] != issues['reference_bases']) # same numbers as above
     )

In [None]:
issues[issues['HGVSC_ref_base'] != issues['alternate_bases_2']]

In [None]:
# Alt base is sometimes hgvsc ref base, but never hgvsc alt base
# Ref base is never hgvsc ref base, but sometimes hgvsc alt base