# Files in data directory

### ICGC PCAWG Data

In [None]:
icgc_maf_file = "pcawg_icgc.snv_mnv_indel.public.maf"

# Read the file and check the first few lines for comments
with open(icgc_maf_file, "r") as f:
    lines = f.readlines()

# Check for the presence of comments in the first 10 lines
has_comments = any(line.startswith("#") for line in lines[:10])

if has_comments:
    # Skip commented rows
    icgc_data = pd.read_csv(icgc_maf_file, sep="\t", skiprows=lambda x: x < 7 or lines[x].startswith("#"), header=0, low_memory=False)
else:
    # No comments, directly read the file
    icgc_data = pd.read_csv(icgc_maf_file, sep="\t", header=0, low_memory=False)


In [None]:
icgc_data.head()

# Progenetix

In [2]:
# Load libraries
import pandas as pd
import numpy as np
from isodate import parse_duration
from decimal import Decimal
import json


# Connect to progenetix database
from pymongo import MongoClient
client = MongoClient()
db = client.progenetix
individuals_collection = db.individuals
biosamples_collection = db.biosamples
variants_collection = db.variants
callsets_collection = db.callsets
genes_collection = db.genes
hits_collection = db.hits


# Define functions
# Function to convert duration string to years
def convert_to_years(duration_str):
    if pd.isna(duration_str) or duration_str == None:  # Handle missing values
        return pd.NaT
    try:
        duration = parse_duration(duration_str)
        return round(duration.years + (duration.months / Decimal('12')) + (duration.days / Decimal('365.25')),2)
    except Exception as e:
        print(f"Error converting {duration_str}: {e}")
        return pd.NaT
    
# Function to convert duration string to months
def convert_to_months(duration_str):
    if pd.isna(duration_str) or duration_str == 'None':  # Handle missing values
        return pd.NaT
    if duration_str == 'P0M':
        return 0
    try:
        duration = parse_duration(duration_str)
        return round(duration.years * 12 + duration.months + (duration.days / Decimal('30')),0)
    except Exception as e:
        print(f"Error converting {duration_str}: {e}")
        return pd.NaT

# Function to flatten nested structures
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f'{parent_key}{sep}{k}' if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


def get_nested_columns(df):
    nested_columns = []
    for i, row in df.iterrows():
        for column in row.keys():
            if '{' in str(row[column]) and not '[' in str(row[column]):
                try:
                    json_string = json.dumps(row[column]).replace("'", '"')
                    column_dict = json.loads(json_string)
                    
                    for key in column_dict.keys():
                        new_column_name = f"{column}_{key}"
                        
                        # Check if the new column exists in the DataFrame, if not, create it
                        if new_column_name not in df.columns:
                            df[new_column_name] = None  # or any default value you prefer

                        df.at[i, new_column_name] = column_dict[key]
                    
                    # Add the column name to the list of columns to remove
                    nested_columns.append(column)
                
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in row {i}, column {column}")
    df.drop(columns=nested_columns, axis=1, inplace=True)
    return df

## Retrieving sample list

In [4]:
# Retrieve SNV TCGA sample list #######################################################

# TCGA samples
# MongoDB Query
query = {"cohorts.id": "pgx:cohort-TCGA"}  # Select only TCGA samples

# Projection to include only the fields you need
projection = {"id": 1, "_id": 0}

# Execute the query
cursor = biosamples_collection.find(query, projection)

TCGA_biosamples = []
for sample in cursor:
    TCGA_biosamples.append(sample['id'])
print(f"TCGA samples: {len(TCGA_biosamples)}")

# SNVs
# MongoDB Query
# Select all SNVs
query = {'variant_state.id': {'$regex': 'SO'}}

# Projection to include only the fields you need
projection = {"biosample_id": 1, "_id": 0}

# Execute the query
cursor = variants_collection.find(query, projection)

SNV_samples = []
for variant in cursor:
    sample = variant['biosample_id']
    if sample not in SNV_samples:
        SNV_samples.append(sample)

# TCGA SNVs
# SNV samples that are also TCGA samples
TCGA_SNV_samples = []
for sample in SNV_samples:
    if sample in TCGA_biosamples:
        TCGA_SNV_samples.append(sample)
print(f"TCGA samples with SNVs: {len(TCGA_SNV_samples)} (should be 10003)") 

print(f"TCGA samples with only CNVs: {len(TCGA_biosamples) - len(TCGA_SNV_samples)}")

# TCGA individuals
individual_ids = []
for sample in TCGA_SNV_samples:
    biosample = biosamples_collection.find_one({"id": sample})
    if biosample['individual_id'] not in individual_ids:
        individual_ids.append(biosample['individual_id'])

print(f"TCGA individuals with SNVs: {len(individual_ids)}")

TCGA samples: 22142


### Comparing similar fields

In [None]:
import pprint
from isodate import parse_duration
import datetime

def duration_to_months(duration_str):
    duration = parse_duration(duration_str)
    total_months = duration.years * 12 + duration.months
    return total_months


for sample in TCGA_biosamples:
    biosample_info_stage = None
    biosample_pathological_stage = None
    biosample_info_survival_status = None
    biosample_info_followup_state = None
    biosample_info_followup_months = None
    biosample_followup_time = None
    individual_stage = None
    individual_followup_state = None
    individual_followup_time = None
    individual_followup_months = None
    individual_vital_status = None


    biosample = biosamples_collection.find_one({"id": sample})
    individual = individuals_collection.find_one({"id": biosample['individual_id']})
    # pprint.pprint(biosample)

    # Biosample data

    # Stage
    biosample_info_stage = biosample['info']['tumor_stage']
    if biosample_info_stage == 'not reported':
        biosample_info_stage = 'Stage Unknown'
    else:
        biosample_info_stage = 'Stage ' + biosample_info_stage.upper()
    
    if biosample['pathological_stage'] != False and 'label' in biosample['pathological_stage'].keys():
        biosample_pathological_stage = biosample['pathological_stage']['label']
    
    
    # Vital / survival
    if 'survival_status' in biosample['info'].keys():
        biosample_info_survival_status = biosample['info']['survival_status']
        if biosample_info_survival_status == 'not reported':
            biosample_info_survival_status = 'no followup status'
        else:
            biosample_info_survival_status = biosample_info_survival_status + ' (follow-up status)'
    
    if 'followup_state' in biosample.keys() and 'label' in biosample['followup_state'].keys():
        biosample_info_followup_state = biosample['followup_state']['label']
    
    # Followup time
    if 'followup_months' in biosample['info'].keys():
        biosample_info_followup_months = biosample['info']['followup_months']

    if 'followup_time' in biosample.keys():
        biosample_followup_time = biosample['followup_time']
    

    # Pathological data
    tnm_findings = biosample['pathological_tnm_findings']

    # Individual data
    if 'index_disease' in individual.keys():
        if 'stage' in individual['index_disease'].keys() and 'label' in individual['index_disease']['stage'].keys():
            individual_stage = individual['index_disease']['stage']['label']

        if 'followup_state' in individual['index_disease'].keys() and 'label' in individual['index_disease']['followup_state'].keys():
            individual_followup_state = individual['index_disease']['followup_state']['label']
        
        if 'followup_time' in individual['index_disease'].keys():
            individual_followup_time = individual['index_disease']['followup_time']
            if individual_followup_time != None:
                individual_followup_months = int(individual_followup_time.split('M')[0].split('P')[1])
    
    if 'vital_status' in individual.keys():
        individual_vital_status = individual['vital_status']['status']
    
    ## Comparisons
    # Stage comparison
    if biosample_info_stage and biosample_pathological_stage:
        if biosample_info_stage != biosample_pathological_stage:
            if biosample_pathological_stage == 'Stage Unknown':
                continue
            else:
                print(f"Stage mismatch in sample for {sample}: {biosample_info_stage} vs {biosample_pathological_stage}")

    if biosample_info_stage and individual_stage:
        if biosample_info_stage != individual_stage:
            if individual_stage == 'Stage Unknown':
                continue
            else:
                print(f"Stage mismatch between collection for {sample}: {biosample_info_stage} vs {individual_stage}")

    # Followup state comparison
    if biosample_info_followup_state and individual_vital_status:
        if biosample_info_followup_state != individual_vital_status:
            print(f"Followup state between collection mismatch for {sample}: {biosample_info_followup_state} vs {individual_vital_status}")
    
    if biosample_info_followup_state and biosample_info_survival_status:
        if biosample_info_followup_state != biosample_info_survival_status:
            print(f"Followup state mismatch in sample for {sample}: {biosample_info_followup_state} vs {biosample_info_survival_status}")

    # Followup time comparison
    if biosample_info_followup_months and individual_followup_months:
        if biosample_info_followup_months != individual_followup_months:
            print(f"Followup time mismatch for {sample}: {biosample_info_followup_months} vs {individual_followup_months}")


__Conclusions:__

- Follow-up time and follow-up status / vital status is all the same
- For stages use biosamples biosamples info.tumor_stage, the other fields are often 'Stage Unkown' and otherwise the same

### Sample list with _only_ CNV data

In [3]:
# Remove TCGA_SNV_samples from TCGA_biosamples
TCGA_CNV_samples = []
for sample in TCGA_biosamples:
    if sample not in TCGA_SNV_samples:
        TCGA_CNV_samples.append(sample)

print(f"TCGA samples with only CNVs: {len(TCGA_CNV_samples)}")

statuses = []
ref=0
neo=0

for sample in TCGA_CNV_samples:
    biosamples = biosamples_collection.find({"id": sample})
    for biosample in biosamples:
            if biosample.get('biosample_status') is not None:
                if biosample.get('biosample_status').get('label') == 'reference sample':
                    ref+=1
                elif biosample.get('biosample_status').get('label') == 'neoplastic sample':
                    neo+=1
                if biosample.get('biosample_status').get('label') not in statuses:
                    statuses.append(biosample.get('biosample_status').get('label'))
print(statuses)
print(f"Reference samples: {ref}")
print(f"Neoplastic samples: {neo}")

TCGA samples with only CNVs: 12139
['reference sample', 'neoplastic sample']
Reference samples: 11052
Neoplastic samples: 1087


## Building dataframes

### Individuals

In [30]:
individuals_df = pd.DataFrame([flatten_dict(individuals_collection.find_one({"id": individual_ids[0]}))])
for individual_id in individual_ids[1:]:
    # Create a DataFrame
    df = pd.DataFrame(
        [flatten_dict(
            individuals_collection.find_one({"id": individual_id})
        )])
    # Concate information
    individuals_df = pd.concat([individuals_df, df])
individuals_df.head()

# Curation
# Combine 'race' and 'ethnicity' columns
# If ethnicity is 'hispanic or latino' and race is 'not reported', race should be 'hispanic or latino'
mask = (individuals_df['info_race'] == 'not reported') & (individuals_df['info_ethnicity'] == 'hispanic or latino')
individuals_df.loc[mask, 'info_race'] = 'hispanic or latino'
individuals_df['ethnicity'] = individuals_df['info_race']

# Convert times
# Divide by 365 to convert days to years
individuals_df['age_at_diagnosis'] = individuals_df['info_age_at_diagnosis'] / 365

### PROBLEM

# # Split index_disease_followup_time at 'M' and 'P' and avoid NaNs
individuals_df['followup_time_months'] = individuals_df['index_disease_followup_time'].apply(convert_to_months)
individuals_df[['followup_time_months', 'index_disease_followup_time']].head()

# Convert index_disease_onset_age_days to years
individuals_df['onset_age'] = individuals_df['index_disease_onset_age_days'] / 365


# Renaming columns
individuals_df.rename(columns={'id': 'individual_id',
                               'info_days_to_death': 'days_to_death',
                               'info_year_of_birth': 'birthyear',
                               'info_death': 'vital_status',
                               'sex_label': 'sex',
                               'index_disease_disease_code_label': 'disease',
                               'index_disease_stage_label': 'stage',
                               'vital_status_survival_time_in_days': 'survival_time_days'}, inplace=True)

# Drop columns that are not needed
columns_to_drop = ['provenance_geo_location_type', # All geo location information is missing
                   'provenance_geo_location_geometry_coordinates',
                   'provenance_geo_location_geometry_type',
                   'description',
                   'index_disease_clinical_tnm_finding',
                   '_id',
                   'info_ethnicity',
                   'vital_status_status',
                   'index_disease_followup_state_id',
                   'index_disease_followup_state_label',
                   'info_age_at_diagnosis',
                   'index_disease_stage_id',
                   'index_disease_followup_time',
                   'index_disease_disease_code_id',
                   'info_ethnicity',
                   'info_race',
                   'info_legacy_ids',
                   'sex_id',
                   'index_disease_stage_id',
                   'index_disease_onset_age',
                   'index_disease_onset_age_days',
                   'onset_age',
                   'stage', # Stage from biosamples is better
                   ]

individuals_df.drop(columns_to_drop, axis=1, inplace=True)

# # Save to csv
# individuals_df.to_csv('../data/progenetix/progenetix_tcga_individuals_data.csv', index=False)

# individuals_df.head()

  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([i

Error converting P-2M: ISO 8601 time designator 'T' missing. Unable to parse datetime string '-2M'


  individuals_df = pd.concat([individuals_df, df])
  individuals_df = pd.concat([individuals_df, df])


### Biosamples

In [14]:
# Create data frame from collection
biosamples_df = pd.DataFrame([flatten_dict(biosamples_collection.find_one({"id": TCGA_SNV_samples[0]}))])
for biosample_id in TCGA_SNV_samples[1:]:
    df = pd.DataFrame(
        [flatten_dict(
            biosamples_collection.find_one({"id": biosample_id})
            )])
    biosamples_df = pd.concat([biosamples_df, df])


# Curation
# Convert 'collection_moment' to years
biosamples_df['collection_moment_years'] = biosamples_df['collection_moment'].apply(convert_to_years)

# Rename columns
biosamples_df.rename(columns={
    'id': 'biosample_id',
    'info_callset_ids': 'callset_ids',
    'info_tumor_stage': 'substage',
    'histological_diagnosis_label': 'histological_diagnosis',
    'icdo_morphology_label': 'icdo_morphology',
    'icdo_topography_label': 'icdo_topography',
    'sample_origin_detail_label': 'sample_origin',
    'notes': 'tumor_type',
    }, inplace=True)

# Get project from 'external_references'
biosamples_df.index = range(len(biosamples_df))
biosamples_df['project'] = None
i = 0
for _, row in biosamples_df.iterrows():
    ref = row['external_references']
    #print(ref)
    for dic in ref:
        if 'project' in dic.get('label'):
            biosamples_df['project'][i] = dic.get('label')
            i+=1


# Rename values from 'substage' column
biosamples_df['substage'].replace({
    '0': 'Stage 0',
    'i': 'Stage I',
    'ia': 'Stage IA',
    'ib': 'Stage IB',
    'is': 'Stage IS',
    'ii': 'Stage II',
    'iia': 'Stage IIA',
    'iib': 'Stage IIB',
    'iic': 'Stage IIC',
    'iii': 'Stage III',
    'iiia': 'Stage IIIA',
    'iiib': 'Stage IIIB',
    'iiic': 'Stage IIIC',
    'iv': 'Stage IV',
    'iva': 'Stage IVA',
    'ivb': 'Stage IVB',
    'ivc': 'Stage IVC',
    'not reported': 'Stage Unknown',
    'x': 'Stage X',
    'i/ii nos': 'Stage I/II NOS',
}, inplace=True)


biosamples_df['stage'] = biosamples_df['substage']
biosamples_df['stage'].replace({
    'Stage IA': 'Stage I',
    'Stage IB': 'Stage I',
    'Stage IS': 'Stage I',
    'Stage IIA': 'Stage II',
    'Stage IIB': 'Stage II',
    'Stage IIC': 'Stage II',
    'Stage IIIA': 'Stage III',
    'Stage IIIB': 'Stage III',
    'Stage IIIC': 'Stage III',
    'Stage IVA': 'Stage IV',
    'Stage IVB': 'Stage IV',
    'Stage IVC': 'Stage IV',
}, inplace=True)


# Drop columns that are not needed
columns_to_drop = ['_id',
                   'provenance_geo_location_type',
                   'provenance_geo_location_geometry_type',
                   'provenance_geo_location_geometry_coordinates',
                   'provenance_geo_location_properties_label',
                   'provenance_geo_location_properties_city',
                   'provenance_geo_location_properties_country',
                   'provenance_geo_location_properties_latitude',
                   'provenance_geo_location_properties_longitude',
                   'provenance_geo_location_properties_ISO3166alpha3',
                   'provenance_geo_location_properties_precision',
                   'info_survival_status',
                   'info_followup_months',
                   'info_legacy_ids',
                   'updated',
                   'histological_diagnosis_id',
                   'pathological_tnm_findings',
                   'pathological_stage_id',
                   'pathological_stage_label',
                   'icdo_morphology_id',
                   'icdo_topography_id',
                   'followup_state_id',
                   'followup_state_label',
                   'followup_time',
                   'biosample_status_id',
                   'biosample_status_label',
                   'sample_origin_detail_id',
                   'collection_moment',
                   'cohorts',
                   'external_references',
                   ]

biosamples_df.drop(columns_to_drop, axis=1, inplace=True)

# Save to csv
biosamples_df.to_csv('../data/progenetix/progenetix_tcga_biosamples_data.csv', index=False)

biosamples_df.head()

                        _id    biosample_id  \
0  5c06951872798368d51a59c5  pgxbs-kftvi9fi   
1  5c06951772798368d51a10cf  pgxbs-kftvhldp   
2  5c06951872798368d51a4d5c  pgxbs-kftvi562   
3  5c06951872798368d51a3ef5  pgxbs-kftvi0ix   
4  5c06951772798368d51a0c5e  pgxbs-kftvhjxx   

                                 external_references                 updated  \
0  [{'id': 'pgx:TCGA.45c3cd39-1c5d-4e26-8afc-5bed... 2020-09-10 17:45:17.581   
1  [{'id': 'pgx:TCGA.518aa9cc-71cf-4734-9d55-b435... 2020-09-10 17:44:50.947   
2  [{'id': 'pgx:TCGA.eb611a76-eecf-452c-bfbe-ab49... 2020-09-10 17:45:13.124   
3  [{'id': 'pgx:TCGA.eff14fed-8a21-41c5-8ed3-7534... 2020-09-10 17:45:07.959   
4  [{'id': 'pgx:TCGA.c01ae0ee-fa58-45d2-ac33-d7f6... 2020-09-10 17:44:49.345   

  provenance_geo_location_type provenance_geo_location_geometry_type  \
0                      Feature                                 Point   
1                      Feature                                 Point   
2                   

### Biosamples + Individuals

In [77]:
from isodate import parse_duration
from decimal import Decimal
import json
import pandas as pd
import numpy as np
from pymongo import MongoClient


# Connect to progenetix database
client = MongoClient()
db = client.progenetix
individuals_collection = db.individuals
biosamples_collection = db.biosamples
variants_collection = db.variants
callsets_collection = db.callsets
genes_collection = db.genes
hits_collection = db.hits

############################################################################################################
# Functions
############################################################################################################

# Function to convert duration string to years
def convert_to_years(duration_str):
    if pd.isna(duration_str) or duration_str == None:  # Handle missing values
        return pd.NaT
    try:
        duration = parse_duration(duration_str)
        return round(duration.years + (duration.months / Decimal('12')) + (duration.days / Decimal('365.25')),2)
    except Exception as e:
        print(f"Error converting {duration_str}: {e}")
        return pd.NaT
    
# Function to convert duration string to months
def convert_to_months(duration_str):
    if pd.isna(duration_str) or duration_str == 'None':  # Handle missing values
        return pd.NaT
    if duration_str == 'P0M':
        return 0
    try:
        duration = parse_duration(duration_str)
        return round(duration.years * 12 + duration.months + (duration.days / Decimal('30')),0)
    except Exception as e:
        print(f"Error converting {duration_str}: {e}")
        return pd.NaT

# Function to flatten nested structures
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f'{parent_key}{sep}{k}' if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


def get_nested_columns(df):
    nested_columns = []
    for i, row in df.iterrows():
        for column in row.keys():
            if '{' in str(row[column]) and not '[' in str(row[column]):
                try:
                    json_string = json.dumps(row[column]).replace("'", '"')
                    column_dict = json.loads(json_string)
                    
                    for key in column_dict.keys():
                        new_column_name = f"{column}_{key}"
                        
                        # Check if the new column exists in the DataFrame, if not, create it
                        if new_column_name not in df.columns:
                            df[new_column_name] = None  # or any default value you prefer

                        df.at[i, new_column_name] = column_dict[key]
                    
                    # Add the column name to the list of columns to remove
                    nested_columns.append(column)
                
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in row {i}, column {column}")
    df.drop(columns=nested_columns, axis=1, inplace=True)
    return df


############################################################################################################
# Retrieve SNV TCGA sample list
############################################################################################################

try:
    TCGA_SNV_samples
except NameError:
    # TCGA samples
    # MongoDB Query
    query = {"cohorts.id": "pgx:cohort-TCGA"}  # Select only TCGA samples

    # Projection to include only the fields you need
    projection = {"id": 1, "_id": 0}

    # Execute the query
    cursor = biosamples_collection.find(query, projection)

    TCGA_biosamples = []
    for sample in cursor:
        TCGA_biosamples.append(sample['id'])
    print(f"TCGA samples: {len(TCGA_biosamples)}")

    # SNVs
    # MongoDB Query
    # Select all SNVs
    query = {'variant_state.id': {'$regex': 'SO'}}

    # Projection to include only the fields you need
    projection = {"biosample_id": 1, "_id": 0}

    # Execute the query
    cursor = variants_collection.find(query, projection)

    SNV_samples = []
    for variant in cursor:
        sample = variant['biosample_id']
        if sample not in SNV_samples:
            SNV_samples.append(sample)

    # TCGA SNVs
    # SNV samples that are also TCGA samples
    TCGA_SNV_samples = []
    for sample in SNV_samples:
        if sample in TCGA_biosamples:
            TCGA_SNV_samples.append(sample)
    print(f"TCGA samples with SNVs: {len(TCGA_SNV_samples)} (should be 10003)") 

    print(f"TCGA samples with only CNVs: {len(TCGA_biosamples) - len(TCGA_SNV_samples)}")

    # TCGA individuals
    individual_ids = []
    for sample in TCGA_SNV_samples:
        biosample = biosamples_collection.find_one({"id": sample})
        if biosample['individual_id'] not in individual_ids:
            individual_ids.append(biosample['individual_id'])

    print(f"TCGA individuals with SNVs: {len(individual_ids)}")


############################################################################################################
# Individuals
############################################################################################################

individuals_df = pd.DataFrame([flatten_dict(individuals_collection.find_one({"id": individual_ids[0]}))])
df_list = []
for individual_id in individual_ids[1:]:
    # Create a DataFrame
    df = pd.DataFrame(
        [flatten_dict(
            individuals_collection.find_one({"id": individual_id})
        )])
    if not df.empty:
        df_list.append(df)
# Concate information
individuals_df = pd.concat([individuals_df, *df_list])
individuals_df.head()

# Curation
# Combine 'race' and 'ethnicity' columns
# If ethnicity is 'hispanic or latino' and race is 'not reported', race should be 'hispanic or latino'
mask = (individuals_df['info_race'] == 'not reported') & (individuals_df['info_ethnicity'] == 'hispanic or latino')
individuals_df.loc[mask, 'info_race'] = 'hispanic or latino'
individuals_df['ethnicity'] = individuals_df['info_race']

# Convert times
# Divide by 365 to convert days to years
individuals_df['age_at_diagnosis'] = individuals_df['info_age_at_diagnosis'] / 365

# # Split index_disease_followup_time at 'M' and 'P' and avoid NaNs
individuals_df['followup_time_months'] = individuals_df['index_disease_followup_time'].apply(convert_to_months)

# Convert index_disease_onset_age_days to years
individuals_df['onset_age'] = individuals_df['index_disease_onset_age_days'] / 365


# Renaming columns
individuals_df.rename(columns={'id': 'individual_id',
                               'info_days_to_death': 'days_to_death',
                               'info_year_of_birth': 'birth_year',
                               'info_death': 'vital_status',
                               'sex_label': 'sex',
                               'index_disease_disease_code_label': 'disease',
                               'index_disease_stage_label': 'stage',
                               'vital_status_survival_time_in_days': 'survival_time_days'}, inplace=True)

# Drop columns that are not needed
# Identificator are dropped
# All geo location information from TCGA is missing
columns_to_drop = [
                   '_id',
                   'description', # Empty field
                   'index_disease_clinical_tnm_finding', # Empty field
                   'index_disease_followup_state_id', # Identifier
                   'index_disease_followup_state_label', # vital status is used
                   'index_disease_stage_id', # Identifier
                   'index_disease_followup_time', # followup_time_months is used
                   'index_disease_disease_code_id', # Identifier
                   'index_disease_stage_id', # Identifier
                   'index_disease_onset_age', # onset_age is used
                   'index_disease_onset_age_days',
                   'info_age_at_diagnosis',
                   'info_ethnicity', # ethnicity is used
                   'info_legacy_ids', # Identifier
                   'info_race', # ethnicity is used
                   'onset_age', #
                   'provenance_geo_location_type', # Geo
                   'provenance_geo_location_geometry_coordinates', # Geo
                   'provenance_geo_location_geometry_type', # Geo
                   'sex_id', # Identifier
                   'stage', # Stage from biosamples is better
                   'survival_time_days', # Derived, days_to_death is directly imported
                   'vital_status_status', # vital_status is used
                   'external_references', # Identifier
                   ]

individuals_df.drop(columns_to_drop, axis=1, inplace=True)


############################################################################################################
# Biosamples
############################################################################################################

# Create data frame from collection
biosamples_df = pd.DataFrame([flatten_dict(biosamples_collection.find_one({"id": TCGA_SNV_samples[0]}))])
for biosample_id in TCGA_SNV_samples[1:]:
    df = pd.DataFrame(
        [flatten_dict(
            biosamples_collection.find_one({"id": biosample_id})
            )])
    biosamples_df = pd.concat([biosamples_df, df])


# Curation
# Convert 'collection_moment' to years
biosamples_df['collection_moment_years'] = biosamples_df['collection_moment'].apply(convert_to_years)

# Rename columns
biosamples_df.rename(columns={
    'id': 'biosample_id',
    'info_callset_ids': 'callset_ids',
    'info_tumor_stage': 'substage',
    'histological_diagnosis_label': 'histological_diagnosis',
    'icdo_morphology_label': 'icdo_morphology',
    'icdo_topography_label': 'icdo_topography',
    'sample_origin_detail_label': 'sample_origin',
    'notes': 'tumor_type',
    }, inplace=True)

# Get project from 'external_references'
biosamples_df.index = range(len(biosamples_df))
biosamples_df['project'] = None

for i, row in biosamples_df.iterrows():
    ref = row['external_references']
    #print(ref)
    for dic in ref:
        if 'project' in dic.get('label'):
            project = dic.get('label').replace('TCGA ', '').replace(' project', '')
            biosamples_df.loc[i, 'project'] = project


# Rename values from 'substage' column
# For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.

biosamples_df['substage'] = biosamples_df['substage'].replace({
    '0': 'Stage 0',
    'i': 'Stage I',
    'ia': 'Stage IA',
    'ib': 'Stage IB',
    'is': 'Stage IS',
    'ii': 'Stage II',
    'iia': 'Stage IIA',
    'iib': 'Stage IIB',
    'iic': 'Stage IIC',
    'iii': 'Stage III',
    'iiia': 'Stage IIIA',
    'iiib': 'Stage IIIB',
    'iiic': 'Stage IIIC',
    'iv': 'Stage IV',
    'iva': 'Stage IVA',
    'ivb': 'Stage IVB',
    'ivc': 'Stage IVC',
    'not reported': np.nan,
    'x': 'Stage X',
    'i/ii nos': 'Stage I/II NOS',
})


biosamples_df['stage'] = biosamples_df['substage'].replace({
    'Stage IA': 'Stage I',
    'Stage IB': 'Stage I',
    'Stage IS': 'Stage I',
    'Stage IIA': 'Stage II',
    'Stage IIB': 'Stage II',
    'Stage IIC': 'Stage II',
    'Stage IIIA': 'Stage III',
    'Stage IIIB': 'Stage III',
    'Stage IIIC': 'Stage III',
    'Stage IVA': 'Stage IV',
    'Stage IVB': 'Stage IV',
    'Stage IVC': 'Stage IV',
})


# Drop columns that are not needed
# Identifiers are dropped
# All geo location information is missing
columns_to_drop = ['_id', # Identifier
                   'biosample_status_id', # Identifier
                   'biosample_status_label', # all neoplastic
                   'collection_moment', # collection_moment_years is used
                   'cohorts', # Identifier
                   'external_references', # Identifier
                   'followup_state_id', # Identifier
                   'followup_state_label', # vital_status from individuals is used
                   'followup_time', # followup_time_months from individuals is used
                   'histological_diagnosis_id', # Identifier
                   'icdo_morphology_id', # Identifier
                   'icdo_topography_id', # Identifier
                   'info_survival_status', # vital_status from individuals is used
                   'info_followup_months', # followup_time_months from individuals is used
                   'info_legacy_ids', # Identifier
                   'pathological_tnm_findings', 
                   'pathological_stage_id', # Identifier
                   'pathological_stage_label', # stage is used
                   'provenance_geo_location_type', # Geo
                   'provenance_geo_location_geometry_type', # Geo
                   'provenance_geo_location_geometry_coordinates', # Geo
                   'provenance_geo_location_properties_label', # Geo
                   'provenance_geo_location_properties_city', # Geo
                   'provenance_geo_location_properties_country', # Geo
                   'provenance_geo_location_properties_latitude', # Geo
                   'provenance_geo_location_properties_longitude', # Geo
                   'provenance_geo_location_properties_ISO3166alpha3', # Geo
                   'provenance_geo_location_properties_precision', # Geo
                   'sample_origin_detail_id', # Identifier
                   'updated', # updated from individuals is used
                   ]

biosamples_df.drop(columns_to_drop, axis=1, inplace=True)


############################################################################################################
# Merge individuals and biosamples dataframes
############################################################################################################

# Merge individuals and biosamples dataframes
print("Merging individuals and biosamples dataframes")
individuals_biosamples_df = pd.merge(individuals_df, biosamples_df, on='individual_id', how='outer')
individuals_biosamples_df.head()
print("Shape of individuals_df:", individuals_df.shape)
print("Shape of biosamples_df:", biosamples_df.shape)
print("Shape of individuals_biosamples_df:", individuals_biosamples_df.shape)

del individuals_df
del biosamples_df

# Save to csv
print("Saving to csv")
individuals_biosamples_df.to_csv('../data/progenetix/progenetix_tcga_individuals_biosamples_combined_data.csv', index=False)
individuals_biosamples_df.head()

TCGA samples: 22142
TCGA samples with SNVs: 10006 (should be 10003)
TCGA samples with only CNVs: 12136
TCGA individuals with SNVs: 9893


  individuals_df = pd.concat([individuals_df, *df_list])


Error converting P-2M: ISO 8601 time designator 'T' missing. Unable to parse datetime string '-2M'
Merging individuals and biosamples dataframes
Shape of individuals_df: (9893, 10)
Shape of biosamples_df: (10006, 12)
Shape of individuals_biosamples_df: (10006, 21)
Saving to csv


Unnamed: 0,vital_status,birth_year,days_to_death,individual_id,updated,sex,disease,ethnicity,age_at_diagnosis,followup_time_months,...,substage,callset_ids,histological_diagnosis,icdo_morphology,icdo_topography,tumor_type,sample_origin,collection_moment_years,project,stage
0,dead,1943,517.0,pgxind-kftx3f6u,2018-12-04 14:51:56.401,male genotypic sex,Acute Myeloid Leukemia Not Otherwise Specified,white,65.29589,NaT,...,,[pgxcs-kftvwoq5],Acute Myeloid Leukemia Not Otherwise Specified,"Acute myeloid leukemia, NOS",Bone marrow,Primary Blood Derived Cancer - Peripheral Blood,bone marrow,65.25,LAML,
1,alive,1950,,pgxind-kftx3f6w,2018-12-04 14:51:56.406,female genotypic sex,Thyroid Gland Papillary Carcinoma,white,57.331507,83,...,Stage IVA,[pgxcs-kftw57h4],Thyroid Gland Papillary Carcinoma,"Papillary adenocarcinoma, NOS",Thyroid gland,Primary Tumor,thyroid gland,57.29,THCA,Stage IV
2,alive,1975,,pgxind-kftx3f6y,2018-12-04 14:51:56.410,female genotypic sex,Mixed Glioma,white,38.580822,13,...,,[pgxcs-kftvwor4],Mixed Glioma,Mixed glioma,Cerebrum,Primary Tumor,cerebral hemisphere,38.55,LGG,
3,dead,1936,166.0,pgxind-kftx3f70,2018-12-04 14:51:56.414,male genotypic sex,Floor of Mouth Squamous Cell Carcinoma,white,68.547945,NaT,...,Stage IVA,[pgxcs-kftw2tbq],Floor of Mouth Squamous Cell Carcinoma,"Squamous cell carcinoma, NOS","Floor of mouth, NOS",Primary Tumor,mouth floor,68.5,HNSC,Stage IV
4,alive,1990,,pgxind-kftx3f72,2018-12-04 14:51:56.418,female genotypic sex,Malignant Adrenal Gland Pheochromocytoma,white,21.463014,31,...,,[pgxcs-kftw01tt],Malignant Adrenal Gland Pheochromocytoma,"Pheochromocytoma, malignant","Adrenal gland, NOS",Additional - New Primary,adrenal gland,21.45,PCPG,


### Copy Number Variants

In [None]:
# Create dataframe for variants collection #############################################
print("Collecting variants data")
cnv_db = variants_collection.find({"variant_state.id": {"$regex": "EFO"},
                                   "biosample_id": {"$in": TCGA_SNV_samples}})
cnv_data = list(cnv_db)
cnv_df = pd.DataFrame(cnv_data)


# Extract nested columns ###############################################################
# Collect names of nested columns
print("Collecting nested columns")
get_nested_columns(cnv_df)

# Curation #############################################################################
print("Curating variant data")
# CNV
cnv_df.rename(columns={
    'id': 'variant_id',
    'info_cnv_value': 'cnv_value',
    'info_var_length': 'variant_length',
    'variant_state_label': 'variant_state',
    'location_chromosome': 'chromosome',
    'location_start': 'start',
    'location_end': 'end',
}, inplace=True)

columns_to_drop = ['_id',
                   'info_version',
                   'variant_state_id',
                   'location_sequence_id'
                   ]
cnv_df.drop(columns_to_drop, axis=1, inplace=True)
for i, row in cnv_df.iterrows():
    callset = callsets_collection.find_one({"id": row['callset_id']})
    print(row['callset_id'])
    print(callset)
    cnv_df.at[i, 'cnvfraction'] = callset.get('cnv_stats').get('cnvfraction')
    cnv_df.at[i, 'dupfraction'] = callset.get('cnv_stats').get('dupfraction')
    cnv_df.at[i, 'delfraction'] = callset.get('cnv_stats').get('delfraction')
cnv_df.head()

# Save `cnv_df` and `snv_df` to csv
print("Saving to csv")


# Adding genes to CNV data

cnv_df['cnv_state'] = np.nan
cnv_df.loc[cnv_df['variant_state'] == 'copy number gain', 'cnv_state'] = 'DUP'
cnv_df.loc[cnv_df['variant_state'] == 'copy number loss', 'cnv_state'] = 'DEL'

cnv_df['affected_genes'] = np.nan
for i, row in cnv_df.iterrows():
    query = {
            "start": {"$lte": row['end']},
            "end": {"$gte": row['start']},
            "reference_name": row['chromosome'],
    }
    projection = {"_id": 0, "symbol": 1, "gene_locus_length": 1}
    genes = genes_collection.find(query, projection)
    affected_genes = []
    for gene in genes:
        if gene['symbol'] not in affected_genes:
            affected_genes.append(gene['symbol'])
    cnv_df.at[i, 'affected_genes'] = str(affected_genes)

cnv_df.to_csv('../data/progenetix/progenetix_tcga_cnv_variants_data.csv', index=False)

cnv_df.head()

Collecting variants data


NameError: name 'variants_collection' is not defined

## Sequence data

In [82]:
sequence_alterations = pd.read_csv('../data/maf_master.csv', low_memory=False)
sequence_alterations.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,chromosome,start,end,Strand,variant_classification,variant_type,...,cytogenetic,aminoacid_changes,gene_ids,molecular_effects,clinvar_ids,transcriptHGVS_ids,proteinHGVS_ids,genomicHGVS_id,variant_alternative_ids,frequency_in_populations
0,GLMN,11146,BCM,GRCh38,1,92246554,92246555,+,Missense_Mutation,SNV,...,,['S587C'],['GLMN'],"[{'id': 'SO:0001583', 'label': 'missense_varia...",,['c.1760C>G'],['p.Ser587Cys'],g.92246555G>C,"['dbSNP:novel', 'CCDS:738.1']",
1,H3-3A,3020,BCM,GRCh38,1,226064453,226064454,+,Missense_Mutation,SNV,...,,['G35R'],['H3-3A'],"[{'id': 'SO:0001583', 'label': 'missense_varia...",,['c.103G>C'],['p.Gly35Arg'],g.226064454G>C,['CCDS:1550.1'],
2,HADHB,3032,BCM,GRCh38,2,26279161,26279162,+,Missense_Mutation,SNV,...,,['S220C'],['HADHB'],"[{'id': 'SO:0001583', 'label': 'missense_varia...",,['c.658A>T'],['p.Ser220Cys'],g.26279162A>T,"['dbSNP:novel', 'CCDS:1722.1']",
3,EHD3,30845,BCM,GRCh38,2,31266475,31266476,+,Silent,SNV,...,,,['EHD3'],"[{'id': 'SO:0001819', 'label': 'synonymous_var...",,['c.1380C>A'],['p.Gly460='],g.31266476C>A,"['dbSNP:novel', 'CCDS:1774.1']",
4,VWA3B,200403,BCM,GRCh38,2,98303764,98303765,+,Missense_Mutation,SNV,...,,['H1162N'],['VWA3B'],"[{'id': 'SO:0001583', 'label': 'missense_varia...",,['c.3484C>A'],['p.His1162Asn'],g.98303765C>A,"['dbSNP:novel', 'CCDS:42718.1']",


In [None]:
for column in sequence_alterations.columns:
    print(column, sequence_alterations.loc[sequence_alterations[column].notna(), column].iloc[0])

Hugo_Symbol GLMN
Entrez_Gene_Id 11146
Center BCM
NCBI_Build GRCh38
chromosome 1
start 92246554
end 92246555
Strand +
variant_classification Missense_Mutation
variant_type SNV
reference_sequence G
sequence C
dbSNP_RS dbSNP:novel
Tumor_Sample_Barcode TCGA-OR-A5K0-01A-11D-A29I-10
Mutation_Status Somatic
aliquot_id d7593b2a-0d86-44aa-a404-7fd1b10f65d4
HGVSc c.1760C>G
HGVSp p.Ser587Cys
HGVSp_Short p.S587C
Transcript_ID ENST00000370360
Exon_Number 19/19
t_depth 60
t_ref_count 2
t_alt_count 58
n_depth 96
all_effects GLMN,missense_variant,p.S587C,ENST00000370360,NM_001319683.1&NM_053274.3,c.1760C>G,MODERATE,YES,deleterious(0.03),benign(0.156),-1;C1orf146,downstream_gene_variant,,ENST00000370373,,,MODIFIER,,,,1;C1orf146,downstream_gene_variant,,ENST00000370375,NM_001012425.2,,MODIFIER,YES,,,1;GLMN,downstream_gene_variant,,ENST00000495852,,,MODIFIER,,,,-1;GLMN,3_prime_UTR_variant,,ENST00000495106,,c.*421C>G,MODIFIER,,,,-1;GLMN,non_coding_transcript_exon_variant,,ENST00000471465,,n.706C>G,MODIFIE

In [80]:
# True for all:
# NCBI_Build = GRCh38
# Strand = +
# Mutation_Status = Somatic
columns_for_analysis = [
    'Hugo_Symbol',
    'Center',
    'chromosome',
    'start',
    'end',
    'variant_classification',
    'variant_type',
    'reference_sequence',
    'sequence',
    'Consequence',
    'One_Consequence',
    'BIOTYPE',
    'CANONICAL',
    'SIFT',
    'PolyPhen',
    '1000G_AF',
    '1000G_AFR_AF',
    '1000G_AMR_AF',
    '1000G_EAS_AF',
    '1000G_EUR_AF',
    '1000G_SAS_AF',
    'ESP_AA_AF',
    'ESP_EA_AF',
    'gnomAD_AF',
    'gnomAD_AFR_AF',
    'gnomAD_AMR_AF',
    'gnomAD_ASJ_AF',
    'gnomAD_EAS_AF',
    'gnomAD_FIN_AF',
    'gnomAD_NFE_AF',
    'gnomAD_OTH_AF',
    'gnomAD_SAS_AF',
    'MAX_AF',
    'MAX_AF_POPS',
    'gnomAD_non_cancer_AF',
    'gnomAD_non_cancer_AFR_AF',
    'gnomAD_non_cancer_AMI_AF',
    'gnomAD_non_cancer_AMR_AF',
    'gnomAD_non_cancer_ASJ_AF',
    'gnomAD_non_cancer_EAS_AF',
    'gnomAD_non_cancer_FIN_AF',
    'gnomAD_non_cancer_MID_AF',
    'gnomAD_non_cancer_NFE_AF',
    'gnomAD_non_cancer_OTH_AF',
    'gnomAD_non_cancer_SAS_AF',
    'gnomAD_non_cancer_MAX_AF_adj',
    'gnomAD_non_cancer_MAX_AF_POPS_adj',
    'CLIN_SIG',
    'miRNA',
    'IMPACT',
    'CONTEXT',
    'hotspot',
    'callers',
    'biosample_id',
    'individual_id',
    'clinical_interpretations',
    'cytogenetic',
    'aminoacid_changes',
]

columns_to_drop = [
    'Entrez_Gene_Id',
    'Tumor_Sample_Barcode',
    'aliquot_id',
    'HGVSc',
    'HGVSp',
    'HGVSp_Short',
    'Transcript_ID',
    't_depth',
    't_ref_count',
    't_alt_count',
    'n_depth',
    'all_effects',
    'Gene',
    'Feature',
    'Feature_type',
    'cDNA_position',
    'CDS_position',
    'Protein_position',
    'Amino_acids',
    'Codons',
    'Existing_variation',
    'DISTANCE',
    'TRANSCRIPT_STRAND',
    'SYMBOL',
    'SYMBOL_SOURCE',
    'HGNC_ID',
    'clinvar_ids',
    'transcriptHGVS_ids',
    'proteinHGVS_ids',
    'genomicHGVS_id',
    'variant_alternative_ids',
    'frequency_in_populations',
    'gene_ids',
    'sequence_id',
    'variant_name',
    'specific_so',
    'variant_state_id',
    'variant_id',
    'sample_id',
    'sample_barcode',
    'CCDS',
    'ENSP',
    'SWISSPROT',
    'TREMBL',
    'UNIPARC',
    'UNIPROT_ISOFORM',
    'RefSeq',
    'MANE',
    'APPRIS',
    'FLAGS',
    'EXON',
    'INTRON',
    'DOMAINS',
    'SOMATIC',
    'PUBMED',
    'PICK',
    'TSL',
    'HGVS_OFFSET',
    'PHENO',
    'GENE_PHENO',
    'case_id',
    'GDC_FILTER',
    'COSMIC',
    'RNA_Support',
    'sample_barcode',
    'aliquot_barcode',
    'sample_id',
    'variant_id',
    'variant_state_id',
    'specific_so',
    'callset_id',
    'variant_name',
    'sequence_id',
    'gene_ids',
    'molecular_effects',
]

snvs_df = sequence_alterations[columns_for_analysis]

In [81]:
# Curation

# Aminoacid changes
# Change from string to list
snvs_df['aminoacid_changes'] = snvs_df['aminoacid_changes'].apply(lambda x: eval(x) if pd.notna(x) else x)


# Clinical Interpretations
snvs_df['clinical_interpretations'] = snvs_df['clinical_interpretations'].apply(lambda x: eval(x) if pd.notna(x) else x)
snvs_df['clinvar_interpretations'] = None
snvs_df['clinvar_effects'] = None

for i, row in snvs_df.iterrows():
    if pd.isna(row['clinical_interpretations']):
        continue
    number_interpretations = len(eval(row['clinical_interpretations']))
    interpretations = []
    effects = []
    if eval(row['clinical_interpretations']) == []:
        continue
    for ci in eval(row['clinical_interpretations']):
        interpretations.append(ci.get('clinical_relevance'))
    if type(ci.get('effect')) == None:
        continue
    if type(ci.get('effect')) != list:
        if ci.get('effect').get('label') == None:
            continue
        effects = [ci.get('effect').get('label').split('(')[0]]
    else:
        for effect in ci.get('effect'):
            effects.append(effect.get('label').split('(')[0])
    snvs_df.at[i, 'clinvar_interpretations'] = set(interpretations)
    snvs_df.at[i, 'clinvar_effects'] = set(effects)

# Consequence
snvs_df['Consequence'] = snvs_df['Consequence'].apply(lambda x: x.split(';'))


# SIFT
snvs_df['SIFT_score'] = snvs_df['SIFT'].apply(lambda x: x.split('(')[1].replace(')', '') if pd.notna(x) else x)
snvs_df['SIFT'] = snvs_df['SIFT'].apply(lambda x: x.split('(')[0] if pd.notna(x) else x)


# PolyPhen
snvs_df['PolyPhen_score'] = snvs_df['PolyPhen'].apply(lambda x: x.split('(')[1].replace(')', '') if pd.notna(x) else x)
snvs_df['PolyPhen'] = snvs_df['PolyPhen'].apply(lambda x: x.split('(')[0] if pd.notna(x) else x)

# Clinical interpretations from ClinVar
for i, row in snvs_df.iterrows():
    if pd.isna(row['clinical_interpretations']):
        continue
    number_interpretations = len(eval(row['clinical_interpretations']))
    interpretations = []
    if eval(row['clinical_interpretations']) == []:
        continue
    for ci in eval(row['clinical_interpretations']):
        interpretations.append(ci.get('clinical_relevance'))
    snvs_df.at[i, 'clinvar_interpretations'] = interpretations



# Save data
snvs_df.to_csv('../data/maf_analysis_data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snvs_df['aminoacid_changes'] = snvs_df['aminoacid_changes'].apply(lambda x: eval(x) if pd.notna(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snvs_df['clinical_interpretations'] = snvs_df['clinical_interpretations'].apply(lambda x: eval(x) if pd.notna(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

In [44]:
from pymongo import MongoClient

# Connect to progenetix database
client = MongoClient()
db = client.progenetix
collations_collection = db.collations

metadata = pd.read_csv("../data/progenetix/progenetix_tcga_individuals_biosamples_combined_data.csv")

icdom = metadata['icdo_morphology'].unique()
icdot = metadata['icdo_topography'].unique()


In [78]:
from pprint import pprint


morph_depths = {}
for m in icdom:
    cursor = collations_collection.find({"label": m})
    for element in cursor:
        hierarchy = element.get('hierarchy_paths')
        for h in hierarchy:
            h.get('depth')
            if not h.get('depth') == 0:
                ancestor = collations_collection.find_one({"id": h.get('path')[0]}).get('label')
                morph_depths[m] = ancestor
            else:
                morph_depths[m] = m

topo_depths = {}
for t in icdot:
    cursor = collations_collection.find({"label": t})
    for element in cursor:
        hierarchy = element.get('hierarchy_paths')
        for h in hierarchy:
            h.get('depth')
            if not h.get('depth') == 0:
                ancestor = collations_collection.find_one({"id": h.get('path')[1]}).get('label')
                topo_depths[t] = ancestor
            else:
                topo_depths[t] = t




139

In [74]:
metadata['icdo_morphology_specific'] = metadata['icdo_morphology']
metadata['icdo_topography_specific'] = metadata['icdo_topography']
metadata['icdo_morphology'] = metadata['icdo_morphology'].map(morph_depths)
metadata['icdo_topography'] = metadata['icdo_topography'].map(topo_depths)
# Save
metadata.to_csv('../data/progenetix/progenetix_tcga_individuals_biosamples_combined_data.csv', index=False)

In [71]:
len(set(topo_depths.values()))

9

In [68]:
histo = metadata['histological_diagnosis'].unique()
biosamples_collection = db.biosamples

histo_ncit = {}
ncits = []
for hist in histo:
    ncit = biosamples_collection.find_one({"histological_diagnosis.label": hist}).get('histological_diagnosis').get('id')
    histo_ncit[hist] = ncit
    ncits.append(ncit)

parents = {}
for nc in ncits:
    hierarchies = collations_collection.find_one({"id": nc}).get('hierarchy_paths')
    for h in hierarchies:
        if not h.get('depth') == 0:
            parents[nc] = collations_collection.find_one({"id": h.get('path')[2]}).get('label')
        else:
            parents[nc] = hierarchies.get('label')

# Bring the two dictionaries together
histo_ncit_parents = {}
for hist in histo:
    histo_ncit_parents[hist] = parents[histo_ncit[hist]]

print(set(histo_ncit_parents.values()))
print(len(set(histo_ncit_parents.values())))

{'Mucinous Adenocarcinoma, Endocervical Type', 'Mesenchymal Cell Neoplasm', 'Adenocarcinoma, Endocervical Type', 'Benign Neoplasm', 'Epithelial Neoplasm', 'Well Differentiated Liposarcoma', 'Germ Cell Tumor', 'Neuroepithelial, Perineurial, and Schwann Cell Neoplasm', 'Mixed Neoplasm', 'Malignant Neoplasm', 'Neoplasm by Obsolete Classification', 'Unspecified Tissue'}
12
