# TCGA SNV Datasets


<b><i class="fa fa-folder-o" aria-hidden="true" style="color:#1976D2"> </i>&nbsp; File Location</b><br>
<p style="background:#F5F5F5; text-indent: 1em;">
<code style="background:#F5F5F5; color:#404040; font-weight:bold; font-size:12px">C:\Users\ibrah\Documents\GitHub\Predicting-Mutation-Effects\src\tables</code>
</p>

<b><i class="far fa-file" aria-hidden="true" style="color:#1976D2"> </i>&nbsp; File Name</b>
<p style="background:#F5F5F5; text-indent: 1em;">
<code style="background:#F5F5F5; color:#404040; font-weight:bold; font-size:12px">SNV_datasets.ipynb</code>
</p>

<b><i class="far fa-calendar-alt" aria-hidden="true" style="color:#1976D2"> </i>&nbsp; Last Edited</b>
<p style="background:#F5F5F5; text-indent: 1em;">
<code style="background:#F5F5F5; color:#404040; font-weight:bold; font-size:12px">October 13th, 2021</code>
</p>


<div class="alert alert-block" style="background-color: #F5F5F5; border: 1px solid; padding: 10px; border-color: #E0E0E0">
    <b><i class="fa fa-compass" aria-hidden="true" style="color:#404040"></i></b>&nbsp; <b style="color: #404040">Purpose </b> <br>
<div>
   
- [ ] Provide information regarding TCGA SNV Datasets.
    * BRCA
    * COAD
    * OV
    * ESCA
    
**Dependent files:**

* TCGA SNV Cohort files

In [1]:
# Step up one level to the main directory.
# NOTE(review): presumably required so the `helpers` package imported in the
# next cell can be found -- confirm. Re-running this cell moves up again.
import os

os.chdir(os.pardir)

In [2]:
# Common imports
import pandas as pd
import os.path as op
from helpers.helpers_analysis.loaders import load_snv_datasets

# Paths
## SNVs
# Folder that holds the per-cohort SNV CSV files. Set the SNV_DATASETS_DIR
# environment variable to point at your own copy; when it is not set, the
# original local folder below is used as the fallback.
# (`os` is imported in the first cell of the notebook.)
SNV_COMMON_PATH = os.environ.get(
    "SNV_DATASETS_DIR", "C:/Users/ibrah/Desktop/SNV_data/SNV_datasets/"
)

# Individual cohort files used by the single-cohort cells further down.
SNV_BRCA_PATH = op.join(SNV_COMMON_PATH, "SNV_BRCA_hg38_2021-09-22.csv")
SNV_COAD_PATH = op.join(SNV_COMMON_PATH, "SNV_COAD_hg38_2021-09-22.csv")
SNV_OV_PATH = op.join(SNV_COMMON_PATH, "SNV_OV_hg38_2021-09-22.csv")
SNV_ESCA_PATH = op.join(SNV_COMMON_PATH, "SNV_ESCA_hg38_2021-09-22.csv")

In [24]:
import re

# Matches the cohort file names (e.g. "SNV_ACC_hg38_2021-09-22.csv") and
# captures the cohort code ("ACC") as group 1.
# The dot before "csv" is escaped so it only matches a literal ".", not any
# character.
pattern = re.compile(r"^SNV_(\w+)_hg38_2021-09-22\.csv$")


'ACC'

In [28]:
# For every cohort file in SNV_COMMON_PATH, load it and print the number of
# distinct Tumor_Sample_Barcode values (reported as "Number of patients").
# sorted() keeps the processing order deterministic, since os.listdir makes
# no ordering guarantee.
for snv_file in sorted(os.listdir(SNV_COMMON_PATH)):
    matched = pattern.match(snv_file)
    if matched is None:
        # Anything that is not a cohort CSV (stray system files, sub-folders)
        # would otherwise crash on `.group(1)`; report it and move on.
        print(f"Skipping unrecognised file: {snv_file}")
        continue

    tcga = matched.group(1)
    snv_path = op.join(SNV_COMMON_PATH, snv_file)

    print(f"TCGA: {tcga}")

    # load_snv_datasets fills `info` in place; the simplified table is then
    # read back under the key '<cohort>_snv_data_simplified'.
    info = {}
    load_snv_datasets(tcga, snv_path, info)

    print("Number of patients: {}".format(
        info[f'{tcga}_snv_data_simplified']["Tumor_Sample_Barcode"].nunique()
        )
    )

    # Drop the reference to this cohort's tables before moving to the next one.
    del info
    print('- - -')

TCGA: ACC
2021-10-14 12:14:30 |[32m INFO     [0m| helpers.helpers_analysis.loaders | Loading ACC SNV datasets ..
2021-10-14 12:14:30 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | ACC SNV data size: (10747, 121)
2021-10-14 12:14:30 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | ACC SNV data processed size: (5392, 121)
2021-10-14 12:14:30 |[32m INFO     [0m| helpers.helpers_analysis.loaders | ACC SNV datasets are loaded.
Number of patients: 92
- - -
TCGA: BLCA
2021-10-14 12:14:30 |[32m INFO     [0m| helpers.helpers_analysis.loaders | Loading BLCA SNV datasets ..
2021-10-14 12:14:35 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | BLCA SNV data size: (134513, 121)
2021-10-14 12:14:35 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | BLCA SNV data processed size: (72788, 121)
2021-10-14 12:14:35 |[32m INFO     [0m| helpers.helpers_analysis.loaders | BLCA SNV datasets are loaded.
Number of patients: 412
- - -
TCGA: BRCA
2021-10-14 12:14:35 |[32m 

In [None]:
# head and neck  | HNSC

In [8]:
# Load the ESCA cohort; the loader writes its tables into `esca_info`.
esca_info = dict()
load_snv_datasets("ESCA", SNV_ESCA_PATH, esca_info)

2021-10-14 11:59:11 |[32m INFO     [0m| helpers.helpers_analysis.loaders | Loading ESCA SNV datasets ..
2021-10-14 11:59:14 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | ESCA SNV data size: (45313, 121)
2021-10-14 11:59:14 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | ESCA SNV data processed size: (19497, 121)
2021-10-14 11:59:14 |[32m INFO     [0m| helpers.helpers_analysis.loaders | ESCA SNV datasets are loaded.


In [9]:
# Number of distinct Tumor_Sample_Barcode values in the simplified ESCA table
# (the same quantity the cohort loop prints as "Number of patients").
esca_info["ESCA_snv_data_simplified"]["Tumor_Sample_Barcode"].nunique()

184

In [11]:
# Load the OV cohort; the loader writes its tables into `ov_info`.
ov_info = dict()
load_snv_datasets("OV", SNV_OV_PATH, ov_info)

2021-10-14 12:00:01 |[32m INFO     [0m| helpers.helpers_analysis.loaders | Loading OV SNV datasets ..
2021-10-14 12:00:04 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | OV SNV data size: (75168, 121)
2021-10-14 12:00:05 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | OV SNV data processed size: (38486, 121)
2021-10-14 12:00:05 |[32m INFO     [0m| helpers.helpers_analysis.loaders | OV SNV datasets are loaded.


In [12]:
# Show which entries the loader stored in `ov_info`.
ov_info.keys()

dict_keys(['OV_snv_data', 'OV_snv_data_processed', 'OV_snv_data_simplified'])

In [13]:
# Number of distinct Tumor_Sample_Barcode values in the simplified OV table
# (the same quantity the cohort loop prints as "Number of patients").
ov_info["OV_snv_data_simplified"]["Tumor_Sample_Barcode"].nunique()

436

In [3]:
# Load the BRCA cohort; the loader writes its tables into `brca_info`.
brca_info = dict()
load_snv_datasets("BRCA", SNV_BRCA_PATH, brca_info)

2021-10-13 12:50:51 |[32m INFO     [0m| helpers.helpers_analysis.loaders | Loading BRCA SNV datasets ..
2021-10-13 12:50:59 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | BRCA SNV data size: (120988, 121)
2021-10-13 12:51:00 |[36m DEBUG    [0m| helpers.helpers_analysis.loaders | BRCA SNV data processed size: (60251, 121)
2021-10-13 12:51:00 |[32m INFO     [0m| helpers.helpers_analysis.loaders | BRCA SNV datasets are loaded.


In [4]:
# Show which entries the loader stored in `brca_info`.
brca_info.keys()

dict_keys(['BRCA_snv_data', 'BRCA_snv_data_processed', 'BRCA_snv_data_simplified'])

In [5]:
# Number of distinct Tumor_Sample_Barcode values in the simplified BRCA table
# (the same quantity the cohort loop prints as "Number of patients").
brca_info["BRCA_snv_data_simplified"]["Tumor_Sample_Barcode"].nunique()

985

In [6]:
# Dimensions of the processed BRCA table.
brca_info["BRCA_snv_data_processed"].shape

(60251, 121)

In [None]:
# NOTE(review): repeats the preceding cell and was never executed (no
# execution count, no output) -- looks like a leftover; consider removing.
brca_info["BRCA_snv_data_processed"].shape

In [None]:
# Fixed NaNs in the process_snv function.
# I don't think it will affect the AnalysisNotebooks.
# Maybe check again.

In [65]:
# HGVSp_Short column of the raw BRCA rows that are classified as
# Missense_Mutation but have no HGVSp_Short value.
brca_info["BRCA_snv_data"].loc[
    lambda snv: (snv["Variant_Classification"] == "Missense_Mutation")
    & snv["HGVSp_Short"].isna(),
    "HGVSp_Short",
]

10001     NaN
22023     NaN
23858     NaN
24553     NaN
31592     NaN
33109     NaN
39631     NaN
39721     NaN
40455     NaN
42600     NaN
47764     NaN
50848     NaN
52557     NaN
53450     NaN
58942     NaN
62918     NaN
63387     NaN
70351     NaN
72207     NaN
73784     NaN
76011     NaN
84340     NaN
94251     NaN
104306    NaN
108604    NaN
108682    NaN
115583    NaN
Name: HGVSp_Short, dtype: object

In [57]:
# Same query as the cell before it: Missense_Mutation rows of the raw BRCA
# table whose HGVSp_Short is NaN, showing only that column.
brca_info["BRCA_snv_data"].loc[
    lambda snv: snv["Variant_Classification"].eq("Missense_Mutation")
    & snv["HGVSp_Short"].isna(),
    "HGVSp_Short",
]

10001     NaN
22023     NaN
23858     NaN
24553     NaN
31592     NaN
33109     NaN
39631     NaN
39721     NaN
40455     NaN
42600     NaN
47764     NaN
50848     NaN
52557     NaN
53450     NaN
58942     NaN
62918     NaN
63387     NaN
70351     NaN
72207     NaN
73784     NaN
76011     NaN
84340     NaN
94251     NaN
104306    NaN
108604    NaN
108682    NaN
115583    NaN
Name: HGVSp_Short, dtype: object