# Using R biomart client to download all known genes (for grch37)

In [1]:
%load_ext rpy2.ipython

In [2]:
%%R

# --- Platform name: used for the output file name and for the GPL_ID column
PLATFORM_NAME <- 'qDNAseq_hg19'
PLATFORM_DEFINITION_FILE <- paste0(PLATFORM_NAME, '.txt')

# --- Connect to the Ensembl BioMart served from grch37.ensembl.org ---------
library(biomaRt)
mart <- useMart(
    biomart = "ENSEMBL_MART_ENSEMBL",
    dataset = "hsapiens_gene_ensembl",
    host    = "grch37.ensembl.org"
)

# Chromosome names used as filter values in the query below
normal.chroms <- c(as.character(1:22), "X", "Y", "M")

# --- Query gene coordinates, cytoband, HGNC symbol and Entrez gene ID ------
entrez_list <- getBM(
    attributes = c("chromosome_name", "start_position", "end_position",
                   "band", "hgnc_symbol", "entrezgene"),
    filters    = "chromosome_name",
    values     = normal.chroms,
    mart       = mart
)

# Drop rows that lack an Entrez gene ID or have an empty HGNC symbol
entrez_list <- subset(entrez_list, !is.na(entrezgene) & hgnc_symbol != '')

# Order by Entrez ID, then keep the first row seen for each HGNC symbol
entrez_list <- entrez_list[with(entrez_list, order(entrezgene)), ]
entrez_list <- entrez_list[!duplicated(entrez_list[["hgnc_symbol"]]), ]

# Final ordering: by chromosome name, then by start position
entrez_list <- entrez_list[with(entrez_list, order(chromosome_name, start_position)), ]

# --- Assemble the platform table in the column order written to disk -------
biomart_entrez_platform <- data.frame(
    GPL_ID      = PLATFORM_NAME,
    REGION_NAME = entrez_list$hgnc_symbol,
    CHR         = entrez_list$chromosome_name,
    START_BP    = as.integer(entrez_list$start_position),
    END_BP      = as.integer(entrez_list$end_position),
    NUM_PROBES  = '',
    CYTOBAND    = entrez_list$band,
    GENE_SYMBOL = entrez_list$hgnc_symbol,
    GENE_ID     = entrez_list$entrezgene,
    ORGANISM    = 'Homo sapiens'
)

# Tab-separated output with a header row and no row names
write.table(
    biomart_entrez_platform,
    file      = PLATFORM_DEFINITION_FILE,
    sep       = '\t',
    row.names = FALSE
)

# Load custom platform file, and map segments to appropriate values

First load a tranSMART platform file; it can be created from any source (e.g. the biomaRt R script above). The X and Y chromosomes are converted to integers (23 and 24) for faster comparison.

In [3]:
import pandas as pd
import numpy as np

PLATFORM_FILE_R = %R PLATFORM_DEFINITION_FILE
PLATFORM_FILE = PLATFORM_FILE_R[0]

platform_raw = pd.read_table(PLATFORM_FILE)
platform = platform_raw.replace(to_replace='X', value=23)
platform.replace(to_replace='Y', value=24, inplace=True)
platform[['CHR', 'START_BP', 'END_BP']] = platform[['CHR', 'START_BP', 'END_BP']].astype(int)

Load the "transmart input file"

In [5]:
TM_REGION_INPUT_FILE = './CLUC_transmartInput_HCT116-2.txt'

segments = pd.read_table(TM_REGION_INPUT_FILE)

# Split every region_id on ':' and '-' into its three parts
# (chromosome, start, end); row order follows `segments`.
regions = pd.DataFrame(segments['region_id'].str.split('[:-]').tolist())
regions.columns = ['chromosome', 'start', 'end']

# The platform table encodes X as 23 and Y as 24. Apply the same encoding here
# so that segments on the sex chromosomes survive the integer cast below
# instead of raising ValueError, and compare equal to the platform's CHR values.
regions['chromosome'] = regions['chromosome'].replace({'X': '23', 'Y': '24'})
regions[['chromosome', 'start', 'end']] = regions[['chromosome', 'start', 'end']].astype(int)

The functions below take the tranSMART platform table as input and, for each platform row, look up the overlapping segments in the `regions` table built above (columns: chromosome, start, end).


In [6]:
def find_overlapping_segments(chrom, start, end):
    """Return the index of the rows in ``regions`` that overlap an interval.

    A row overlaps when it is on the same chromosome, its ``end`` is greater
    than ``start`` and its ``start`` is smaller than ``end``. Both comparisons
    are strict, so a segment that only touches the interval boundary is not
    reported.

    Parameters
    ----------
    chrom : value compared for equality against ``regions.chromosome``
    start : lower bound of the interval
    end : upper bound of the interval

    Returns
    -------
    pandas.Index or None
        Index labels of the overlapping rows of the module-level ``regions``
        table, or ``None`` when no row overlaps.
    """
    overlaps = (
        (regions.chromosome == chrom)
        & (regions.end > start)
        & (regions.start < end)
    )
    selected_segments_index = regions.loc[overlaps].index

    # Test emptiness by length: comparing an Index with `[]` is an
    # element-wise operation, so its truth value is ambiguous or an error.
    if len(selected_segments_index) > 0:
        return selected_segments_index
    return None
        
def map_multiple_segments_to_gene(platform):
    """Look up the overlapping segments for every row of a platform table.

    Each row's ``CHR``, ``START_BP`` and ``END_BP`` values are passed to
    ``find_overlapping_segments``; the per-row results are returned in the
    object produced by ``DataFrame.apply`` with ``axis=1``.
    """
    def _segments_for_row(row):
        # Forward the row's genomic interval to the overlap lookup
        return find_overlapping_segments(row['CHR'], row['START_BP'], row['END_BP'])

    return platform.apply(_segments_for_row, axis=1)

# One entry per platform row: the index of the overlapping segments, or None
# when no segment overlaps that row's interval.
overlap = map_multiple_segments_to_gene(platform)

Now the overlapping genomic regions have been calculated. We can map the data from one to the other.

In [7]:
# Empty frame with the same columns as the segment table; assigning the
# platform region names to region_id gives it one row per platform region.
remapped_regions = pd.DataFrame(columns=segments.columns)
remapped_regions['region_id'] = platform.REGION_NAME

# Keep only the platform rows for which overlapping segments were found
genes_with_scores = ~overlap.isnull()
only_scores = overlap[genes_with_scores]

# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, value in only_scores.items():
    # Average the overlapping segment rows column by column. numeric_only=True
    # skips the string region_id column, which would otherwise make mean()
    # raise on current pandas.
    mean_values = segments.iloc[value].mean(numeric_only=True)
    remapped_regions.iloc[index] = mean_values

## The row assignment above leaves region_id empty (it is not part of the
## numeric means), so restore it from the platform table.
remapped_regions['region_id'] = platform.REGION_NAME



Remove any genes that have no overlapping regions (i.e. drop rows that contain NaNs).

In [8]:
# Keep only the rows in which every column holds a value
remapped_regions = remapped_regions[remapped_regions.notnull().all(axis=1)]

Round the flag back to int

In [20]:
# Select the columns whose name contains the literal text '.flag'.
# regex=False matters: with the default regex matching, '.' stands for any
# character, so a name such as 'xflagged' would be selected as well.
col_names_contain_flag = remapped_regions.columns.str.contains('.flag', regex=False)
column_that_contains_flag = remapped_regions.columns[col_names_contain_flag]

# The flag columns were averaged over segments; round them to the nearest
# integer and store them as int.
remapped_regions[column_that_contains_flag] = np.rint(remapped_regions[column_that_contains_flag].astype(float)).astype(int)

Write the table to disk, with "_gene_mapped.txt" appended to the input file name.

In [47]:
# Output path: the input file name with '_gene_mapped.txt' appended
remapped_name = TM_REGION_INPUT_FILE + '_gene_mapped.txt'

# Tab-separated, without the row index. The former `decimal=3` argument was
# dropped: to_csv's `decimal` is the decimal-separator character (a string),
# not a number of digits, so an integer there is invalid.
# NOTE(review): if rounding to 3 decimals was intended, use
# float_format='%.3f' instead — confirm with the author.
remapped_regions.to_csv(remapped_name, sep='\t', index=False)