## Features 2q11.1 sub region chr2 96985244-97005244 (C)

Analysis of genomic features in this region, as discussed in the manuscript.
Regions are identified using GRCh37 (1KGP) coordinates.

In [1]:
import sys, os
import pickle, gzip
import pandas as pd
from pyliftover import LiftOver
from sqlalchemy.sql import and_, not_, select
import decimal
from ensembldb3 import HostAccount, Genome
from Bio import SeqUtils

# Switch to the project directory unless the kernel is already running there.
# NOTE(review): this is a machine-specific absolute path; it must be adjusted
# when the notebook is run on another machine.
path = '/home/helmut/helmutsimonpython/helmutsimonpython/Neighbourhood_Effects'
if os.getcwd() != path:
    os.chdir(path)

# The ENSEMBL_ACCOUNT environment variable is split on whitespace and its
# pieces are passed positionally to HostAccount.
release = 89
account_args = os.environ['ENSEMBL_ACCOUNT'].split()
account = HostAccount(*account_args)
genome = Genome(species='human', release=release, account=account)

Convert from 1KGP (GRCh37) to GRCh38 coordinates and look for genes in this sub region.

In [2]:
# Lift the 20-kb GRCh37 sub-region over to GRCh38, then list every gene
# returned for that interval together with the exons and introns of its
# canonical transcript.
chrom = 'chr2'
coord_name = '2'
lo = LiftOver('hg19', 'hg38')

# pyliftover returns None for an unrecognised chromosome and an empty list for
# a position with no counterpart in the target assembly. Fail with an explicit
# message instead of an opaque TypeError/IndexError from the indexing below.
lifted = []
for grch37_pos in (96985244, 97005244):
    hits = lo.convert_coordinate(chrom, grch37_pos)
    if not hits:
        raise ValueError(
            'Cannot lift {}:{} from hg19 to hg38'.format(chrom, grch37_pos))
    # Use the first hit; element 1 of the hit tuple is the converted position.
    lifted.append(hits[0][1])
start, end = lifted

print(start, end)
genes = genome.get_features(coord_name=coord_name, start=start, end=end, feature_types="gene")
for gene in genes:
    gstart = gene.location.start
    gend = gene.location.end
    print('\n', gene.symbol, gstart, gend)
    transcript = gene.canonical_transcript
    for exon in transcript.exons:
        print(exon, exon.location)
    # Skip the intron listing when `introns` is falsy (presumably the case for
    # single-exon transcripts -- confirm against ensembldb3).
    if transcript.introns:
        for intron in transcript.introns:
            print(intron, intron.location)

96319506 96339506

 AC021188.1 96307262 96321731
Exon(stableid=ENSE00001666599, rank=1) Homo sapiens:chromosome:2:96321135-96321731:-1
Exon(stableid=ENSE00001760016, rank=2) Homo sapiens:chromosome:2:96307262-96307406:-1
Intron(TranscriptId=ENST00000421534, rank=1) Homo sapiens:chromosome:2:96307406-96321135:-1

 ITPRIPL1 96325330 96330517
Exon(stableid=ENSE00001435085, rank=1) Homo sapiens:chromosome:2:96326222-96330517:1

 NCAPH 96335786 96373845
Exon(stableid=ENSE00001893240, rank=1) Homo sapiens:chromosome:2:96335786-96335848:1
Exon(stableid=ENSE00000770619, rank=2) Homo sapiens:chromosome:2:96341641-96341894:1
Exon(stableid=ENSE00003653556, rank=3) Homo sapiens:chromosome:2:96342049-96342140:1
Exon(stableid=ENSE00003602184, rank=4) Homo sapiens:chromosome:2:96342755-96342848:1
Exon(stableid=ENSE00003650085, rank=5) Homo sapiens:chromosome:2:96343165-96343304:1
Exon(stableid=ENSE00003563081, rank=6) Homo sapiens:chromosome:2:96344104-96344229:1
Exon(stableid=ENSE00003648879, rank=7

Look for genes in the 2-kb sub-segment starting at GRCh37 position 96987244.

In [3]:
# Lift the 2-kb GRCh37 sub-segment over to GRCh38, then list every gene
# returned for that interval together with the exons and introns of its
# canonical transcript. `start` and `end` are left as notebook-level variables.
chrom = 'chr2'
coord_name = '2'
lo = LiftOver('hg19', 'hg38')

# pyliftover returns None for an unrecognised chromosome and an empty list for
# a position with no counterpart in the target assembly. Fail with an explicit
# message instead of an opaque TypeError/IndexError from the indexing below.
lifted = []
for grch37_pos in (96987244, 96989244):
    hits = lo.convert_coordinate(chrom, grch37_pos)
    if not hits:
        raise ValueError(
            'Cannot lift {}:{} from hg19 to hg38'.format(chrom, grch37_pos))
    # Use the first hit; element 1 of the hit tuple is the converted position.
    lifted.append(hits[0][1])
start, end = lifted

print(start, end)
genes = genome.get_features(coord_name=coord_name, start=start, end=end, feature_types="gene")
for gene in genes:
    gstart = gene.location.start
    gend = gene.location.end
    print('\n', gene.symbol, gstart, gend)
    transcript = gene.canonical_transcript
    for exon in transcript.exons:
        print(exon, exon.location)
    # Only iterate when `introns` is truthy: a transcript without introns
    # (presumably any single-exon transcript -- confirm against ensembldb3)
    # would otherwise break this loop if the attribute is None.
    if transcript.introns:
        for intron in transcript.introns:
            print(intron, intron.location)
  

96321506 96323506

 AC021188.1 96307262 96321731
Exon(stableid=ENSE00001666599, rank=1) Homo sapiens:chromosome:2:96321135-96321731:-1
Exon(stableid=ENSE00001760016, rank=2) Homo sapiens:chromosome:2:96307262-96307406:-1
Intron(TranscriptId=ENST00000421534, rank=1) Homo sapiens:chromosome:2:96307406-96321135:-1


Search the 2-kb sub-segment starting at 96987244 for regulatory variants (see the Neutrality Test manuscript). We then search for the associated regulatory features.

In [4]:
# Find non-intergenic rs-variants strictly inside (start, end) and then look up
# the regulatory features recorded against them.
# `start` and `end` are the GRCh38 bounds computed in the previous cell.
# No liftover is performed here, so no LiftOver object is constructed.

# NOTE(review): presumably the Ensembl seq_region_id for chromosome 2 in the
# variation database of this release -- confirm if the release changes.
seq_region_id = 131545

variation_feature_table = genome.VarDb.get_table('variation_feature')
whereclause = and_(variation_feature_table.c.seq_region_id == seq_region_id,
                        variation_feature_table.c.seq_region_start > start,
                        variation_feature_table.c.seq_region_end < end,
                        # NOTE(review): matches "rs" anywhere in the name, not only as a prefix.
                        variation_feature_table.c.variation_name.contains("rs"),
                        variation_feature_table.c.somatic == 0,
                        variation_feature_table.c.alignment_quality == decimal.Decimal(1),
                        variation_feature_table.c.minor_allele_freq.isnot(None))
query = select([variation_feature_table.c.variation_name,
                    variation_feature_table.c.variation_feature_id,
                    variation_feature_table.c.seq_region_start,
                    variation_feature_table.c.consequence_types], whereclause)

# Keep the variation_feature_id of every variant whose consequence set is not
# exactly {'intergenic_variant'}.
vfids = list()
for snp in query.execute():
    if snp[3] != {'intergenic_variant'}:
        print(snp)
        vfids.append(snp[1])
print(vfids)

regulatory_feature_variation_table = genome.VarDb.get_table('regulatory_feature_variation')
# Skip the second query when no variant qualified, to avoid an empty IN () predicate.
if vfids:
    whereclause1 = regulatory_feature_variation_table.c.variation_feature_id.in_(vfids)
    query = select([regulatory_feature_variation_table.c.feature_stable_id,
                        regulatory_feature_variation_table.c.variation_feature_id,
                        regulatory_feature_variation_table.c.feature_type,
                        regulatory_feature_variation_table.c.consequence_types], whereclause1)
    for feature in query.execute():
        print(feature)

('rs148475229', 32896602, 96321523, {'regulatory_region_variant', 'non_coding_transcript_exon_variant', 'upstream_gene_variant'})
('rs533407893', 66091165, 96321548, {'regulatory_region_variant', 'non_coding_transcript_exon_variant', 'upstream_gene_variant'})
('rs544114957', 76765206, 96321664, {'regulatory_region_variant', 'non_coding_transcript_exon_variant', 'upstream_gene_variant'})
('rs116248005', 22990503, 96321719, {'regulatory_region_variant', 'non_coding_transcript_exon_variant', 'upstream_gene_variant'})
('rs189388509', 43984094, 96321835, {'regulatory_region_variant', 'upstream_gene_variant'})
('rs142815503', 28178665, 96321836, {'regulatory_region_variant', 'upstream_gene_variant'})
('rs375589031', 57487398, 96321838, {'regulatory_region_variant', 'upstream_gene_variant'})
('rs528144556', 60844103, 96321924, {'regulatory_region_variant', 'upstream_gene_variant'})
('rs547517849', 80157664, 96321942, {'regulatory_region_variant', 'upstream_gene_variant'})
('rs570343021', 1029