In [1]:
import os
# Root of the local expression_broad_data checkout.
# The default is written as a raw string: in a normal literal the "\U" of
# "C:\Users" starts a unicode escape and is a SyntaxError on Python 3.
# The path can be overridden without editing the notebook by setting the
# EXPRESSION_BROAD_DATA_DIR environment variable; when it is unset the original
# location is used unchanged.
base_dir = os.environ.get('EXPRESSION_BROAD_DATA_DIR', r'C:\Users\Ben\Documents\GitHub\expression_broad_data')
# Work from the repository root; the project-local `core` imports that follow
# presumably rely on this working directory - confirm before moving this line.
os.chdir(base_dir)
from core import expression_plots 
from core import io_library 
# from IPython.core.debugger import Tracer
import numpy as np
import pandas as pd
import requests
from lxml import etree

#import re
#import matplotlib.pyplot as plt 
#import seaborn as sns
#import csv
#import scipy.cluster.hierarchy as sch
#from collections import Counter, defaultdict
#import math
#import plotly.plotly as py
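#py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])  # credentials redacted: never commit API keys, read them from the environment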


In [2]:
# Load IPython's autoreload extension and use mode 2, so edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
#Build a database that has promoter regions for all KL genes

#read in all KLac gene names
# The path is built with os.path.join instead of a "\expression_data\..."
# literal: that literal only worked because "\e", "\k" and "\L" happen not to be
# recognised escape sequences.
kl_nmpp1_lfc = pd.read_pickle(os.path.join(base_dir, "expression_data", "kl_PKA_as_20160824", "LFC_KLac_ASmin_ASplus.pkl"))
#Filter to just have KLLA..g named genes
# i.e. names that start with "KLLA" and end with "g". Entries that are not
# strings (e.g. NaN) or are empty are skipped instead of raising.
kl_genes = [gene for gene in kl_nmpp1_lfc['orf_name']
            if hasattr(gene, 'startswith') and gene.startswith('KLLA') and gene.endswith('g')]

In [4]:

#extract data for the subset remaining
# NOTE(review): the 2746 offset presumably resumes after the genes already saved
# in kl_promoters_0_2745.pkl - confirm before re-running.
kl_genes_subset = kl_genes[2746:]
kl_genes_data = []

# Maps the strand name found in the gene record to the efetch "strand" parameter.
strand_inds = {"plus":"1","minus":"2"}
# Number of bases upstream of the gene start that are treated as the promoter.
prom_length = 700

# NCBI serves the E-utilities over HTTPS, so the URLs no longer use http://.
eutils_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
# Seconds to wait for NCBI before giving up, so a stalled connection cannot hang
# the notebook indefinitely.
# NOTE(review): NCBI throttles clients that send many requests per second;
# consider adding a short sleep or an API key if requests start to fail.
request_timeout = 60


def _get_ok(url, error_message):
    """GET ``url`` and return the response.

    Raises ValueError(error_message) when the HTTP status is not OK.
    """
    response = requests.get(url, timeout=request_timeout)
    if not response.ok:
        raise ValueError(error_message)
    return response


def _find_entrez_gene_id(kl_gene):
    """Return the Entrez Gene id for ``kl_gene``.

    Raises ValueError unless the search returns exactly one hit.
    """
    # Search for gene name
    #https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=KLLA0F08393g
    gene_search_url = eutils_base + 'esearch.fcgi?db=gene&term=(' + kl_gene + '%5BGene%20Name%5D)%20AND%20Kluyveromyces%20Lactis%5BOrganism%5D&retmode=json'
    gene_search_response = _get_ok(gene_search_url, 'Gene search response for ' + kl_gene + ' not ok')
    # Parse the JSON body once and reuse it.
    search_result = gene_search_response.json()['esearchresult']
    #verify there is only one search result
    search_count = search_result['count']
    if int(search_count) != 1:
        raise ValueError('More or less than one search result for ' + kl_gene + ' count = ' + str(search_count))
    return search_result['idlist'][0]


def _find_gene_locus(kl_gene, entrez_gene_id):
    """Fetch the gene record and return (seq_id, seq_ver, seq_start, seq_stop, seq_strand).

    seq_start and seq_stop are strings holding the record's from/to values plus
    one; seq_strand is "plus" or "minus". Raises ValueError when the record
    cannot be fetched or lacks the expected locus information.
    """
    gene_data_url = eutils_base + 'efetch.fcgi?db=gene&id=' + entrez_gene_id + '&retmode=xml'
    gene_data_response = _get_ok(gene_data_url, 'Gene Id fetch response for ' + kl_gene + ' not ok.  ID = ' + entrez_gene_id)
    gene_data_xml = etree.fromstring(gene_data_response.content)

    #This is the structure of the part of the XML file that is returned that we are parsing:
    #
    #<Entrezgene_locus>
    #    <Gene-commentary>
    #             <Gene-commentary_accession>NC_006042</Gene-commentary_accession>
    #             <Gene-commentary_version>1</Gene-commentary_version>
    #             <Gene-commentary_seqs>
    #                     <Seq-loc>
    #                           <Seq-loc_int>
    #                                 <Seq-interval>
    #                                       <Seq-interval_from>781507</Seq-interval_from>
    #                                       <Seq-interval_to>783018</Seq-interval_to>
    #                                       <Seq-interval_strand>
    #                                             <Na-strand value="minus"/>
    #                                       </Seq-interval_strand>

    # Look the locus up by tag inside the first record rather than assuming it is
    # the eighth child, which breaks as soon as the record layout shifts.
    locus_element = gene_data_xml[0].find("Entrezgene_locus")
    if locus_element is None:
        raise ValueError('Entrezgene_locus not found for ' + kl_gene + '.  ID = ' + entrez_gene_id)
    locus_gene_commentary = locus_element.find("Gene-commentary")
    seq_id = locus_gene_commentary.find("Gene-commentary_accession").text
    seq_ver = locus_gene_commentary.find("Gene-commentary_version").text
    #it seems that the "Seq-interval_strand" element only exists if it is plus so set default to plus
    seq_strand = "plus"
    seq_start = seq_stop = None
    for element in locus_gene_commentary.find("Gene-commentary_seqs")[0].iter():
        if element.tag == "Seq-interval_from":
            seq_start = str(int(element.text)+1)  #query seems one off from the data in the gene commentary
        elif element.tag == "Seq-interval_to":
            seq_stop = str(int(element.text)+1)   #query seems one off from the data in the gene commentary
        elif element.tag == "Seq-interval_strand":
            seq_strand = element[0].get("value")

    if seq_start is None or seq_stop is None:
        raise ValueError('No sequence interval found for ' + kl_gene + '.  ID = ' + entrez_gene_id)
    # Fail loudly on an unexpected strand value; otherwise the coordinates
    # computed for the previous gene would be silently reused.
    if seq_strand not in strand_inds:
        raise ValueError('Unexpected strand ' + str(seq_strand) + ' for ' + kl_gene)
    return seq_id, seq_ver, seq_start, seq_stop, seq_strand


def _fasta_to_sequence(fasta_text, kl_gene):
    """Return the sequence of a single-record FASTA string as one string.

    The '>' header line and blank lines are ignored, so the result does not
    depend on how many blank lines end the response. Raises ValueError when the
    text does not start with a FASTA header.
    """
    if not fasta_text.lstrip().startswith('>'):
        raise ValueError('Sequence response for ' + kl_gene + ' is not FASTA')
    return ''.join(line.strip() for line in fasta_text.splitlines()
                   if line.strip() and not line.startswith('>'))


def _fetch_promoter_and_gene(kl_gene, seq_id, seq_ver, seq_start, seq_stop, seq_strand):
    """Fetch the gene plus its upstream region and return (prom_seq, gene_seq)."""
    # Add the operational promoter length to the correct side.
    if seq_strand == "plus":
        # Clamp at 1 so a gene close to the start of the contig does not produce
        # a zero or negative coordinate.
        seq_start_new = str(max(1, int(seq_start)-prom_length))
        seq_stop_new = seq_stop
    else:
        seq_start_new = seq_start
        seq_stop_new = str(int(seq_stop)+prom_length)

    sequence_query = eutils_base + "efetch.fcgi?db=nucleotide&id=" + seq_id + "." + seq_ver + "&rettype=fasta&seq_start=" + seq_start_new + "&seq_stop=" + seq_stop_new + "&strand=" + strand_inds[seq_strand]
    sequence_data_response = _get_ok(sequence_query, 'Sequence fetch response for ' + kl_gene + ' not ok')
    sequence_data = _fasta_to_sequence(sequence_data_response.text, kl_gene)

    # The requested strand is assumed to come back 5'->3', promoter first and
    # gene last (the original fixed-offset split relied on the same layout).
    # Splitting from the end by gene length gives the same result as cutting at
    # prom_length when the full upstream region was returned, and stays correct
    # when the upstream region is shorter than prom_length.
    gene_length = int(seq_stop) - int(seq_start) + 1
    split_at = max(0, len(sequence_data) - gene_length)
    return sequence_data[:split_at], sequence_data[split_at:]


for jj, kl_gene in enumerate(kl_genes_subset, 1):

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("extracting data for " + kl_gene + " " + str(jj) + " of " + str(len(kl_genes_subset)))

    # First find coordinates of the gene:
    entrez_gene_id = _find_entrez_gene_id(kl_gene)

    # extract chromosome sequence number, coordinates and strand.
    seq_id, seq_ver, seq_start, seq_stop, seq_strand = _find_gene_locus(kl_gene, entrez_gene_id)

    prom_seq, gene_seq = _fetch_promoter_and_gene(kl_gene, seq_id, seq_ver, seq_start, seq_stop, seq_strand)

    # Record the refseq accession, gene coordinates, strand, promoter sequence and gene sequence.
    kl_genes_data.append((seq_id + "." + seq_ver,[seq_start,seq_stop],seq_strand,prom_seq,gene_seq))


# Transpose the per-gene tuples into one tuple per column (empty tuples when no
# gene was processed, so the unpacking cannot fail).
if kl_genes_data:
    kl_gene_refseqs, kl_gene_locs, kl_gene_strands, kl_gene_prom_seqs, kl_gene_seqs = zip(*kl_genes_data)
else:
    kl_gene_refseqs, kl_gene_locs, kl_gene_strands, kl_gene_prom_seqs, kl_gene_seqs = (), (), (), (), ()
kl_promoters_dict = {"kl_gene": kl_genes_subset, "refseq":kl_gene_refseqs, "loc":kl_gene_locs , "strand":kl_gene_strands , "prom_seq":kl_gene_prom_seqs, "gene_seq":kl_gene_seqs }
kl_promoters = pd.DataFrame.from_dict(kl_promoters_dict)
kl_promoters = kl_promoters[["kl_gene","refseq","loc","strand","prom_seq","gene_seq"]]
kl_promoters = kl_promoters.set_index("kl_gene")

# Load and extract only promoter regions 


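# TODO: keep one list of outputs and append each gene's items, in order, as a tuple;
# at the end, unpack it into separate lists with
# list1, list2, list3 = zip(*outputs)
# TODO: look up grequests or the threading library to run the requests concurrently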





extracting data for KLLA0C17270g 1 of 2330
extracting data for KLLA0C14564g 2 of 2330
extracting data for KLLA0C09020g 3 of 2330
extracting data for KLLA0C01892g 4 of 2330
extracting data for KLLA0A07667g 5 of 2330
extracting data for KLLA0E03895g 6 of 2330
extracting data for KLLA0C12309g 7 of 2330
extracting data for KLLA0D13948g 8 of 2330
extracting data for KLLA0E05567g 9 of 2330
extracting data for KLLA0A06952g 10 of 2330
extracting data for KLLA0D15939g 11 of 2330
extracting data for KLLA0C14476g 12 of 2330
extracting data for KLLA0D00660g 13 of 2330
extracting data for KLLA0C16137g 14 of 2330
extracting data for KLLA0C17248g 15 of 2330
extracting data for KLLA0D16896g 16 of 2330
extracting data for KLLA0E17469g 17 of 2330
extracting data for KLLA0A01496g 18 of 2330
extracting data for KLLA0D08734g 19 of 2330
extracting data for KLLA0D12892g 20 of 2330
extracting data for KLLA0A01991g 21 of 2330
extracting data for KLLA0F17457g 22 of 2330
extracting data for KLLA0B13134g 23 of 23

In [5]:
#saving a subset of genes. (in case the routine breaks)
# When the extraction loop stops early, kl_genes_data holds fewer records than
# kl_genes_subset has names. Records are appended in the order the genes are
# iterated, so the first len(kl_genes_data) names are the ones that were fetched;
# using the full name list would make the columns differ in length and the
# DataFrame construction fail.
n_fetched = len(kl_genes_data)
if n_fetched:
    kl_gene_refseqs, kl_gene_locs, kl_gene_strands, kl_gene_prom_seqs, kl_gene_seqs = zip(*kl_genes_data)
else:
    # Nothing fetched yet: build an empty table instead of failing on the unpack.
    kl_gene_refseqs, kl_gene_locs, kl_gene_strands, kl_gene_prom_seqs, kl_gene_seqs = (), (), (), (), ()
kl_promoters_dict = {"kl_gene": list(kl_genes_subset[:n_fetched]), "refseq":kl_gene_refseqs, "loc":kl_gene_locs , "strand":kl_gene_strands , "prom_seq":kl_gene_prom_seqs, "gene_seq":kl_gene_seqs }
kl_promoters = pd.DataFrame.from_dict(kl_promoters_dict)
# Fix the column order, then index the table by gene name.
kl_promoters = kl_promoters[["kl_gene","refseq","loc","strand","prom_seq","gene_seq"]]
kl_promoters = kl_promoters.set_index("kl_gene")
kl_promoters


Unnamed: 0_level_0,refseq,loc,strand,prom_seq,gene_seq
kl_gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KLLA0C17270g,NC_006039.1,"[1510929, 1512215]",plus,AGCCGATATCTGGCTTAATTGCCAGATTGACCCTCATTGGATTCTC...,ATGAGCGTGCCGCAGTTGCCAGGTTTGACATATGATCCTGTAAGGA...
KLLA0C14564g,NC_006039.1,"[1273731, 1274228]",plus,AAAGTCGGTAGGCTAGAGTGCCTATGTGTGTTTCATTATCACTGTA...,ATGGTTAAGTGTGAGGTTTGTGGTGATAAAGAAGCACTTTATAGAT...
KLLA0C09020g,NC_006039.1,"[789284, 790816]",minus,GGAAGTACCTAAATCAACGTCAGTTAAAGGCGACCAAAAGTTGTCA...,ATGTCACAAAGCTATAGATCTACGAGGTCATCCGATCCTGCTACCA...
KLLA0C01892g,NC_006039.1,"[149917, 151911]",minus,CATGTCATAGCTTCCAAATGACAAGAGGGAAGGGATTTCAAGGCAT...,ATGGCAGATTTATGGGATGATAATGACGATGACGATGATATCCTAG...
KLLA0A07667g,NC_006037.1,"[685696, 686613]",plus,TATTTCGACGTTTAGGCGAAAACCGCTTATTCAATAAGCTCTTCAT...,ATGGCTGATCTTTACGAGGCTAGAGTTAGTTCTGGCGGTTTGTCGA...
KLLA0E03895g,NC_006041.1,"[355734, 356234]",minus,AGCAAGGGTATACTCAGCATGGTGGTTTGAGACCATTTGGTGTTTC...,ATGAAGTATGTCTGATGGTAGCGTTATATCCGCCGTTTAAACCGCA...
KLLA0C12309g,NC_006039.1,"[1046848, 1048557]",plus,GAAATTCTGTAAAGTTTCAAGGATCATACAGCCAGGTCATATAGAA...,ATGAACGATACTTCAGAGCATTCCAGTGATGTCTCTCCCATCATTT...
KLLA0D13948g,NC_006040.1,"[1198201, 1199046]",plus,TATTGAGACTTTCCTCCCTTTTAAGCTGACAAACTTCGGCAGTTCA...,ATGAGTTTCCTATTCTATGGAGATTCGAAGCACCTGAGGAAGCGAG...
KLLA0E05567g,NC_006041.1,"[498639, 499106]",minus,TTTTCCCTTATTCACTCGTTAGGAGCCGAAGCCTAGCACAAATTCT...,ATGGTTAATGCAGTTGCAGTTTTGAAGGGTGATTCCTCTGTTTCAG...
KLLA0A06952g,NC_006037.1,"[628173, 630323]",minus,TCGCTGGCCTTGGTAACGGAACGGCTGGATCAGGTTCCAGTTCTGG...,ATGGCTACACTATTTGGAAAGACGTTGTCTCAGGTTCATCCGAAGC...


In [6]:
#appending finished promoters to old promoters list:
# Path built with os.path.join rather than a "\expression_data\..." literal,
# which only worked because "\e" and "\k" are not recognised escape sequences.
kl_promoters_old = pd.read_pickle(os.path.join(base_dir, "expression_data", "kl_promoters", "kl_promoters_0_2745.pkl"))
# Stack the previously saved records on top of the ones built in this session.
kl_promoters_new = pd.concat([kl_promoters_old,kl_promoters])

In [20]:
# Spot check that a gene name is present in the combined index.
# Position 100 is below the 2746 offset used for the newly fetched subset, so
# this presumably probes a record that came from the older pickle - confirm.
kl_genes[100] in kl_promoters_new.index

True

In [19]:
#output finished promoters list.
# Path built with os.path.join rather than a "\expression_data\..." literal,
# which only worked because "\e" and "\k" are not recognised escape sequences.
kl_promoters_new.to_pickle(os.path.join(base_dir, "expression_data", "kl_promoters", "kl_promoters.pkl"))

In [26]:
#Load the promoter database.
# Path built with os.path.join rather than a "\expression_data\..." literal,
# which only worked because "\e" and "\k" are not recognised escape sequences.
kl_promoters = pd.read_pickle(os.path.join(base_dir, "expression_data", "kl_promoters", "kl_promoters.pkl"))


In [43]:
#Make a fasta file of all promoters to use as a null model

# os.path.join replaces the hand-escaped literal, which mixed "\e"/"\k"
# (unrecognised escapes) with "\\a" (needed because "\a" is the bell character).
fname = os.path.join(base_dir, 'expression_data', 'kl_PKA_as_20160824', 'all_promoters.fasta')
with open(fname,'w') as f:
    # Select the promoter by column name instead of by tuple position (row[4]),
    # so the output cannot silently change if the column order does.
    for kl_gene, prom_seq in zip(kl_promoters.index, kl_promoters['prom_seq']):
        # One record per gene: header line with the gene name, then the sequence.
        f.write('>' + kl_gene + ' 700bp_upstream\n')
        f.write(prom_seq + '\n')


#iterate through a dataframe of kl genes and output a fasta file

In [23]:
#Load kl_sc_data to build gene subset
# Path built with os.path.join rather than a "\expression_data\..." literal,
# which only worked because "\e" and "\k" are not recognised escape sequences.
kl_sc_PKA_data = pd.read_pickle(os.path.join(base_dir, "expression_data", "kl_PKA_as_20160824", "kl_sc_PKA_data"))


In [65]:
#build subset for JSO Msn2 targets
# NOTE(review): 'SS13' does not look like a standard S. cerevisiae gene name -
# confirm against the source list.
jso_msn2_targets = ['SSA1', 'SSA4', 'HSP104', 'CTT1', 'TPK1', 'TFS1', 'DDR2', 'TSA1', 'HSP12', 'TMA10', 'TPS2', 'RTC3', 'GPH1', 'HXK1', 'TSA2', 'CYC7', 'PNC1', 'GSY1', 'FMP48', 'SS13', 'ALD4', 'YNR014W', 'HOR2', 'YOR173W', 'RAS2', 'GLK1', 'GPD1', 'HSP26', 'SIP18', 'TKL2', 'FMP16', 'ALD3', 'PGM2']
# K. lactis gene names of the rows whose S. cerevisiae common name is a target.
jso_msn2_targets_kl_orth = list(kl_sc_PKA_data[kl_sc_PKA_data['SC_common_name'].isin(jso_msn2_targets)]['kl_genename'])

# Keep each ortholog once and only when it has a promoter record:
#  - a label missing from kl_promoters makes .loc raise on current pandas (older
#    versions return a NaN row that then fails when written out);
#  - several targets may map to the same K. lactis gene, and writing the same
#    promoter twice would count it twice in the motif analysis.
orth_seen = set()
orth_with_promoter = []
for kl_gene in jso_msn2_targets_kl_orth:
    if kl_gene in kl_promoters.index and kl_gene not in orth_seen:
        orth_seen.add(kl_gene)
        orth_with_promoter.append(kl_gene)
kl_promoters_jso_msn2_targets = kl_promoters.loc[orth_with_promoter]

# os.path.join replaces the hand-escaped "\expression_data\...\\jso_..." literal.
fname = os.path.join(base_dir, 'expression_data', 'kl_PKA_as_20160824', 'jso_msn2_promoters.fasta')
with open(fname,'w') as f:
    # Select the promoter by column name instead of by tuple position (row[4]).
    for kl_gene, prom_seq in zip(kl_promoters_jso_msn2_targets.index, kl_promoters_jso_msn2_targets['prom_seq']):
        # One record per gene: header line with the gene name, then the sequence.
        f.write('>' + kl_gene + ' 700bp_upstream\n')
        f.write(prom_seq + '\n')


#iterate through a dataframe of kl genes and output a fasta file

In [38]:
# From MEME (AME command line used for the motif enrichment):
#ame --verbose 1 --oc . --control all_promoters.fasta --bgformat 1 --scoring avg --method ranksum --pvalue-report-threshold 0.05 jso_msn2_promoters.fasta db/JASPAR/JASPAR_CORE_2016_fungi.meme


u'ATGTCTGACGTTGAAGAAGTCCAACAAGTCCCAGTCGCTGAATTGACCATCGAAGATGCCCTAAAGGTCGTCTTGAGAACCTCTTTGGTTCACGATGGTTTGGCCAGAGGTTTGAGAGAATCTGCCAAGGCTTTGACCAGAGGTGAAGGTCAACTAGCTGTTTTGGTTGAATCGGTTACTGAAGAAGCCATCAGCAAGTTGGTCCAAGGTTTGGCCACTGAAAACAACGTTCCATTAATCAAGGTTGCTGATGCCAAGCAATTAGGTGAATGGGCCGGTTTGGGTAAGATCGACCGTGACGGTAACGCCAGAAAGGTCGTCGGTGCTTCCGTTGTTGTTGTTAAGAACTGGGGTGCTGACACCCAAGAAAGAGAAATTCTTTTGGAACATTTCAGCCAACAATAA'

5076

In [None]:
# Given a set of KLAC + and - genes and a motif, determine enrichment for that motif.  

# Experiments:
# STRE in Jacob's Msn2 targets in S. cerevisiae vs. K. lactis
# Highlight top STRE genes in both species
# STRE enrichment in genes above the diagonal vs. genes on the diagonal vs. genes below it.

# Any enrichment in genes:
# A) above the diagonal
# B) above the diagonal and repressed vs. on the diagonal and repressed
# B2) compare to S. cerevisiae
# C) above a certain threshold compared to S. cerevisiae for the same genes


# Eventually would be nice to compare results to SCer

# Extract promoter regions from the database







[1, 'cat']