In [32]:
pwd

'/mnt/c/Users/helen/hrh/part_ii_project/chromatin_domains'

In [156]:
!ls

canonical_genesets     chromatin_domains_to_convert.bed     oot
chromatin_domains.bed  from_evans
chromatin_domains.csv  making_chromatin_domain_table.ipynb


**Goal**: Combine all of the CSVs (each one corresponds to a sheet of the Excel file 'Dataset_S02' from the Evans et al. paper; see https://www.pnas.org/content/113/45/E7020/tab-figures-data) into one master document containing all the information about chromatin domains.

In [1]:
import numpy as np
import pandas as pd
import pybedtools

In [2]:
# Column headers applied positionally to every table read in this notebook
# (nine fields; the names follow the BED field naming used for the .bed outputs).
names = [
    'chrom',
    'chromStart',
    'chromEnd',
    'name',
    'score',
    'strand',
    'thickStart',
    'thickEnd',
    'itemRgb',
]

In [3]:
# Load the three "ee" sheets; `names` supplies the column headers for each file.
ee_active_domains, ee_borders, ee_regulated_domains = (
    pd.read_csv(csv_path, names=names)
    for csv_path in (
        'from_evans/evans_ee_active_domains.csv',
        'from_evans/evans_ee_borders.csv',
        'from_evans/evans_ee_regulated_domains.csv',
    )
)

# Load the three "l3" sheets in the same way.
l3_active_domains, l3_borders, l3_regulated_domains = (
    pd.read_csv(csv_path, names=names)
    for csv_path in (
        'from_evans/evans_l3_active_domains.csv',
        'from_evans/evans_l3_borders.csv',
        'from_evans/evans_l3_regulated_domains.csv',
    )
)

In [4]:
# Labels for the six loaded tables. The order must stay in step with `tables`:
# later cells pair the two lists by position and slice `tables` into an
# ee half (first three entries) and an l3 half (last three entries).
table_names = [
    'ee_active_domains',
    'ee_borders',
    'ee_regulated_domains',
    'l3_active_domains',
    'l3_borders',
    'l3_regulated_domains',
]
tables = [
    ee_active_domains,
    ee_borders,
    ee_regulated_domains,
    l3_active_domains,
    l3_borders,
    l3_regulated_domains,
]

In [5]:
# itemRgb strings ("R,G,B") written into the tables: green is used for active
# domains, blue for regulated domains and red for borders.
# Note that dark_blue is full-intensity blue (255), not 138 like the other two.
dark_red, dark_blue, dark_green = '138,0,0', '0,0,255', '0,138,0'

In [6]:
# Keyword looked for in the table's label -> (prefix for the 'name' column, itemRgb colour).
domain_styles = {
    'active': ('a_', dark_green),
    'regulated': ('r_', dark_blue),
    'border': ('b_', dark_red),
}

# Fill in the 'name' and 'itemRgb' columns of every table, in place.
for table_name, table in zip(table_names, tables):
    matches = [style for keyword, style in domain_styles.items() if keyword in table_name]
    if not matches:
        # Without this check an unmatched label would silently inherit the
        # prefix and colour of whichever table was processed before it.
        raise ValueError('no domain type recognised in table name: ' + repr(table_name))
    # If several keywords match, the last one listed wins.
    identifier, colour = matches[-1]

    # Rows are numbered from 0 within each table: a_0, a_1, ...
    table['name'] = [identifier + str(i) for i in range(len(table))]
    table['itemRgb'] = colour

In [7]:
# The first three entries of `tables` are the ee tables and the next three the
# l3 tables; stack each trio into one frame with a fresh 0..n-1 row index.
ee_parts = tables[:3]
l3_parts = tables[3:6]
ee_domains = pd.concat(ee_parts, ignore_index=True)
l3_domains = pd.concat(l3_parts, ignore_index=True)

In [8]:
ee_domains

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb
0,I,3997,54536,a_0,,,,,01380
1,I,71121,112986,a_1,,,,,01380
2,I,180181,216072,a_2,,,,,01380
3,I,287193,403734,a_3,,,,,01380
4,I,446369,542113,a_4,,,,,01380
...,...,...,...,...,...,...,...,...,...
4095,V,20499472,20577203,r_1036,,,,,00255
4096,V,20591750,20697792,r_1037,,,,,00255
4097,V,20707430,20708891,r_1038,,,,,00255
4098,V,20767014,20826730,r_1039,,,,,00255


In [9]:
l3_domains

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb
0,I,3745,50073,a_0,,,,,01380
1,I,69642,113186,a_1,,,,,01380
2,I,129359,133787,a_2,,,,,01380
3,I,179114,216936,a_3,,,,,01380
4,I,286638,334155,a_4,,,,,01380
...,...,...,...,...,...,...,...,...,...
4997,V,20591450,20697692,r_1252,,,,,00255
4998,V,20706229,20709291,r_1253,,,,,00255
4999,V,20747560,20773785,r_1254,,,,,00255
5000,V,20794810,20826680,r_1255,,,,,00255


In [22]:
!ls

c10_ee_domains_to_convert.bed  ee_domains_final.bed
c10_l3_domains_to_convert.bed  from_evans
convert_c10_to_c11_domains     l3_domains_final.bed
convert_to_briggsae	       making_chromatin_domain_table.ipynb


In [41]:
ee_domains

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb
0,I,3997,54536,a_0,,,,,01380
1,I,71121,112986,a_1,,,,,01380
2,I,180181,216072,a_2,,,,,01380
3,I,287193,403734,a_3,,,,,01380
4,I,446369,542113,a_4,,,,,01380
...,...,...,...,...,...,...,...,...,...
4095,V,20499472,20577203,r_1036,,,,,00255
4096,V,20591750,20697792,r_1037,,,,,00255
4097,V,20707430,20708891,r_1038,,,,,00255
4098,V,20767014,20826730,r_1039,,,,,00255


In [42]:
l3_domains

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb
0,I,3745,50073,a_0,,,,,01380
1,I,69642,113186,a_1,,,,,01380
2,I,129359,133787,a_2,,,,,01380
3,I,179114,216936,a_3,,,,,01380
4,I,286638,334155,a_4,,,,,01380
...,...,...,...,...,...,...,...,...,...
4997,V,20591450,20697692,r_1252,,,,,00255
4998,V,20706229,20709291,r_1253,,,,,00255
4999,V,20747560,20773785,r_1254,,,,,00255
5000,V,20794810,20826680,r_1255,,,,,00255


In [43]:
# Bare chromosome name -> 'chr'-prefixed name.
chrom_dict_2 = {
    'I': 'chrI',
    'II': 'chrII',
    'III': 'chrIII',
    'IV': 'chrIV',
    'V': 'chrV',
    'X': 'chrX',
}


def _with_chr_prefix(domains):
    """Return a copy of ``domains`` whose 'chrom' column uses chr-prefixed names.

    The input frame is left untouched.

    Raises:
        KeyError: if 'chrom' holds a value that is not a key of ``chrom_dict_2``,
            so unexpected names fail loudly instead of turning into NaN.
    """
    unknown = set(domains['chrom']) - set(chrom_dict_2)
    if unknown:
        raise KeyError('unexpected chromosome name(s): ' + ', '.join(sorted(map(str, unknown))))
    # Vectorised lookup replaces the row-by-row iterrows() loop.
    return domains.assign(chrom=domains['chrom'].map(chrom_dict_2))


l3 = _with_chr_prefix(l3_domains)
ee = _with_chr_prefix(ee_domains)

In [46]:
# Replace every missing value with the string '0' before writing.
# NOTE(review): this also turns the empty 'strand' column into '0' -- confirm
# that whatever consumes these files accepts that value.
ee_for_bed = ee.fillna('0')
l3_for_bed = l3.fillna('0')

# Write tab-separated files with neither a header row nor the row index.
bed_write_options = dict(header=False, index=False, sep='\t')
ee_for_bed.to_csv('c10_ee_domains.bed', **bed_write_options)
l3_for_bed.to_csv('c10_l3_domains.bed', **bed_write_options)

In [47]:
pd.read_csv('c10_ee_domains.bed', header=None, sep='\t')

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chrI,3997,54536,a_0,0,0,0,0,01380
1,chrI,71121,112986,a_1,0,0,0,0,01380
2,chrI,180181,216072,a_2,0,0,0,0,01380
3,chrI,287193,403734,a_3,0,0,0,0,01380
4,chrI,446369,542113,a_4,0,0,0,0,01380
...,...,...,...,...,...,...,...,...,...
4095,chrV,20499472,20577203,r_1036,0,0,0,0,00255
4096,chrV,20591750,20697792,r_1037,0,0,0,0,00255
4097,chrV,20707430,20708891,r_1038,0,0,0,0,00255
4098,chrV,20767014,20826730,r_1039,0,0,0,0,00255


### Converting genome coordinates from ce10 (used by Evans et al.) to ce11 (current assembly)

In [33]:
def _first_three_columns(bed_path):
    """Read a header-less, tab-separated file and keep only its first three columns."""
    return pd.read_csv(bed_path, header=None, sep='\t').iloc[:, :3]


ee_convert = _first_three_columns('convert_c10_to_c11_domains/ee_domains.bed')
l3_convert = _first_three_columns('convert_c10_to_c11_domains/l3_domains.bed')

In [34]:
l3_convert

Unnamed: 0,0,1,2
0,I,3745,50073
1,I,69642,113186
2,I,129359,133787
3,I,179114,216936
4,I,286638,334155
...,...,...,...
4997,V,20591450,20697692
4998,V,20706229,20709291
4999,V,20747560,20773785
5000,V,20794810,20826680


In [35]:
# Bare chromosome name -> 'chr'-prefixed name.
chr_prefixed = {'I': 'chrI', 'II': 'chrII', 'III': 'chrIII',
                'IV': 'chrIV', 'V': 'chrV', 'X': 'chrX'}

ee_chroms = ee_convert.iloc[:, 0]

# A value that is neither a known bare name nor an already-prefixed one cannot
# be converted; report it by name rather than failing later on a length mismatch.
unrecognised = sorted(
    {c for c in ee_chroms if c not in chr_prefixed and c not in chr_prefixed.values()},
    key=str,
)
if unrecognised:
    raise ValueError('cannot add chr prefix to chromosome name(s): '
                     + ', '.join(map(str, unrecognised)))

# .get(c, c) leaves already-prefixed names alone, so re-running this cell is harmless.
new_chrom = [chr_prefixed.get(c, c) for c in ee_chroms]
ee_convert.iloc[:, 0] = new_chrom

In [37]:
# Bare chromosome name -> 'chr'-prefixed name.
chr_prefixed = {'I': 'chrI', 'II': 'chrII', 'III': 'chrIII',
                'IV': 'chrIV', 'V': 'chrV', 'X': 'chrX'}

l3_chroms = l3_convert.iloc[:, 0]

# A value that is neither a known bare name nor an already-prefixed one cannot
# be converted; report it by name rather than failing later on a length mismatch.
unrecognised = sorted(
    {c for c in l3_chroms if c not in chr_prefixed and c not in chr_prefixed.values()},
    key=str,
)
if unrecognised:
    raise ValueError('cannot add chr prefix to chromosome name(s): '
                     + ', '.join(map(str, unrecognised)))

# .get(c, c) leaves already-prefixed names alone, so re-running this cell is harmless.
new_chrom = [chr_prefixed.get(c, c) for c in l3_chroms]
l3_convert.iloc[:, 0] = new_chrom

In [38]:
l3_convert

Unnamed: 0,0,1,2
0,chrI,3745,50073
1,chrI,69642,113186
2,chrI,129359,133787
3,chrI,179114,216936
4,chrI,286638,334155
...,...,...,...
4997,chrV,20591450,20697692
4998,chrV,20706229,20709291
4999,chrV,20747560,20773785
5000,chrV,20794810,20826680


In [39]:
# Write the three-column frames as tab-separated files, without a header row
# or the row index.
for frame, bed_path in (
    (ee_convert, 'c10_ee_domains_to_convert.bed'),
    (l3_convert, 'c10_l3_domains_to_convert.bed'),
):
    frame.to_csv(bed_path, header=False, index=False, sep='\t')

In [40]:
!ls

briggsae_domains	       c11_l3_domains_final.bed
c10_ee_domains.bed	       convert_c10_to_c11_domains
c10_ee_domains_to_convert.bed  ee_domains_final.bed
c10_l3_domains.bed	       from_evans
c10_l3_domains_to_convert.bed  l3_domains_final.bed
c11_ee_domains_final.bed       making_chromatin_domain_table.ipynb


Then, on the Linux command line, I ran:

./liftOver l3_domains_to_convert.bed ce10ToCe11.over.chain c11_l3_domains.bed unmapped_c11_l3_domains.bed

The same command was then run for the ee file.

`ce10ToCe11.over.chain` is the chain file, downloaded from http://hgdownload.soe.ucsc.edu/goldenPath/ce10/liftOver/
(see also the UCSC downloads page: http://hgdownload.soe.ucsc.edu/downloads.html).

### Remaking the full bed files

In [23]:
# Read the two c11 files (tab-separated, no header row); `names` supplies the
# column headers.
c11_ee, c11_l3 = (
    pd.read_csv(bed_path, names=names, sep='\t')
    for bed_path in (
        'convert_c10_to_c11_domains/c11_ee_domains.bed',
        'convert_c10_to_c11_domains/c11_l3_domains.bed',
    )
)

In [24]:
# Read the two c10 files (tab-separated, no header row); `names` supplies the
# column headers.
c10_ee, c10_l3 = (
    pd.read_csv(bed_path, names=names, sep='\t')
    for bed_path in (
        'convert_c10_to_c11_domains/ee_domains.bed',
        'convert_c10_to_c11_domains/l3_domains.bed',
    )
)

In [25]:
def _splice_lifted_coordinates(c10, c11):
    """Combine the c10 annotation columns with the c11 start/end columns.

    Column 0 and columns 3 onwards are taken from ``c10``; columns 1-2 are
    taken from ``c11``. The frames are joined side by side, so row i of ``c11``
    must describe the same interval as row i of ``c10``.

    Raises:
        ValueError: if the two frames differ in length. In that case the rows
            no longer line up (presumably because liftOver sent some intervals
            to its "unmapped" output file -- TODO confirm) and joining them
            would silently attach coordinates to the wrong domains.
    """
    if len(c10) != len(c11):
        raise ValueError(
            'c10 and c11 tables have different numbers of rows ('
            + str(len(c10)) + ' vs ' + str(len(c11))
            + '); cannot pair them up by position'
        )
    return pd.concat([c10.iloc[:, 0], c11.iloc[:, 1:3], c10.iloc[:, 3:]], axis=1)


ee_domains_final = _splice_lifted_coordinates(c10_ee, c11_ee)
l3_domains_final = _splice_lifted_coordinates(c10_l3, c11_l3)

In [26]:
# Make thickStart/thickEnd mirror chromStart/chromEnd in both final tables
# (the frames are modified in place).
for final_table in (ee_domains_final, l3_domains_final):
    final_table['thickStart'] = final_table['chromStart']
    final_table['thickEnd'] = final_table['chromEnd']

In [27]:
ee_domains_final

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb
0,I,3997,54536,a_0,0,0,3997,54536,01380
1,I,71121,112986,a_1,0,0,71121,112986,01380
2,I,180181,216072,a_2,0,0,180181,216072,01380
3,I,287193,403731,a_3,0,0,287193,403731,01380
4,I,446366,542110,a_4,0,0,446366,542110,01380
...,...,...,...,...,...,...,...,...,...
4095,V,20499503,20577234,r_1036,0,0,20499503,20577234,00255
4096,V,20591781,20697823,r_1037,0,0,20591781,20697823,00255
4097,V,20707461,20708922,r_1038,0,0,20707461,20708922,00255
4098,V,20767045,20826761,r_1039,0,0,20767045,20826761,00255


In [28]:
# Save both final tables as tab-separated files, without a header row or the
# row index.
final_write_options = dict(header=False, index=False, sep='\t')
ee_domains_final.to_csv('c11_ee_domains_final.bed', **final_write_options)
l3_domains_final.to_csv('c11_l3_domains_final.bed', **final_write_options)

In [2]:
import pybedtools