This notebook contains the code for:
1) Building a table that gives the corresponding gene name for each WormBase transcript
2) Building a table with exactly one entry per gene, using the coordinates of that gene's longest transcript

In [96]:
import pandas as pd
import numpy as np
import pybedtools

In [97]:
# IPython shell escape: list the contents of the current working directory
!ls

 background_genes.txt		       genes_in_ee_domains_95.bed
 briggsae_get_good_gene_df.ipynb       genes_in_ee_domains_99.bed
 changed_domain_gene_characteristics   genes_in_l3_domains_100.bed
 changed_domain_genes.txt	       genes_in_l3_domains_51.bed
 elegans_get_good_gene_df.ipynb        genes_in_l3_domains_55.bed
 genes_in_ee_domain		       genes_in_l3_domains_60.bed
 genes_in_ee_domains_100.bed	       genes_in_l3_domains_65.bed
 genes_in_ee_domains_51.bed	       genes_in_l3_domains_70.bed
 genes_in_ee_domains_55.bed	       genes_in_l3_domains_75.bed
 genes_in_ee_domains_60.bed	       genes_in_l3_domains_80.bed
 genes_in_ee_domains_65.bed	       genes_in_l3_domains_85.bed
 genes_in_ee_domains_70.bed	       genes_in_l3_domains_90.bed
 genes_in_ee_domains_75.bed	       genes_in_l3_domains_95.bed
 genes_in_ee_domains_80.bed	       genes_in_l3_domains_99.bed
 genes_in_ee_domains_85.bed	       get_intersects.sh
 genes_in_ee_domains_90.bed	      'what_genes_in_what_domains?.i

### Reading in files

In [98]:
# Column names for a 9-field BED record.
chrom_bed_columns = [
    'chrom', 'chromStart', 'chromEnd',
    'name', 'score', 'strand',
    'thickStart', 'thickEnd', 'itemRgb',
]

# 12-field BED record: the nine fields above followed by the three block fields.
bed_12_columns = [*chrom_bed_columns, 'blockCount', 'blockSizes', 'blockStart']

In [99]:
# WormBase canonical geneset: every transcript with its full BED12 record
# (this file carries no gene IDs).
ts = pd.read_csv(
    '../canonical_genesets/c_briggsae_PRJNA10731.WS275.canonical_geneset.bed',
    sep='\t',
    names=bed_12_columns,
)

# Coding transcripts only, each paired with its gene ID (table from Francesco).
ts_gs = pd.read_csv(
    '../canonical_genesets/c_briggsae_transcripts_and_genes.txt',
    sep='\t',
    names=['gene', 'transcript'],
)

In [100]:
# display the gene / transcript table
ts_gs

Unnamed: 0,gene,transcript
0,WBGene00031772,CBG10375.1
1,WBGene00031775,CBG10379.1
2,WBGene00031776,CBG10380.1
3,WBGene00031777,CBG10381.1
4,WBGene00031778,CBG10382.1
...,...,...
24811,WBGene00089540,CBG28126c.1
24812,WBGene00089540,CBG28126b.1
24813,WBGene00089540,CBG28126a.1
24814,WBGene00031934,CBG10582.1


In [101]:
# display the transcript table (12 BED columns)
ts

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStart
0,cb25.NA_274,620,1742,CBG24927.1,0,+,620,1742,0,2,81819,0303
1,cb25.NA_274,3322,4554,CBG24928.1,0,+,3322,4554,0,2,141258,0974
2,cb25.NA_001,1567,2399,CBG26632,0,+,2399,2399,0,4,2837972106,0329606726
3,cb25.NA_001,3619,4187,CBG23550.1,0,+,3619,4187,0,3,11348109,0182459
4,cb25.NA_042,175,2171,CBG23789.1,0,+,175,2171,0,6,98154121102109151,06981368154016881845
...,...,...,...,...,...,...,...,...,...,...,...,...
27167,cb25.fpc4366,13494,17431,CBG22564.1,0,+,14884,17383,0,6,1043555140308401300,013901996218231883637
27168,cb25.fpc4366,14514,17431,CBG22564.2,0,+,14884,17383,0,5,925140308401300,0976116221682617
27169,cb25.fpc4366,20644,21665,CBG22565.1,0,-,20644,21665,0,3,252426294,0299727
27170,cb25.fpc4366,28986,29391,CBG22568.1,0,-,28986,29391,0,1,405,0


### Making a bed file that has every gene only once, using the coordinates of the longest transcript from the wormbase document

In [122]:
# Add the gene IDs to the transcript data frame from WormBase.
#
# A transcript -> gene lookup is built once and mapped over the transcript
# names, instead of scanning the whole 'name' column once per ts_gs row.
# Transcripts with no entry in ts_gs get an empty string, which later cells
# use to tell matched from unmatched transcripts.

total_df = ts.copy()

# If a transcript were listed more than once in ts_gs, the last gene wins,
# as it did when rows were assigned one at a time in file order.
gene_by_transcript = dict(zip(ts_gs['transcript'], ts_gs['gene']))

total_df['gene'] = total_df['name'].map(gene_by_transcript).fillna('')

In [103]:
# display the transcript table with the added 'gene' column
total_df

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStart,gene
0,cb25.NA_274,620,1742,CBG24927.1,0,+,620,1742,0,2,81819,0303,WBGene00042923
1,cb25.NA_274,3322,4554,CBG24928.1,0,+,3322,4554,0,2,141258,0974,WBGene00042924
2,cb25.NA_001,1567,2399,CBG26632,0,+,2399,2399,0,4,2837972106,0329606726,
3,cb25.NA_001,3619,4187,CBG23550.1,0,+,3619,4187,0,3,11348109,0182459,WBGene00041882
4,cb25.NA_042,175,2171,CBG23789.1,0,+,175,2171,0,6,98154121102109151,06981368154016881845,WBGene00042051
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27167,cb25.fpc4366,13494,17431,CBG22564.1,0,+,14884,17383,0,6,1043555140308401300,013901996218231883637,WBGene00041096
27168,cb25.fpc4366,14514,17431,CBG22564.2,0,+,14884,17383,0,5,925140308401300,0976116221682617,WBGene00041096
27169,cb25.fpc4366,20644,21665,CBG22565.1,0,-,20644,21665,0,3,252426294,0299727,WBGene00041097
27170,cb25.fpc4366,28986,29391,CBG22568.1,0,-,28986,29391,0,1,405,0,WBGene00041100


In [104]:
# transcripts that did not receive a gene ID (empty 'gene' field)
total_df.loc[total_df['gene'].eq('')]

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStart,gene
2,cb25.NA_001,1567,2399,CBG26632,0,+,2399,2399,0,4,2837972106,0329606726,
7,cb25.NA_042,17538,17912,CBG23792,0,-,17912,17912,0,4,55586260,0103209314,
14,cb25.NA_330,980,1053,CBG28302,0,+,1053,1053,0,1,73,0,
24,cb25.NA_209,15900,17726,CBG26745,0,-,17726,17726,0,6,427866289043329,0473602127214081497,
33,cb25.NA_389,1414,1599,CBG25072,0,+,1599,1599,0,1,185,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26949,II,15877712,15879209,CBG07150,0,+,15879209,15879209,0,1,1497,0,
26976,II,15964584,15964657,CBG28903,0,-,15964657,15964657,0,1,73,0,
27042,II,16301773,16302465,CBG25792,0,-,16302465,16302465,0,3,28194220,0327472,
27074,II,16406151,16406292,CBG29459,0,-,16406292,16406292,0,1,141,0,


In [117]:
# Sanity check on why some transcripts have no gene ID: compare how many
# transcript names lack a '.' in each table.

# Index labels of ts_gs rows whose transcript name has no dot.
# (The original appended to a misspelled list name here, which would raise
# NameError as soon as one such row existed.)
ts_gs_rows_without_dot = [
    idx for idx, transcript in ts_gs['transcript'].items()
    if '.' not in transcript
]

# Index labels of total_df rows whose transcript name has no dot.
ts_rows_without_dot = [
    idx for idx, name in total_df['name'].items()
    if '.' not in name
]

# CBG26632 is a gene without a dot in the ts dataframe.
# Like all the other genes without a dot in the ts dataframe, it is not
# present in the ts_gs dataframe.
transcript_stem_in_ts_gs = [
    idx for idx, transcript in ts_gs['transcript'].items()
    if 'CBG26632' in transcript
]

print(len(ts_gs_rows_without_dot))
print(len(ts_rows_without_dot))
print(len(transcript_stem_in_ts_gs))

0
2356
0


In [123]:
# Transcripts from WormBase with no matching gene ID in the document from Francesco.
# They are mostly noncoding RNAs, which the project leaves out (too divergent
# and repetitive, so orthology is difficult).

absent_df = total_df.loc[total_df['gene'].eq('')].copy()
absent_df.to_csv(
    'absent_transcripts.bed',
    sep='\t',
    index=False,
    header=False,
)

In [124]:
# Coding transcripts (those that received a gene ID): the set analysed from here on

genes_df = total_df.loc[total_df['gene'].ne('')].copy()
len(genes_df)

24816

In [125]:
# Get a list of each gene just once, in order of first appearance in ts_gs.
# dict.fromkeys removes duplicates while preserving insertion order, without
# the repeated list scans of a manual "if not in list: append" loop.

one_of_each_gene = list(dict.fromkeys(ts_gs['gene']))

In [126]:
# Genomic span of every transcript (chromEnd - chromStart); this is the size
# used to find the longest isoform of each gene.

genes_df['transcript_size'] = (genes_df['chromEnd'] - genes_df['chromStart']).to_numpy()

In [127]:
# Make a table of just the longest isoform of each gene.
#
# For every gene, idxmax gives the row label of the transcript with the
# largest transcript_size (first occurrence on ties, matching argmax). One
# .loc lookup then pulls those rows, instead of concatenating a one-row frame
# per gene, which was quadratic and turned every column into object dtype.

longest_row_by_gene = genes_df.groupby('gene')['transcript_size'].idxmax()

# Keep the genes in the order of one_of_each_gene. A gene in that list with
# no transcript in genes_df raises KeyError here rather than being skipped.
one_to_one = genes_df.loc[longest_row_by_gene.loc[one_of_each_gene]].copy()

In [128]:
# Put the table into BED layout: the gene ID replaces the transcript name in
# the 'name' field, and the two helper columns are removed.

one_to_one = (
    one_to_one
    .assign(name=one_to_one['gene'])
    .drop(columns=['transcript_size', 'gene'])
)

In [129]:
# display the one-row-per-gene table
one_to_one

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStart
12068,cb25.fpc2310b,379,1542,WBGene00031772,0,+,379,1542,0,3,328191510,0416653
12070,cb25.fpc2310b,11420,13201,WBGene00031775,0,+,11420,13201,0,4,171573130134,084114661647
12071,cb25.fpc2310b,13842,14490,WBGene00031776,0,-,13863,14460,0,3,117210219,0171429
12072,cb25.fpc2310b,15090,15623,WBGene00031777,0,-,15090,15623,0,3,12528240,0169493
12073,cb25.fpc2310b,17740,18677,WBGene00031778,0,-,17740,18677,0,3,456191187,0512750
...,...,...,...,...,...,...,...,...,...,...,...,...
16769,X,21506171,21506490,WBGene00089542,0,+,21506171,21506392,0,2,144128,0191
16771,X,21508226,21509705,WBGene00089541,0,-,21508226,21509705,0,2,148131,01348
16772,X,21512187,21514843,WBGene00089540,0,-,21512187,21514843,0,11,432225237164127192357417713484,"0,850,1120,1400,1610,1780,2014,2051,2173,2393,..."
16775,X,21517869,21522461,WBGene00031934,0,+,21517869,21522461,0,2,107049,04543


In [130]:
# Write the one-row-per-gene table as a tab-separated file with no header and no index.
one_to_one.to_csv(
    '../canonical_genesets/briggsae_one_of_each_gene.bed',
    sep='\t',
    index=False,
    header=False,
)