This notebook has the code for:
1) Making a table with the corresponding gene name for each transcript from wormbase
2) Making a table with just one entry per gene, with locations corresponding to the longest transcript

In [1]:
import pandas as pd
import numpy as np
import pybedtools

In [2]:
from collections import defaultdict

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
!ls

README.txt
briggsae_get_good_gene_df.ipynb
briggsae_one_of_each_gene.bed
briggsae_to_compare.bed
c_briggsae_PRJNA10731.WS275.canonical_geneset.bed
c_briggsae_transcripts_and_genes.txt
c_elegans_PRJNA13758.WS275.canonical_geneset.bed
c_elegans_transcripts_and_genes.txt
elegans_get_good_gene_df.ipynb
elegans_one_of_each_gene.bed
elegans_to_compare.bed
get_transcript_to_gene_map.ipynb
non_bed


### Reading in files

In [2]:
chrom_bed_columns=['chrom','chromStart','chromEnd','name',
       'score','strand','thickStart','thickEnd','itemRgb']
bed_12_columns=['chrom','chromStart','chromEnd','name',
       'score','strand','thickStart','thickEnd','itemRgb',
        'blockCount', 'blockSizes', 'blockStart']

In [6]:
# all the transcripts with all the info from wormbase (no gene IDs)
ts=pd.read_csv('../canonical_genesets/c_elegans_PRJNA13758.WS275.canonical_geneset.bed', names=bed_12_columns,sep='\t')

# only coding transcripts and their corresponding gene ID from Francesco
ts_gs=pd.read_csv('../canonical_genesets/c_elegans_transcripts_and_genes.txt',names=['gene','transcript'], sep='\t')

In [8]:
ts

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStart
0,MtDNA,0,55,MTCE.1,0,+,55,55,0,1,55,0
1,MtDNA,57,111,MTCE.2,0,+,111,111,0,1,54,0
2,MtDNA,112,549,MTCE.3.1,0,+,112,544,0,1,437,0
3,MtDNA,548,783,MTCE.4.1,0,+,548,779,0,1,235,0
4,MtDNA,784,840,MTCE.5,0,+,840,840,0,1,56,0
...,...,...,...,...,...,...,...,...,...,...,...,...
71095,II,15268211,15271972,2RSSE.1c.1,0,+,15268480,15270990,0,4,3433361721177,0124716482584
71096,II,15273616,15273693,2RSSE.6,0,+,15273693,15273693,0,1,77,0
71097,II,15277889,15278575,2RSSE.3,0,-,15278575,15278575,0,3,156108301,0230385
71098,II,15279228,15279420,2RSSE.5,0,+,15279420,15279420,0,1,192,0


In [7]:
ts_gs

Unnamed: 0,gene,transcript
0,WBGene00022277,Y74C9A.3.2
1,WBGene00022277,Y74C9A.3.1
2,WBGene00022276,Y74C9A.2a.1
3,WBGene00022276,Y74C9A.2a.3
4,WBGene00022276,Y74C9A.2a.4
...,...,...
43035,WBGene00017625,F20B4.6.1
43036,WBGene00017625,F20B4.6.2
43037,WBGene00019189,H11L12.1.1
43038,WBGene00007068,cTel55X.1b.1


### Making a bed file that has every gene only once, using the coordinates of the longest transcript from the wormbase document

In [9]:
# add the gene IDs to the transcript data frame from wormbase

total_df=ts
total_df['gene']=''

for i in range(len(ts_gs)):
    row=total_df.index[total_df['name']==ts_gs['transcript'][i]]
    total_df.at[row,'gene']=ts_gs['gene'][i]

In [256]:
len(total_df)

71100

In [267]:
# These are the transcripts from wormbase that don't have a corresponding gene ID from the document from Francesco
# These correspond mostly to noncoding RNAs, and the project won't deal with them (too divergent and repetitive, difficult to do orthology)

absent_df=total_df[total_df['gene']=='']
absent_df.to_csv('absent_transcripts.bed',header=False,index=False, sep='\t')

In [273]:
# These are the coding transcripts that we'll be looking at

genes_df=total_df[total_df['gene']!='']
len(genes_df)

43040

In [None]:
# get a list of each gene just once

one_of_each_gene=[]
for i in ts_gs['gene']:
    if i not in one_of_each_gene:
        one_of_each_gene.append(i)

In [277]:
# Find longest isoform

genes_df['transcript_size']=list(genes_df['chromEnd']-genes_df['chromStart'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [326]:
# make a table of just the longest isoforms of each gene

one_to_one=pd.DataFrame(columns=genes_df.columns)
for i in one_of_each_gene:
    to_compare=genes_df[genes_df['gene']==i]
    longest=to_compare['transcript_size'].argmax()
    row=to_compare.iloc[longest,:]
    to_concat=row.to_frame().T
    one_to_one=pd.concat([one_to_one, to_concat])

In [334]:
# get into bed format

one_to_one['name']=one_to_one['gene']
one_to_one=one_to_one.drop('transcript_size', axis=1).drop('gene', axis=1)

In [336]:
one_to_one.to_csv('/home/helena_hatrick/part_ii_project/canonical_genesets/elegans_one_of_each_gene.bed', header=False, index=False, sep='\t')

In [1]:
import pandas as pd

In [5]:
test=pd.read_csv('/home/helena_hatrick/part_ii_project/canonical_genesets/elegans_one_of_each_gene.bed',names=bed_12_columns,sep='\t')

In [6]:
test[test['name']=='WBGene00001488']

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStart
2855,I,14875783,14900213,WBGene00001488,0,-,14876395,14900213,0,13,"702,153,277,101,123,11688,68,167,251,398,151,1...","0,1414,2181,3109,5268,6271,18792,18909,19402,2..."
