In [1]:
import pandas as pd

In [2]:
chr12_gtf = pd.read_table('../dataset/genes_ucsc.chr12.mod.gtf')

In [6]:
chr12_gtf.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes
0,chr12_partial,unknown,exon,1,892,.,+,0,"gene_id ""ATXN2_partial""; gene_name ""ATXN2_part..."
1,chr12_partial,unknown,CDS,162,892,.,+,0,"gene_id ""ATXN2_partial""; gene_name ""ATXN2_part..."
2,chr12,unknown,exon,43757,43793,.,+,0,"gene_id ""ATXN2""; gene_name ""ATXN2""; p_id ""P137..."
3,chr12,unknown,CDS,43757,43793,.,+,0,"gene_id ""ATXN2""; gene_name ""ATXN2""; p_id ""P137..."
4,chr12,unknown,exon,45459,45518,.,+,0,"gene_id ""ATXN2""; gene_name ""ATXN2""; p_id ""P137..."


In [5]:
# let's re-enter the dataset, this time setting our column names
chr12_gtf = pd.read_table('../dataset/genes_ucsc.chr12.mod.gtf', names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes'])

In [None]:
# get the first 4 records
chr12_gtf.head(4)

In [7]:
# get the (row, col) counts of the dataframe
chr12_gtf.shape

(47113, 9)

In [9]:
# get the datatypes of the data by column
chr12_gtf.dtypes

seqname       object
source        object
feature       object
start          int64
end            int64
score         object
strand        object
frame         object
attributes    object
dtype: object

In [10]:
# look at the top 10 entries in the feature column of the GTF file
print(chr12_gtf['feature'].head(10))

0    exon
1     CDS
2    exon
3     CDS
4    exon
5     CDS
6    exon
7     CDS
8    exon
9     CDS
Name: feature, dtype: object


In [11]:
# how many entries are there for each kind of "feature" in this GTF file?
print(chr12_gtf['feature'].value_counts())

exon           23203
CDS            20284
start_codon     1818
stop_codon      1808
Name: feature, dtype: int64


In [13]:
# find the length for each string in the gtf file, append the result to the dataframe as a new column.
chr12_gtf['length'] = chr12_gtf['end'] - chr12_gtf['start']

In [14]:
# print the first 5 records
print(chr12_gtf.head())

         seqname   source feature  start    end score strand frame  \
0  chr12_partial  unknown    exon      1    892     .      +     0   
1  chr12_partial  unknown     CDS    162    892     .      +     0   
2          chr12  unknown    exon  43757  43793     .      +     0   
3          chr12  unknown     CDS  43757  43793     .      +     0   
4          chr12  unknown    exon  45459  45518     .      +     0   

                                          attributes  length  
0  gene_id "ATXN2_partial"; gene_name "ATXN2_part...     891  
1  gene_id "ATXN2_partial"; gene_name "ATXN2_part...     730  
2  gene_id "ATXN2"; gene_name "ATXN2"; p_id "P137...      36  
3  gene_id "ATXN2"; gene_name "ATXN2"; p_id "P137...      36  
4  gene_id "ATXN2"; gene_name "ATXN2"; p_id "P137...      59  


In [19]:
# get all the exons in the dataframe
exons = chr12_gtf['feature'] == 'exon' # returns a 1D series of boolean values (True or False)
chr12_gtfsmall = chr12_gtf[exons] # if True, record gets written into chr12_gtfsmall
# for the exons, pull out the following columns: seqname, start, end and attributes.
print(chr12_gtfsmall[['seqname','start','end','attributes']])

             seqname      start        end  \
0      chr12_partial          1        892   
2              chr12      43757      43793   
4              chr12      45459      45518   
6              chr12      46699      46770   
8              chr12      47246      47396   
...              ...        ...        ...   
47104          chr12  133803522  133803738   
47106          chr12  133804410  133804533   
47108          chr12  133808103  133808230   
47110          chr12  133810694  133810964   
47112          chr12  133812393  133812422   

                                              attributes  
0      gene_id "ATXN2_partial"; gene_name "ATXN2_part...  
2      gene_id "ATXN2"; gene_name "ATXN2"; p_id "P137...  
4      gene_id "ATXN2"; gene_name "ATXN2"; p_id "P137...  
6      gene_id "ATXN2"; gene_name "ATXN2"; p_id "P137...  
8      gene_id "ATXN2"; gene_name "ATXN2"; p_id "P137...  
...                                                  ...  
47104  gene_id "ANHX"; gene_name "

In [20]:
# find all the rows where the attributes column contains ATXN2 as a gene name
# AND the row feature == CDS
atxn2_cds = (chr12_gtf['attributes'].str.contains('ATXN2')) & (chr12_gtf['feature']=='CDS')

In [21]:
print(chr12_gtf[atxn2_cds].sort_values('start')) # sort the resultset by start values

          seqname   source feature   start     end score strand frame  \
1   chr12_partial  unknown     CDS     162     892     .      +     0   
3           chr12  unknown     CDS   43757   43793     .      +     0   
5           chr12  unknown     CDS   45459   45518     .      +     0   
7           chr12  unknown     CDS   46699   46770     .      +     0   
9           chr12  unknown     CDS   47246   47396     .      +     0   
11          chr12  unknown     CDS   74360   74484     .      +     0   
13          chr12  unknown     CDS   78703   78794     .      +     0   
15          chr12  unknown     CDS   79600   79797     .      +     0   
17          chr12  unknown     CDS   81249   81427     .      +     0   
19          chr12  unknown     CDS   83313   83522     .      +     0   
21          chr12  unknown     CDS   86137   86319     .      +     0   
23          chr12  unknown     CDS   89094   89291     .      +     0   
25          chr12  unknown     CDS   89678   89785 