### Drosophila melanogaster genes 

To execute bash code we created a snippet through nbextensions containing the following paths, in order to avoid copying and pasting cell contents. The %%bash magic does not recognize variables defined in previous cells, so every bash cell has to redefine them. The mkdir commands create the necessary paths.

In [None]:
%%bash
# Annotation root and one sub-folder per intermediate product.
DATA='/home/jmurga/mkt/201902/rawData/annotations'
BASIC='/home/jmurga/mkt/201902/rawData/annotations/basicAnnotation'
CDS='/home/jmurga/mkt/201902/rawData/annotations/cds'
GENES='/home/jmurga/mkt/201902/rawData/annotations/genes'
TEMPORAL='/home/jmurga/mkt/201902/rawData/annotations/tmp'

# mkdir -p creates missing parents and is a no-op for folders that already exist,
# so this cell can be executed any number of times.
mkdir -p \
    "${DATA}" \
    "${BASIC}" \
    "${CDS}" \
    "${GENES}" \
    "${TEMPORAL}"

Defining global paths for Python. Unlike the bash cells, these variables stay available in the kernel after the cell has been executed once.

In [None]:
# Folder holding the project's helper modules; it is prepended to sys.path
# before `reverseComplement` and `degen` are imported.
SRC='/home/jmurga/mkt/201902/scripts/src'
# Root of the raw data. Python cells build paths as DATA + '/annotations/...'
# and DATA + '/fastas/...'. Note that the bash cells define their own DATA,
# which points one level deeper (.../rawData/annotations).
DATA='/home/jmurga/mkt/201902/rawData'

Required libraries

In [None]:
import os
import re
import sys
import numpy as np
import pandas as pd
from numpy import array 
from pyfaidx import Fasta

#### Download Flybase annotation

In [None]:
%%bash
DATA='/home/jmurga/mkt/201902/rawData/annotations'

# Stop here if the annotation folder is missing; otherwise wget would silently
# download into whatever directory the kernel was started from.
cd "${DATA}" || exit 1

# Flybase annotation (release 5.57).
# -O overwrites a previous download instead of saving it as "<name>.1", so a
# re-run never leaves gunzip working on a stale archive.
wget -O dmel-all-no-analysis-r5.57.gff.gz \
    ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r5.57_FB2014_03/gff/dmel-all-no-analysis-r5.57.gff.gz || exit 1

# -f replaces an already uncompressed copy instead of refusing to overwrite it.
gunzip -f dmel-all-no-analysis-r5.57.gff.gz

#### Parsing and cleaning flybase annotation to execute all operations by chr
The following operations will be faster because grep runs on smaller files. Disk space is not a problem on our server right now. Each folder contains one specific file for each chromosome. 

In [None]:
%%bash
DATA='/home/jmurga/mkt/201902/rawData/annotations'
BASIC='/home/jmurga/mkt/201902/rawData/annotations/basicAnnotation'
CDS='/home/jmurga/mkt/201902/rawData/annotations/cds'
GENES='/home/jmurga/mkt/201902/rawData/annotations/genes'
TEMPORAL='/home/jmurga/mkt/201902/rawData/annotations/tmp'

mkdir -p "${BASIC}" "${CDS}" "${GENES}" "${TEMPORAL}"

# Extract annotations from gff file.
# First sed: drop everything from the first line starting with '>' to the end of the file.
# Second sed: drop the Het arms, chromosome 4, the mitochondrial genome, U, Uextra
# and every line containing a tab followed by "CG".
sed -e '/^>/,$d' "${DATA}/dmel-all-no-analysis-r5.57.gff" | sed -e '/^2LHet/d' -e  '/2RHet/d' -e '/3LHet/d' -e '/3RHet/d' -e  '/^4\t/d' -e '/dmel_mitochondrion_genome/d' -e '/^U\t/d' -e '/^Uextra\t/d' -e  '/^XHet\t/d' -e '/^YHet\t/d' -e'/\tCG/d' > "${DATA}/dmelFiltered.gff"
# Extract genes information
grep -P "\tgene\t" "${DATA}/dmelFiltered.gff"  > "${DATA}/dmelFilteredGenes.gff"
# Extract CDS information. Only protein coding genes
grep -P "\tCDS\t" "${DATA}/dmelFiltered.gff"  > "${DATA}/dmelFilteredCds.gff"
# Coding gene list: chromosome plus the first ';'-separated attribute of column 9,
# one line per distinct pair.
cut -f1,9 "${DATA}/dmelFilteredCds.gff" | tr ';' '\t' | cut -f1,2 | sort -u | sort -k1,1 > "${DATA}/codingGeneList.txt"


CHR=( 2L 2R 3L 3R X )
# Parse gff file by chr.
# The pattern is anchored with '^' so that only the sequence-name column (the
# first one) is tested; unanchored, "X\t" or "2L\t" would also match any other
# column that happens to end with the same characters.
for chrNumber in "${CHR[@]}"
do
    # NOTE(review): the "gencode.v27lift37" file name looks inherited from another
    # pipeline; it is kept because consumers of ${BASIC} are not visible here.
    grep -P "^${chrNumber}\t"  "${DATA}/dmelFilteredGenes.gff" | sort -k4,4n > "${BASIC}/gencode.v27lift37.basic.annotation.chr${chrNumber}.gff3"
    grep -P "^${chrNumber}\t"  "${DATA}/dmelFilteredCds.gff" | sort -k1,1 -k4,4n > "${CDS}/dmelFilteredCdsChr${chrNumber}.gff3"
    grep -P "^${chrNumber}\t"  "${DATA}/dmelFilteredGenes.gff" | sort -k1,1 -k4,4n > "${GENES}/dmelFilteredGenesChr${chrNumber}.gff3"
done

# The raw annotation is no longer needed once the filtered files exist.
# Re-running this cell therefore requires downloading the annotation again.
rm "${DATA}/dmel-all-no-analysis-r5.57.gff"

#### Basic cleaned gene file
This file will include information about chromosome, start coordinate, end coordinate, strand, gene id and gene name. It will be used to perform calculations on gene coordinates: a kind of gff file that is easier to work with.

In [None]:
%%bash
DATA='/home/jmurga/mkt/201902/rawData/annotations'
GENES='/home/jmurga/mkt/201902/rawData/annotations/genes'

# Write the header; '>' truncates any table left over from a previous run.
printf "chr\tstartGene\tendGene\tstrand\tid\tname\n" > "${DATA}/flybaseGenesCleaned.tab"

# codingGeneList.txt holds two tab-separated columns: chromosome and a CDS attribute.
# read -r keeps backslashes literal; IFS is limited to the tab so the two columns
# are split without relying on unquoted word splitting.
time while IFS=$'\t' read -r CHR GENE _
do
    # Remove the "-cds" suffix so the attribute can be looked up in the gene records.
    GENE=${GENE//-cds/}

    # Fixed-string search for "<attribute>;" in the per-chromosome gene file, then keep
    # chromosome, start, end, strand and the first two attributes of column 9.
    # The pattern is quoted so gene names containing glob characters or spaces are
    # searched literally.
    grep -F -- "${GENE};" "${GENES}/dmelFilteredGenesChr${CHR}.gff3" | grep -F -- "${CHR}" |  cut -f1,4,5,7,9 | tr ';' '\t' | cut -f1,2,3,4,5,6

done < "${DATA}/codingGeneList.txt" | tr ' ' '\t' | sed 's/ID=//g' | sed 's/Name=//g' | sort -k1,1 -k2,2n >> "${DATA}/flybaseGenesCleaned.tab"

In [None]:
# Load the cleaned gene table written by the previous bash cell
# (tab-separated, column names on the first row) and preview it.
genesTablePath = DATA + '/annotations/flybaseGenesCleaned.tab'
dfGenes = pd.read_csv(genesTablePath, sep='\t', header=0)
dfGenes.head()

File to merge features annotated with gene name instead of gene id

In [None]:
# Two-column lookup table (gene id, gene name), saved tab-separated with a
# header row and without the DataFrame index.
idName = dfGenes.loc[:, ['id', 'name']]
idNamePath = DATA + '/annotations/idName.tab'
idName.to_csv(idNamePath, sep='\t', header=True, index=False)

#### Extracting CDS coordinates from largest transcript by gene
From the largest transcript we recover id, size, and CDS coordinates. Moreover we save number of transcripts too.  
Coordinates will be used to recover CDS sequences and to calculate Derived Allele Frequencies from 0-fold and 4-fold degenerate sequences

In [None]:
%%bash
DATA='/home/jmurga/mkt/201902/rawData/annotations'
CDS='/home/jmurga/mkt/201902/rawData/annotations/cds'
TEMPORAL='/home/jmurga/mkt/201902/rawData/annotations/tmp'

# Write the header; '>' truncates any table left over from a previous run.
printf "name\tchr\tnumberOfTranscript\ttranscript\ttranscriptSize\tcoordinates\n" > "${DATA}/cdsCoordinates.tab"

count=0
# Columns of flybaseGenesCleaned.tab: chr, startGene, endGene, strand, id, name.
# Only the first (chr) and sixth (name) are needed; read -r keeps backslashes literal.
time tail -n+2 "${DATA}/flybaseGenesCleaned.tab" | while IFS=$'\t' read -r CHR _ _ _ _ GENE _
do
    # Progress trace: separator line followed by the number of genes written so far.
    echo '*************'
    printf '%s' "${count}"

    cdsFile="${CDS}/dmelFilteredCdsChr${CHR}.gff3"
    transcriptList="${TEMPORAL}/transcriptTmp.tab"

    # Every transcript id listed in the Parent attribute of the gene's CDS records,
    # one id per line, without duplicates.
    grep -F -- "Name=${GENE}-cds;" "${cdsFile}" | grep -P "^${CHR}\t" | cut -f9 | tr ';' '\n' | grep -F Parent | sort -u | tr ',' '\n' | cut -d'=' -f2 | sort -u > "${transcriptList}"

    # Without transcripts the lookups below would run grep with no pattern, which
    # makes grep read the loop's own stdin and swallow the remaining genes.
    if [[ ! -s "${transcriptList}" ]]; then
        echo "WARNING: no CDS transcript found for ${GENE} on ${CHR}; gene skipped" >&2
        continue
    fi

    # Total CDS length per transcript; keep the longest one.
    # GFF coordinates are 1-based and inclusive, so each exon spans end - start + 1 bases.
    largestTranscript=$(while read -r transcript; do
        grep -F -- "${transcript}" "${cdsFile}" | awk -F'\t' -v transcript="${transcript}" '{sum += $5 - $4 + 1} END {print transcript "\t" sum}'
    done < "${transcriptList}" | sort -nrk2,2 | head -1)

    IFS=$'\t' read -r transcriptId transcriptSize <<< "${largestTranscript}"
    numberOfTranscripts=$(wc -l < "${transcriptList}" | tr -d ' ')

    # CDS exons of the chosen transcript as "start,end,start,end,..." sorted by start;
    # sed removes the trailing comma.
    coordinates=$(grep -F -- "${transcriptId}" "${cdsFile}" | cut -f4,5 | sort -k1,1n | awk -F'\t' '{printf "%s,%s,", $1, $2}' | sed 's/.$//')

    printf '%s\t%s\t%s\t%s\t%s\t%s\n' "${GENE}" "${CHR}" "${numberOfTranscripts}" "${transcriptId}" "${transcriptSize}" "${coordinates}" >> "${DATA}/cdsCoordinates.tab"

    count=$(( count + 1 ))

done


Opening cdsCoordinates file to extract fasta sequence using pyfaidx

In [13]:
# Gene table restricted to the columns needed to annotate the CDS records.
geneColumns = ['id', 'name', 'chr', 'strand']
dfGenes = pd.read_csv(
    DATA + '/annotations/flybaseGenesCleaned.tab',
    sep='\t',
    header=0,
    usecols=geneColumns,
)

# CDS coordinates of the largest transcript of every gene.
cdsColumns = ['name', 'chr', 'coordinates']
cds = pd.read_csv(
    DATA + '/annotations/cdsCoordinates.tab',
    sep='\t',
    header=0,
    usecols=cdsColumns,
)

# Keep only genes present in both tables, matching on chromosome and gene name;
# this adds the strand and id columns to every CDS record.
cds = cds.merge(dfGenes, how='inner', on=['chr', 'name'])

In [14]:
# Preview the first rows of the merged CDS table.
cds.head()

Unnamed: 0,name,chr,coordinates,strand,id
0,CG11023,2L,768081168193858986689276,+,FBgn0031208
1,l(2)gl,2L,"11215,11344,11410,11518,11779,12221,12286,1292...",-,FBgn0002121
2,Ir21a,2L,"21919,22687,22743,22935,22994,23873,23929,2421...",-,FBgn0031209
3,Cda5,2L,"26521,26688,26766,26964,27053,27490,28015,2824...",-,FBgn0051973
4,dbr,2L,6762567762678926802368085705497060770895,+,FBgn0067779


In [17]:
# Put the project's source folder first on the module search path so the two
# helper modules below can be imported.
sys.path.insert(0, SRC)
from reverseComplement import reverseComplement
from degen import degenerate

In [28]:
# Rewrite file each execution
# multifasta = open(DATA + '/annotations/fop/cds.fa', 'w')

# For every selected gene: splice its CDS out of the reference, the outgroup and
# each population sample, stack the sequences in a character matrix
# (row 0 = reference, last row = outgroup, rows in between = samples) and, on the
# columns kept after masking, collect divergent sites (div) and allele
# frequencies (daf).
# NOTE(review): work in progress. The filter below only selects '+' strand genes
# on 2L, so the '-' branch is currently never reached, and that branch only fills
# the matrices without computing div/daf.
for index, row in cds[(cds['chr']=='2L') & (cds['strand']=='+')].iterrows():
# for index, row in cds.iterrows():
    print(index)
    # Convert the comma-separated coordinate string into a list of [start, end] pairs
    coordinates = array(row['coordinates'].split(',')).astype(int).tolist()
    coordinates =  [coordinates[i:i+2] for i in range(0, len(coordinates), 2)]

    # Open ref and outgroup
    ref = Fasta(DATA + '/fastas/ref/Chr' + row['chr'] +'.fasta')
    outgroup = Fasta(DATA + '/fastas/outgroup/dsim/Simulans_Chr' + row['chr'] +'.seq')

    ## Extract ref and outgroup seq
    refSeq = ref.get_spliced_seq(row['chr'], coordinates).seq.upper()
    outgroupSeq = outgroup.get_spliced_seq('dsim' + row['chr'], coordinates).seq.upper()

    # Open population multifasta
    popFasta = Fasta(DATA + '/fastas/alignments/RAL/' + 'RAL_' + 'Chr' + row['chr'] +'.seq')

    # Extract samples
    samples = list(popFasta.keys())

    # Open variables and file to write
#     multifasta = open(DATA + '/fastas/genes/RAL/' + row['id'] +'.fa', 'w')
    aln = ''
    ## Empty matrix to append ref, pop and outgroup
    matrix0f=np.empty([len(samples)+2,len(refSeq)],dtype='str')
    matrix4f=np.empty([len(samples)+2,len(refSeq)],dtype='str')

    if(row['strand'] == '-'):
    #     if((len(refSeq)/3).is_integer()):
        refSeq = reverseComplement(refSeq)
        refSeq0f,refSeq4f = degenerate(refSeq)
        outgroupSeq = reverseComplement(outgroupSeq)

        matrix4f[0] = list(refSeq4f)
        matrix0f[0] = list(refSeq0f)
        # The outgroup goes in the last row. Index len(matrix) is one past the end
        # and raised IndexError; -1 matches what the '+' branch does.
        matrix4f[-1] = list(outgroupSeq)
        matrix0f[-1] = list(outgroupSeq)

        # Sample k fills matrix row k+1 (row 0 is the reference). Indexing the
        # samples with the row number skipped the first sample and ran past the
        # last one.
        for i, sample in enumerate(samples, start=1):
            tmp = popFasta.get_spliced_seq(sample, coordinates).seq.upper()
            tmp = reverseComplement(tmp)
            matrix0f[i] = list(tmp)
        # aln += '>' + i + '\n' + tmp + '\n'
    else:

        refSeq0f,refSeq4f = degenerate(refSeq)

        matrix4f[0] = list(refSeq4f)
        matrix4f[len(matrix4f)-1] = list(outgroupSeq)

        # Iter by row matrix to input sequences
        deleteIndex = []
        for i in range(0,len(samples),1):
            tmp = popFasta.get_spliced_seq(samples[i], coordinates).seq.upper()
            if(tmp == 'N' * len(tmp)):
                # Save lines index with only N
                deleteIndex.append(i+1)
            else:
                matrix4f[i+1] = list(tmp)
            # aln += '>' + i + '\n' + tmp + '\n'
        # Delete lines with only N
        matrix4f = np.delete(matrix4f,deleteIndex,0)
        # Iter by matrix column: where the reference row is 'N', mask the whole column
        # NOTE(review): matrix0f is never filled in this branch, so its test below
        # compares against the untouched np.empty contents - confirm the intent.
        for j in range(0,matrix4f.shape[1],1):
            # print(j)
            if(matrix4f[0][j] == 'N'):
                matrix4f[:,j]='N'
            if(matrix0f[0][j]=='N'):
                matrix0f[:,j]='N'

        # Keep only the columns whose reference row is not 'N'
        df4f = pd.DataFrame(matrix4f)
        df4f = df4f.loc[:,df4f.iloc[0]!='N']
        div = []
        daf = []
#         print(df4f).head()
        for i in df4f.columns:
            # .loc slices include both ends: rows 1..len-2 are the population samples
            popAlleles = df4f.loc[1:len(df4f)-2,i].unique()
            # Divergent site: outgroup differs from the reference and the population
            # carries a single non-N allele
            if((df4f.loc[len(df4f)-1,i] != df4f.loc[0,i]) and (len(popAlleles[popAlleles!='N'])==1)):
                div.append(i)
            else:
                # Allele carried by the outgroup (last row)
                AA = df4f.loc[len(df4f)-1,i]
                # NOTE(review): these two slices run to row len-1, so they include
                # the outgroup row, and AN also counts 'N' calls - confirm whether
                # the population-only slice (1..len-2) was intended.
                AN = len(df4f.loc[1:len(df4f)-1,i])
                AC = df4f.loc[1:len(df4f)-1,i]
                totalAlleles = AC[AC!='N'].unique()
                AC = AC[AC!='N'].value_counts()

                if(len(totalAlleles) == 1):
                    af = 0
                else:
                    # Count of the most frequent allele other than the outgroup one.
                    # .iloc makes the positional access explicit: the counts are
                    # indexed by allele letter, not by integer.
                    # NOTE(review): AC.drop(AA) raises KeyError when the outgroup
                    # allele is 'N', because 'N' was filtered out of AC above.
                    af = AC.drop(AA).iloc[0]/AN
                    daf.append(af)
#             print(daf)

   5    11   20   29   35   41   53   56   59   65    ... 1364 1367 1373 1379  \
0     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
1     N    N    N    N    N    N    N    N    N    N  ...    G    A    C    C   
2     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
3     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
4     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
5     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
6     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
7     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
8     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
9     C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
10    C    G    A    A    G    A    G    A    A    G  ...    G    A    C    C   
11    C    G    A    A    G 

AttributeError: 'NoneType' object has no attribute 'head'