Define the global paths in Python. Once this cell has been executed, these variables remain available to every later Python cell.

In [None]:
# Directory holding the ad-hoc helper modules; it is added to sys.path before they are imported.
SRC='/home/jmurga/mktComparison/scripts/src'
# Root of the D. melanogaster raw-data tree; later cells append sub-paths such as '/annotations/...'.
DATA='/home/jmurga/mktComparison/rawData/dmel'
# Root of the shared sequence data; reference FASTA files are read from FASTAS + '/ref/Chr<chr>.fasta'.
FASTAS='/data/shared/dgn'

Required libraries

In [None]:
import os
import re
import sys
import numpy as np
import pandas as pd
import pyfaidx as px

Ad-hoc scripts

In [None]:
# Make the project's helper modules stored in SRC importable.
sys.path.insert(0, SRC)
from reverseComplement import reverseComplement
# NOTE(review): a later cell calls degenerateFullPositions(), which is not imported here — confirm where it is defined.
from degenerancy import degenerate
# NOTE(review): a later cell defines its own foldPositions(), which replaces this import once that cell has run.
from foldPositions import foldPositions
from nDistribution import nDistribution
# NOTE(review): duplicate of the reverseComplement import above.
from reverseComplement import reverseComplement

### Drosophila melanogaster genes 

To execute bash code we created a snippet through nbextensions containing the following paths, so that cells do not have to be copied and pasted. The %%bash magic does not recognize variables defined in previous cells, so the paths are redefined in every bash cell. The mkdir commands create the necessary directories.

In [4]:
%%bash
# Directory layout used by the bash cells of this notebook.
DATA='/home/jmurga/mktComparison/rawData/dmel/annotations'
BASIC='/home/jmurga/mktComparison/rawData/dmel/annotations/basicAnnotation'
CDS='/home/jmurga/mktComparison/rawData/dmel/annotations/cds'
GENES='/home/jmurga/mktComparison/rawData/dmel/annotations/genes'
TEMPORAL='/home/jmurga/mktComparison/rawData/dmel/annotations/tmp'
ALLELEFREQ='/home/jmurga/mktComparison/rawData/dmel/alleleFrequencies'

# Create every directory (and any missing parent); existing directories are left untouched.
for dir in "${DATA}" "${BASIC}" "${CDS}" "${GENES}" "${TEMPORAL}" "${ALLELEFREQ}" "${ALLELEFREQ}/dsim" "${ALLELEFREQ}/dyak"
do
    mkdir -p "${dir}"
done

#### Download Flybase annotation

In [None]:
%%bash
# Directory that receives the downloaded annotation.
DATA='/home/jmurga/mktComparison/rawData/dmel/annotations'

cd ${DATA}
# FlyBase annotation: D. melanogaster release 5.57 GFF ("no-analysis" file), fetched over FTP.
wget ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r5.57_FB2014_03/gff/dmel-all-no-analysis-r5.57.gff.gz
# Decompress in place, leaving dmel-all-no-analysis-r5.57.gff in ${DATA}.
gunzip dmel-all-no-analysis-r5.57.gff.gz

#### Parsing and cleaning flybase annotation to execute all operations by chr
Operating by chromosome is faster because grep runs on smaller files. Each folder contains one specific file for each chromosome. 

In [None]:
%%bash
# Filter the FlyBase GFF and split the gene and CDS records into one file per chromosome arm.
DATA='/home/jmurga/mktComparison/rawData/dmel/annotations'
BASIC='/home/jmurga/mktComparison/rawData/dmel/annotations/basicAnnotation'
CDS='/home/jmurga/mktComparison/rawData/dmel/annotations/cds'
GENES='/home/jmurga/mktComparison/rawData/dmel/annotations/genes'
TEMPORAL='/home/jmurga/mktComparison/rawData/dmel/annotations/tmp'

mkdir -p ${BASIC}
mkdir -p ${CDS}
mkdir -p ${GENES}
mkdir -p ${TEMPORAL}

# Extract annotations from gff file: the first sed drops everything from the first line starting with '>' to the end of the file;
# the second sed drops the Het arms, chromosome 4, the mitochondrial genome, U and Uextra records,
# and every line containing a tab followed by 'CG' (NOTE(review): purpose of the '\tCG' filter is not evident here — confirm).
sed -e '/^>/,$d' ${DATA}/dmel-all-no-analysis-r5.57.gff | sed -e '/^2LHet/d' -e  '/2RHet/d' -e '/3LHet/d' -e '/3RHet/d' -e  '/^4\t/d' -e '/dmel_mitochondrion_genome/d' -e '/^U\t/d' -e '/^Uextra\t/d' -e  '/^XHet\t/d' -e '/^YHet\t/d' -e'/\tCG/d' > ${DATA}/dmelFiltered.gff
# Extract genes information (records whose feature type is "gene")
grep -P "\tgene\t" ${DATA}/dmelFiltered.gff  > ${DATA}/dmelFilteredGenes.gff
# Extract CDS information. Only protein coding genes
grep -P "\tCDS\t" ${DATA}/dmelFiltered.gff  > ${DATA}/dmelFilteredCds.gff
# Coding gene list: keep the chromosome (column 1) and the first ';'-separated attribute of column 9, de-duplicated and sorted by chromosome.
# NOTE(review): the previous comment said the gene id is the 3rd attribute field, but this pipeline keeps the first one — confirm.
cut -f1,9 ${DATA}/dmelFilteredCds.gff | tr ';' '\t' | cut -f1,2 | sort -u | sort -k1,1 > ${DATA}/codingGeneList.txt


CHR=( 2L 2R 3L 3R X )
# Parse gff file by chr: the pattern "<chr>\t" is not anchored to the start of the line.
# NOTE(review): the ${BASIC} file holds the same gene records as the ${GENES} file (sorted by start only) under a gencode-style name — confirm it is still needed.
for chrNumber in "${CHR[@]}"
do
    grep -P "${chrNumber}\t"  ${DATA}/dmelFilteredGenes.gff | sort -k4,4n > ${BASIC}/gencode.v27lift37.basic.annotation.chr${chrNumber}.gff3 
    grep -P "${chrNumber}\t"  ${DATA}/dmelFilteredCds.gff | sort -k1,1 -k4,4n > ${CDS}/dmelFilteredCdsChr${chrNumber}.gff3 
    grep -P "${chrNumber}\t"  ${DATA}/dmelFilteredGenes.gff | sort -k1,1 -k4,4n > ${GENES}/dmelFilteredGenesChr${chrNumber}.gff3 
done

# The raw GFF is deleted, so re-running this cell requires downloading it again.
rm ${DATA}/dmel-all-no-analysis-r5.57.gff

#### Basic cleaned gene file
This file includes the chromosome, start coordinate, end coordinate, strand, gene id and gene name of every gene. It will be used to perform calculations on gene coordinates; it is a GFF-like file that is easier to work with.

In [None]:
%%bash
# Build flybaseGenesCleaned.tab: one row per coding gene with chr, start, end, strand, id and name.
DATA='/home/jmurga/mktComparison/rawData/dmel/annotations'
GENES='/home/jmurga/mktComparison/rawData/dmel/annotations/genes'
touch ${DATA}/flybaseGenesCleaned.tab 
# (Re)create the output with only the header row; the data rows are appended by the loop below.
printf "chr\tstartGene\tendGene\tstrand\tid\tname\n" > ${DATA}/flybaseGenesCleaned.tab

# For every line of codingGeneList.txt look up the gene record in the per-chromosome gene file.
time while read LINE;
do 
    # The unquoted echo collapses the tab separators into single spaces, hence cut -d' '.
    CHR=$(echo ${LINE} | cut -d' ' -f1)
    # Second field with any '-cds' substring removed.
    GENE=$(echo ${LINE} | cut -d' ' -f2 | sed 's/-cds//g')
    # echo $GENE

    # Keep columns 1,4,5,7,9 (chr, start, end, strand, attributes), split the attributes on ';' and keep the first six tab-separated fields.
    fgrep `echo "${GENE};"`  ${GENES}/dmelFilteredGenesChr${CHR}.gff3 | fgrep ${CHR} |  cut -f1,4,5,7,9 | tr ';' '\t' | cut -f1,2,3,4,5,6 
    
# Post-process the whole loop output: spaces to tabs, strip the 'ID=' / 'Name=' prefixes, sort by chromosome and numeric start.
done < ${DATA}/codingGeneList.txt | tr ' ' '\t' | sed 's/ID=//g' | sed 's/Name=//g' | sort -k1,1 -k2,2n >> ${DATA}/flybaseGenesCleaned.tab

In [None]:
# Load the cleaned gene table (tab-separated, first row is the header) and preview it.
dfGenes = pd.read_csv(
    DATA + '/annotations/flybaseGenesCleaned.tab',
    sep='\t',
    header=0,
)
dfGenes.head()

File to merge features annotated with gene name instead of gene id

In [None]:
# Two-column gene id / gene name lookup table, saved tab-separated with a header row.
idName = dfGenes.loc[:, ['id', 'name']]
idName.to_csv(
    DATA + '/annotations/idName.tab',
    sep='\t',
    header=True,
    index=False,
)

### CDS *D. melanogaster* degenerancy 

#### Raw cds coordinates

In [None]:
%%bash
# Build cdsCoordinates.tab: one row per transcript with the gene name, chromosome, transcript id,
# total CDS length (transcriptSize) and the comma-separated start,end pairs of its CDS segments.
DATA='/home/jmurga/mktComparison/rawData/dmel/annotations'
CDS='/home/jmurga/mktComparison/rawData/dmel/annotations/cds'
TEMPORAL='/home/jmurga/mktComparison/rawData/dmel/annotations/tmp'

# (Re)create the output with only the header row; the loop below appends the data rows.
printf "name\tchr\ttranscript\ttranscriptSize\tcoordinates\n" > ${DATA}/cdsCoordinates.tab

time tail -n+2 ${DATA}/flybaseGenesCleaned.tab | while read LINE;
do
    echo '*************'
    # The unquoted echo collapses the tab separators into single spaces, hence cut -d' '.
    CHR=$(echo ${LINE} | cut -d' ' -f1)
    GENE=$(echo ${LINE} | cut -d' ' -f6)

    # Progress trace. The values are passed as arguments so they are never interpreted as a printf format.
    printf '%s %s' "${GENE}" "${CHR}"

    # Unique transcript ids listed in the Parent attribute of this gene's CDS records.
    fgrep `echo "Name=${GENE}-cds;"` ${CDS}/dmelFilteredCdsChr${CHR}.gff3 | grep -P "${CHR}\t" | cut -f9 | tr ';' '\n' | fgrep Parent | sort -u | tr ',' '\n' | cut -d'=' -f2 | sort -u > ${TEMPORAL}/transcriptTmp.tab

    while read transcript
    do
        # First awk: start, end and inclusive segment length (end - start + 1).
        # Second awk: accumulate the segment lengths (column 3) into the transcript size while
        # printing the "start,end," pairs. The previous version summed column 1 (the start
        # coordinates), so transcriptSize did not measure the CDS length.
        # Third awk, sed and tr: reorder to name, chr, transcript, size, coordinates; drop the
        # trailing comma; convert the separators to tabs.
        fgrep ${transcript} ${CDS}/dmelFilteredCdsChr${CHR}.gff3 \
            | awk '{print $4,$5,$5-$4+1}' \
            | awk -v gene="${GENE}" -v chr="${CHR}" -v transcript="$transcript" '{sum+=$3} {printf $1","$2","} END{print "\t"gene"\t"chr"\t"transcript"\t"sum}' \
            | awk '{print $2,$3,$4,$5,$1}' \
            | sed 's/,$//' \
            | tr ' ' '\t' >> ${DATA}/cdsCoordinates.tab
    done < ${TEMPORAL}/transcriptTmp.tab

done

#### Check degenerancy by position

**Checking degeneracy by position, taking into account all transcripts and genes independently**  
Recoding CDS sequences to get 0fold, 2fold, 3fold and 4fold positions by transcript and genes, in order to estimate frequencies and divergence by type of functional sites.

In [None]:
# Gene-level columns used to annotate every transcript.
dfGenes = pd.read_csv(
    DATA + '/annotations/flybaseGenesCleaned.tab',
    sep='\t',
    header=0,
    usecols=['id', 'name', 'chr', 'strand'],
)
# Per-transcript CDS coordinates.
cds = pd.read_csv(DATA + '/annotations/cdsCoordinates.tab', sep='\t', header=0)
# Inner join on the shared (chr, name) key: transcripts without a matching gene row are dropped.
cds = cds.merge(dfGenes, how='inner', on=['chr', 'name'])

In [None]:
def foldPositions(x):
    """Classify a position from the degeneracy codes it contains.

    Parameters
    ----------
    x : container supporting ``in`` (in this notebook, the comma-joined string
        of degeneracy codes collected for one position).

    Returns
    -------
    str
        '0fold' if '0' is present; '4fold' if '4' is present without '0', '2'
        or '3'; '2fold' if '2' is present without '0', '3' or '4'; '3fold' if
        none of '0', '2' and '4' is present; 'mixed' otherwise.
    """
    # Any 0-fold observation wins over everything else.
    if '0' in x:
        return '0fold'

    # Which of the remaining codes occur, always listed in the order 2, 3, 4.
    present = [code for code in ('2', '3', '4') if code in x]

    if present == ['4']:
        return '4fold'
    if present == ['2']:
        return '2fold'
    if '2' not in present and '4' not in present:
        return '3fold'
    return 'mixed'

In [None]:
import time

# Per gene: degeneracy code of every CDS position in every valid transcript, with the
# codes of positions shared by several transcripts joined by ','.
# NOTE(review): degenerateFullPositions is not imported by the visible import cell — confirm where it is defined.
# NOTE: the output file is opened in append mode, so re-running this cell adds the rows again.

# Unique gene ids, computed once (np.unique returns them sorted).
geneIds = np.unique(cds.id)
# One open FASTA handle per chromosome instead of re-opening the file for every transcript.
fastaByChr = {}

for j, geneId in enumerate(geneIds):
    start_time = time.time()
    print(geneId)

    # Position/degeneracy tables of the valid transcripts of this gene.
    transcriptTables = []

    for index, row in cds[(cds['id'] == geneId)].iterrows():

        if row['chr'] not in fastaByChr:
            fastaByChr[row['chr']] = px.Fasta(FASTAS + '/ref/Chr' + row['chr'] + '.fasta', sequence_always_upper=True)
        chrFile = fastaByChr[row['chr']]

        # Convert the "start,end,start,end,..." string into [[start, end], ...] pairs.
        coordinates = np.array(row['coordinates'].split(',')).astype(int).tolist()
        coordinates = [coordinates[i:i+2] for i in range(0, len(coordinates), 2)]

        # Every CDS position (segment ends included), in the same order as the spliced sequence,
        # so that position i matches degeneracy code i.
        allPositions = np.asarray([pos for start, end in coordinates for pos in range(start, end + 1)])

        # Extract cds sequences; minus-strand transcripts are reverse-complemented and their
        # positions reversed to keep both aligned.
        seq = chrFile.get_spliced_seq(row['chr'], coordinates).seq
        if row['strand'] == '-':
            seq = reverseComplement(seq)
            allPositions = allPositions[::-1]

        # Only complete reading frames that start with ATG are analysed.
        if len(seq) % 3 == 0 and seq[0:3] == 'ATG':
            degen = np.asarray(list(degenerateFullPositions(seq)))
            transcriptTables.append(pd.DataFrame({'CHROM': row['chr'], 'POS': allPositions, 'degen': degen}))

    if transcriptTables:
        # Single concat + groupby: pandas >= 2.0 no longer has DataFrame.append, and grouping
        # once yields the same joined strings as re-grouping after every transcript.
        df = (
            pd.concat(transcriptTables)
            .groupby(['CHROM', 'POS'])
            .agg({'degen': ','.join})
            .reset_index()
        )
    else:
        df = pd.DataFrame()

    # Save df
    df.to_csv(DATA + '/annotations/degeneracyDmelPositions.tab', mode='a', index=False, header=False, sep='\t')
    print(j, geneId, "--- %s seconds ---" % (time.time() - start_time))

Cleaning position degeneracy based on the most constrained possibility

In [None]:
import time

# Degeneracy codes per position as written by the previous cell. 'degen' is forced to str:
# a value such as "0" would otherwise be parsed as an integer and break ','.join below.
df = pd.read_csv(
    DATA + '/annotations/degeneracyDmelPositions.tab',
    sep='\t',
    header=None,
    names=['CHROM', 'POS', 'degen'],
    dtype={'CHROM': str, 'degen': str},
)

chrList = ['2L', '2R', '3L', '3R', 'X']
outputFile = DATA + '/annotations/zeroFourFoldPositions.tab'

for n, nchr in enumerate(chrList):
    start_time = time.time()
    print(nchr)

    # Merge the codes of positions that were written by more than one gene, then classify them.
    tmp = df[df['CHROM'] == nchr]
    tmp = tmp.groupby(['CHROM', 'POS']).agg({'degen': ','.join}).reset_index()
    tmp['type'] = tmp['degen'].apply(foldPositions)
    tmp = tmp.sort_values('POS')

    # The first chromosome overwrites the file and the others append to it, so re-running
    # this cell no longer duplicates every position (duplicates multiply rows in later merges).
    tmp.to_csv(outputFile, header=False, index=False, mode='w' if n == 0 else 'a', sep='\t')
    print("--- %s seconds ---" % (time.time() - start_time))

#### Undetermined nucleotide distributions by cds position

In [None]:
# Gene table and per-transcript CDS coordinates, joined on the shared (chr, name) key.
dfGenes = pd.read_csv(DATA + '/annotations/flybaseGenesCleaned.tab', sep='\t', header=0)
cds = pd.read_csv(DATA + '/annotations/cdsCoordinates.tab', sep='\t', header=0)
cds = cds.merge(dfGenes, how='inner', on=['chr', 'name'])
# For every (chr, id) gene keep only the row holding the largest transcriptSize.
cds = cds.loc[
    cds.reset_index().groupby(['chr', 'id'])['transcriptSize'].idxmax()
].reset_index(drop=True)

Raleigh

In [None]:
# Run the project helper nDistribution on the selected transcripts with the population code 'RAL'.
# NOTE(review): presumably produces DATA + '/nCall/ncallRAL.tab', which a later cell reads — confirm in nDistribution.
nDistribution(cds,'RAL')

Zambia

In [None]:
# Run the project helper nDistribution on the selected transcripts with the population code 'ZI'.
# NOTE(review): presumably produces DATA + '/nCall/ncallZI.tab', which a later cell reads — confirm in nDistribution.
nDistribution(cds,'ZI')

Retrieve by population the number of N

In [None]:
# Per population: how often each value of the first column of the ncall file occurs,
# as an absolute count and as a percentage of all rows.
for pop in ['RAL', 'ZI']:
    print(pop)

    df = pd.read_csv(DATA + '/nCall/ncall' + pop + '.tab', sep='\t', header=None)

    # Occurrences of every distinct value, ordered by that value. The columns are built by
    # name because the labels produced by value_counts().reset_index() changed in pandas 2.0.
    counts = df[0].value_counts().sort_index()
    nCalls = pd.DataFrame({'positions': counts.index, 'count': counts.values})

    # Vectorised percentage; the total is computed once instead of once per row.
    nCalls['freq'] = nCalls['count'] / nCalls['count'].sum() * 100

    nCalls.to_csv(DATA + '/nCall/' + pop + 'DistributionN.tab', sep='\t', header=True, index=False)

### Extracting Derived Allele Frequency and Divergence by population and type of site

Opening cdsCoordinates file to extract fasta sequence using pyfaidx

In [1]:
# Show the command-line options of the sfsDivergence.py helper script.
!python /home/jmurga/mktComparison/scripts/src/sfsDivergence.py --help

usage: sfsDivergence.py [-h] --genes GENES --cds CDS --outgroup {dsim,dyak}
                        --population POPULATION --sampling SAMPLING
                        [--singleton SINGLETON] [--seed SEED] [--path PATH]

Extract alleles frequencies from multi-FASTA aligment

optional arguments:
  -h, --help            show this help message and exit
  --genes GENES         File basic gene information.
  --cds CDS             File all transcript coordinates by genes.
  --outgroup {dsim,dyak}
                        Select outgroup to compute diverenge and derived
                        allele frequency
  --population POPULATION
                        Select population to extract
  --sampling SAMPLING   Resampling size
  --singleton SINGLETON
                        Resampling size
  --seed SEED           Input seed
  --path PATH           Path to output file


#### *D. simulans*

##### Raleigh population

In [5]:
# Outgroup dsim, population RAL, resampling size 160.
!python /home/jmurga/mktComparison/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population RAL --sampling 160 --outgroup dsim

0 FBgn0000018
--- 0.1572861671447754 seconds ---
1 FBgn0000052
--- 0.6151106357574463 seconds ---
2 FBgn0000053
--- 0.568856954574585 seconds ---
3 FBgn0000055
--- 0.11774158477783203 seconds ---
4 FBgn0000056
--- 0.08265113830566406 seconds ---
5 FBgn0000061
--- 0.12929248809814453 seconds ---
6 FBgn0000075
--- 0.1530139446258545 seconds ---
7 FBgn0000097
--- 0.2642357349395752 seconds ---
8 FBgn0000114
--- 0.38005852699279785 seconds ---
9 FBgn0000120
--- 0.11574554443359375 seconds ---
10 FBgn0000146
--- 0.2838728427886963 seconds ---
11 FBgn0000153
--- 0.1543560028076172 seconds ---
12 FBgn0000180
--- 0.32988953590393066 seconds ---
13 FBgn0000182
--- 0.3536031246185303 seconds ---
14 FBgn0000183
--- 0.36509060859680176 seconds ---
15 FBgn0000227
--- 0.16064977645874023 seconds ---
16 FBgn0000228
--- 0.50632643699646 seconds ---
17 FBgn0000229
--- 0.16573643684387207 seconds ---
18 FBgn0000239
19 FBgn0000250
--- 0.3312554359436035 seconds ---
20 FBgn0000251
--- 0.24735641479492188 

##### Zambia population

In [None]:
# Outgroup dsim, population ZI, resampling size 154.
!python /home/jmurga/mktComparison/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population ZI --sampling 154 --outgroup dsim

#### *D. yakuba*

##### Raleigh population

In [None]:
# Outgroup dyak, population RAL, resampling size 160.
!python /home/jmurga/mktComparison/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population RAL --sampling 160 --outgroup dyak

##### Zambia population

In [None]:
# Outgroup dyak, population ZI, resampling size 154.
!python /home/jmurga/mktComparison/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population ZI --sampling 154 --outgroup dyak

### Estimating metrics

#### Extracting mi and m0

In [None]:
# Gene-level columns used to annotate the selected transcripts.
dfGenes = pd.read_csv(
    '/home/jmurga/mktComparison/rawData/dmel/annotations/flybaseGenesCleaned.tab',
    sep='\t',
    header=0,
    usecols=['id', 'name', 'chr', 'strand'],
)
# NOTE(review): no cell of this notebook writes cdsLargest.tab — confirm how this file is produced.
cds = pd.read_csv(
    '/home/jmurga/mktComparison/rawData/dmel/annotations/cdsLargest.tab',
    sep='\t',
    header=0,
)
# Inner join on the shared (chr, name) key.
cds = cds.merge(dfGenes, how='inner', on=['chr', 'name'])
# Per-position site classification; the file has no header row, so the column names are given here.
degeneratePositions = pd.read_csv(
    '/home/jmurga/mktComparison/rawData/dmel/annotations/zeroFourFoldPositions.tab',
    sep='\t',
    header=None,
    names=['CHROM', 'POS', 'degen', 'type'],
)

In [None]:
# Count, for the transcript of every gene in `cds`, the reference sites coded '0' (mi) and '4' (m0).
# The result file is rewritten on each execution.
#
# Fixes relative to the previous version of this cell:
#   * `Fasta` and `array` were undefined names; the notebook imports them as `px.Fasta` and `np.array`.
#   * degeneratePositions has the columns CHROM/POS/degen/type, so filtering or merging on 'chr' and
#     reading 'newType' raised KeyError. 'CHROM' is renamed to 'chr' locally and 'type' is used.
#     NOTE(review): 'type' is assumed to be what 'newType' referred to — confirm.
#   * The table is written once, with a header, to DATA + '/refAnalyzableSites.tab', which is the
#     location and layout (header=0) that the later reader cells expect. It used to be appended
#     without a header to '/home/jmurga/mktComparison/rawData/refAnalyzableSites.tab'.
columns = ['id', 'mi', 'm0']
records = []

for j in cds['chr'].unique():
    print(j)
    chrPositions = degeneratePositions[degeneratePositions['CHROM'] == j].rename(columns={'CHROM': 'chr'})
    # One FASTA handle per chromosome instead of one per transcript.
    chrFile = px.Fasta(FASTAS + '/ref/Chr' + j + '.fasta')

    for index, row in cds[cds['chr'] == j].iterrows():
        print(index, row['id'])

        # Convert the "start,end,start,end,..." string into [[start, end], ...] pairs.
        coordinates = np.array(row['coordinates'].split(',')).astype(int).tolist()
        coordinates = [coordinates[i:i+2] for i in range(0, len(coordinates), 2)]

        # Every CDS position (segment ends included), in the same order as the spliced sequence.
        allPositions = [pos for start, end in coordinates for pos in range(start, end + 1)]

        # Extract cds sequences; minus-strand transcripts are reverse-complemented and their
        # positions reversed to keep both aligned.
        seq = chrFile.get_spliced_seq(row['chr'].replace('chr', ''), coordinates).seq.upper()
        if row['strand'] == '-':
            seq = reverseComplement(seq)
            allPositions = allPositions[::-1]

        # Transcripts that are not a complete reading frame starting with ATG get mi = m0 = 0.
        mi = 0
        m0 = 0
        if len(seq) % 3 == 0 and seq[0:3] == 'ATG':
            m = degenerate(seq)
            tmp = pd.DataFrame({'POS': allPositions, 'm': list(m)})
            tmp['chr'] = row['chr']
            tmp = pd.merge(chrPositions, tmp, on=['chr', 'POS'], how='right')
            # The all-transcript classification overrides the per-transcript code when it is 0fold or 4fold.
            tmp['m'] = tmp.apply(lambda x: '0' if x['type'] == '0fold' else '4' if x['type'] == '4fold' else x['m'], axis=1)
            counts = tmp['m'].value_counts()
            mi = counts.get('0', 0)
            # NOTE(review): m0 stays 0 when fewer than four distinct codes are present, even if
            # '4' is one of them. This is kept from the original condition — confirm it is intended.
            if not (counts.shape[0] < 4 or '4' not in counts.index):
                m0 = counts['4']

        records.append({'id': row['id'], 'mi': mi, 'm0': m0})

totalFoldPositions = pd.DataFrame(records, columns=columns)
totalFoldPositions.to_csv(DATA + '/refAnalyzableSites.tab', sep='\t', header=True, index=False)

####  *D. simulans*

##### Extract by population Derived Allele Frequency and Divergence in a file to format by functional class and populations

In [None]:
# Per-position site classification; the file has no header row, so the column names are given here.
degeneratePositions = pd.read_csv(
    '/home/jmurga/mktComparison/rawData/dmel/annotations/zeroFourFoldPositions.tab',
    sep='\t',
    header=None,
    names=['CHROM', 'POS', 'degen', 'type'],
)

In [None]:
# Concatenate the per-population site files into a single headerless file.
pops = ['RAL', 'ZI']

for n, p in enumerate(pops):
    print(p)
    popSites = pd.read_csv(
        DATA + '/alleleFrequencies/dsim/' + 'dsimDmelSites' + p + '.tab',
        sep='\t',
        header=None,
        names=['id', 'chr', 'POS', 'div', 'rawDerivedAllele', 'type', 'pop'],
    )

    # The first population overwrites the combined file and the others append to it, so
    # re-running this cell does not duplicate every site (duplicates would inflate the counts).
    popSites.to_csv(
        DATA + '/alleleFrequencies/dsim/dsimDmelSites.tab',
        sep='\t',
        index=False,
        header=False,
        mode='w' if n == 0 else 'a',
    )

In [None]:
# Combined per-site table of both populations (headerless file, so the columns are named here).
dmelSites = pd.read_csv(
    DATA+ '/alleleFrequencies/dsim/dsimDmelSites.tab',
    header=None,
    sep='\t',
)
dmelSites.columns = ['id', 'CHROM', 'POS', 'div', 'rawDerivedAllele', 'type', 'pop']
# Drop the file's own 'type'; the next cell attaches the 'type' of degeneratePositions instead.
dmelSites = dmelSites.drop(columns='type')
dmelSites.head()

In [None]:
# Attach the per-position classification; sites without a match keep NaN in 'degen' and 'type'.
dmelSites = dmelSites.merge(degeneratePositions, how='left', on=['CHROM', 'POS'])
# Discard only the sites classified as 'mixed'.
dmelSites = dmelSites.loc[dmelSites['type'] != 'mixed']
dmelSites.head()

##### Divergence

In [None]:
# Divergence per gene and population, split by site class: di from 0fold sites, d0 from 4fold sites.
div = dmelSites.groupby(['id', 'type', 'pop'])['div'].sum().reset_index()
div = (
    div.pivot_table(index=['id', 'pop'], columns=['type'], values='div')
    # Keep exactly the two classes that are renamed below, in a fixed order. Without this, any
    # other surviving class (e.g. 2fold or 3fold) makes the positional rename raise a
    # length-mismatch error, and a missing class would silently shift the labels.
    .reindex(columns=['0fold', '4fold'])
    .reset_index()
)
div.columns = ['id', 'pop', 'di', 'd0']

In [None]:
div.head()

##### Derived Allele Frequency

In [None]:
# Build, per gene and population, the binned derived-allele-frequency spectrum of each site class.

# Keep only the sites whose derived allele frequency is not 0.
# NOTE(review): `daf` is a filtered selection of dmelSites, so the column assignment below may trigger pandas' SettingWithCopyWarning.
daf = dmelSites[['id','rawDerivedAllele','type','pop']][dmelSites['rawDerivedAllele']!=0]

# Bin edges start at 0 and advance in steps of 0.05; `labels` supplies one label per bin
# (presumably the upper edge of each bin — TODO confirm against the arange outputs).
bins = np.arange(0,1.05,0.05)
labels =  np.arange(0.05,1.05,0.05)

# Assign every site to its frequency bin.
daf['categories'] = pd.cut(daf['rawDerivedAllele'],bins=bins,labels=labels)

# Count the sites per gene, site class, frequency bin and population.
sfs = daf.groupby(['id','type','categories','pop']).count().reset_index()
# Replace missing counts with 0 and store the counts as integers.
sfs['rawDerivedAllele'] = sfs['rawDerivedAllele'].fillna(0).astype(int)
# Collapse the per-bin counts into one list per gene, population and site class.
sfs = sfs.groupby(['id','pop','type'])['rawDerivedAllele'].apply(list).reset_index()

# One row per gene and population, one column of count lists per site class.
sfs = sfs.pivot_table(index=['id','pop'], columns=['type'],values='rawDerivedAllele',aggfunc=lambda x:x).reset_index()
# Totals over all bins: p0 from the 4fold list, pi from the 0fold list.
sfs['p0'] = sfs['4fold'].apply(lambda x: sum(x))
sfs['pi'] = sfs['0fold'].apply(lambda x: sum(x))

# Serialise each list of counts as a ';'-separated string.
sfs['0fold'] = sfs['0fold'].apply(lambda x:';'.join(map(str,x)))
sfs['4fold'] = sfs['4fold'].apply(lambda x:';'.join(map(str,x)))

# NOTE(review): positional rename — it assumes the columns are exactly id, pop, 0fold, 4fold, p0, pi at this point; confirm no other site class is present.
sfs.columns = ['id','pop','daf0f','daf4f','p0','pi']

In [None]:
sfs.head()

##### Merging info

In [None]:
# Outer join: keep every (id, pop) pair that has either a spectrum or a divergence row.
mktData = sfs.merge(div, how='outer', on=['id', 'pop'])

In [None]:
mktData.head()

In [None]:
# Replicate the gene table once per population, tagging every copy with its population code.
dfGenes = pd.read_csv(DATA + '/annotations/flybaseGenesCleaned.tab', sep='\t', header=0)
pops = ['RAL', 'ZI']

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. assign() returns a
# tagged copy, so dfGenes itself is no longer modified by this cell.
genesByPop = pd.concat([dfGenes.assign(pop=p) for p in pops], ignore_index=True)

In [None]:
# Replicate the per-gene analysable-site counts once per population.
pops = ['RAL', 'ZI']
# Read with header=0: the file must start with a header row naming the 'id' column used by the next merge.
refAnalizableSites = pd.read_csv(DATA+'/refAnalyzableSites.tab', sep='\t', header=0)

popTables = []
for p in pops:
    print(p)
    # assign() returns a tagged copy; refAnalizableSites itself is left unchanged.
    popTables.append(refAnalizableSites.assign(pop=p))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
totalFoldPositionsByPop = pd.concat(popTables, ignore_index=True)

In [None]:
# Outer join of the gene annotation with the analysable-site counts on (id, pop).
genesByPop = genesByPop.merge(totalFoldPositionsByPop, how='outer', on=['id', 'pop'])

In [None]:
# Add the gene annotation and site counts; genes without polymorphism or divergence rows are kept.
mktData = pd.merge(mktData, genesByPop, on=['id', 'pop'], how='outer')

# Spectrum used for genes without any counted site.
emptySpectrum = '0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0'
# The former `mktData[mask].loc[:, col] = ...` assignments were removed: they wrote into a
# temporary copy and never changed mktData. fillna below is what actually fills the gaps.
mktData['daf0f'] = mktData['daf0f'].fillna(emptySpectrum)
mktData['daf4f'] = mktData['daf4f'].fillna(emptySpectrum)
# Every remaining missing value becomes 0.
mktData = mktData.fillna(0)

In [None]:
# Per-gene recombination table; its two columns are renamed to id / recomb.
dfRecomb = pd.read_csv(DATA + '/genesRecombination.tab', sep='\t')
dfRecomb.columns = ['id', 'recomb']
# Default (inner) join: genes absent from the recombination table are dropped.
mktData = mktData.merge(dfRecomb, on=['id'])

In [None]:
# Keep only the output columns, in their final order.
mktData = mktData.loc[
    :,
    ['id', 'pop', 'daf0f', 'daf4f', 'p0', 'pi', 'di', 'd0', 'chr', 'mi', 'm0', 'recomb'],
]

In [None]:
mktData.head()

In [None]:
# Save the D. simulans table: tab-separated, with a header row and without the index.
mktData.to_csv(
    '/home/jmurga/mktComparison/results/dsimDmelData.tab',
    sep='\t',
    index=False,
    header=True,
)

#### *D. yakuba*

##### Extract by population Derived Allele Frequency and Divergence in a file to format by functional class and populations

In [None]:
# Per-position site classification; the file has no header row, so the column names are given here.
degeneratePositions = pd.read_csv(
    '/home/jmurga/mktComparison/rawData/dmel/annotations/zeroFourFoldPositions.tab',
    sep='\t',
    header=None,
    names=['CHROM', 'POS', 'degen', 'type'],
)

In [None]:
# Concatenate the per-population site files into a single headerless file.
pops = ['RAL', 'ZI']

for n, p in enumerate(pops):
    print(p)
    popSites = pd.read_csv(
        DATA + '/alleleFrequencies/dyak/' + 'dyakDmelSites' + p + '.tab',
        sep='\t',
        header=None,
        names=['id', 'chr', 'POS', 'div', 'rawDerivedAllele', 'type', 'pop'],
    )

    # The first population overwrites the combined file and the others append to it, so
    # re-running this cell does not duplicate every site (duplicates would inflate the counts).
    popSites.to_csv(
        DATA + '/alleleFrequencies/dyak/dyakDmelSites.tab',
        sep='\t',
        index=False,
        header=False,
        mode='w' if n == 0 else 'a',
    )

In [None]:
# Combined per-site table of both populations (headerless file, so the columns are named here).
dmelSites = pd.read_csv(
    DATA+ '/alleleFrequencies/dyak/dyakDmelSites.tab',
    header=None,
    sep='\t',
)
dmelSites.columns = ['id', 'CHROM', 'POS', 'div', 'rawDerivedAllele', 'type', 'pop']
# Drop the file's own 'type'; the next cell attaches the 'type' of degeneratePositions instead.
dmelSites = dmelSites.drop(columns='type')
dmelSites.head()

In [None]:
# Attach the per-position classification; sites without a match keep NaN in 'degen' and 'type'.
dmelSites = dmelSites.merge(degeneratePositions, how='left', on=['CHROM', 'POS'])
# Discard only the sites classified as 'mixed'.
dmelSites = dmelSites.loc[dmelSites['type'] != 'mixed']
dmelSites.head()

##### Divergence

In [None]:
# Divergence per gene and population, split by site class: di from 0fold sites, d0 from 4fold sites.
div = dmelSites.groupby(['id', 'type', 'pop'])['div'].sum().reset_index()
div = (
    div.pivot_table(index=['id', 'pop'], columns=['type'], values='div')
    # Keep exactly the two classes that are renamed below, in a fixed order. Without this, any
    # other surviving class (e.g. 2fold or 3fold) makes the positional rename raise a
    # length-mismatch error, and a missing class would silently shift the labels.
    .reindex(columns=['0fold', '4fold'])
    .reset_index()
)
div.columns = ['id', 'pop', 'di', 'd0']

In [None]:
div.head()

##### Derived Allele Frequency

In [None]:
# Build, per gene and population, the binned derived-allele-frequency spectrum of each site class.

# Keep only the sites whose derived allele frequency is not 0.
# NOTE(review): `daf` is a filtered selection of dmelSites, so the column assignment below may trigger pandas' SettingWithCopyWarning.
daf = dmelSites[['id','rawDerivedAllele','type','pop']][dmelSites['rawDerivedAllele']!=0]

# Bin edges start at 0 and advance in steps of 0.05; `labels` supplies one label per bin
# (presumably the upper edge of each bin — TODO confirm against the arange outputs).
bins = np.arange(0,1.05,0.05)
labels =  np.arange(0.05,1.05,0.05)

# Assign every site to its frequency bin.
daf['categories'] = pd.cut(daf['rawDerivedAllele'],bins=bins,labels=labels)

# Count the sites per gene, site class, frequency bin and population.
sfs = daf.groupby(['id','type','categories','pop']).count().reset_index()
# Replace missing counts with 0 and store the counts as integers.
sfs['rawDerivedAllele'] = sfs['rawDerivedAllele'].fillna(0).astype(int)
# Collapse the per-bin counts into one list per gene, population and site class.
sfs = sfs.groupby(['id','pop','type'])['rawDerivedAllele'].apply(list).reset_index()

# One row per gene and population, one column of count lists per site class.
sfs = sfs.pivot_table(index=['id','pop'], columns=['type'],values='rawDerivedAllele',aggfunc=lambda x:x).reset_index()
# Totals over all bins: p0 from the 4fold list, pi from the 0fold list.
sfs['p0'] = sfs['4fold'].apply(lambda x: sum(x))
sfs['pi'] = sfs['0fold'].apply(lambda x: sum(x))

# Serialise each list of counts as a ';'-separated string.
sfs['0fold'] = sfs['0fold'].apply(lambda x:';'.join(map(str,x)))
sfs['4fold'] = sfs['4fold'].apply(lambda x:';'.join(map(str,x)))

# NOTE(review): positional rename — it assumes the columns are exactly id, pop, 0fold, 4fold, p0, pi at this point; confirm no other site class is present.
sfs.columns = ['id','pop','daf0f','daf4f','p0','pi']

In [None]:
sfs.head()

##### Merging info

In [None]:
# Outer join: keep every (id, pop) pair that has either a spectrum or a divergence row.
mktData = sfs.merge(div, how='outer', on=['id', 'pop'])

In [None]:
mktData.head()

In [None]:
# Replicate the gene table once per population, tagging every copy with its population code.
dfGenes = pd.read_csv(DATA + '/annotations/flybaseGenesCleaned.tab', sep='\t', header=0)
pops = ['RAL', 'ZI']

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. assign() returns a
# tagged copy, so dfGenes itself is no longer modified by this cell.
genesByPop = pd.concat([dfGenes.assign(pop=p) for p in pops], ignore_index=True)

In [None]:
# Replicate the per-gene analysable-site counts once per population.
pops = ['RAL', 'ZI']
# Read with header=0: the file must start with a header row naming the 'id' column used by the next merge.
refAnalizableSites = pd.read_csv(DATA+'/refAnalyzableSites.tab', sep='\t', header=0)

popTables = []
for p in pops:
    print(p)
    # assign() returns a tagged copy; refAnalizableSites itself is left unchanged.
    popTables.append(refAnalizableSites.assign(pop=p))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
totalFoldPositionsByPop = pd.concat(popTables, ignore_index=True)

In [None]:
# Outer join of the gene annotation with the analysable-site counts on (id, pop).
genesByPop = genesByPop.merge(totalFoldPositionsByPop, how='outer', on=['id', 'pop'])

In [None]:
# Add the gene annotation and site counts; genes without polymorphism or divergence rows are kept.
mktData = pd.merge(mktData, genesByPop, on=['id', 'pop'], how='outer')

# Spectrum used for genes without any counted site.
emptySpectrum = '0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0'
# The former `mktData[mask].loc[:, col] = ...` assignments were removed: they wrote into a
# temporary copy and never changed mktData. fillna below is what actually fills the gaps.
mktData['daf0f'] = mktData['daf0f'].fillna(emptySpectrum)
mktData['daf4f'] = mktData['daf4f'].fillna(emptySpectrum)
# Every remaining missing value becomes 0.
mktData = mktData.fillna(0)

In [None]:
# Per-gene recombination table; its two columns are renamed to id / recomb.
dfRecomb = pd.read_csv(DATA + '/genesRecombination.tab', sep='\t')
dfRecomb.columns = ['id', 'recomb']
# Default (inner) join: genes absent from the recombination table are dropped.
mktData = mktData.merge(dfRecomb, on=['id'])

In [None]:
# Keep only the output columns, in their final order.
mktData = mktData.loc[
    :,
    ['id', 'pop', 'daf0f', 'daf4f', 'p0', 'pi', 'di', 'd0', 'chr', 'mi', 'm0', 'recomb'],
]

In [None]:
mktData.head()

In [None]:
# Total d0 divided by total m0 over the rows of the RAL population.
mktData.loc[mktData['pop'] == 'RAL', 'd0'].sum() / mktData.loc[mktData['pop'] == 'RAL', 'm0'].sum()

In [None]:
# Save the D. yakuba table: tab-separated, with a header row and without the index.
mktData.to_csv(
    '/home/jmurga/mktComparison/results/dyakDmelData.tab',
    sep='\t',
    index=False,
    header=True,
)