Defining global path to python. In this case these variables will be stored after one execution

In [4]:
SRC='/home/jmurga/mkt/201903/scripts/src'
DATA='/home/jmurga/mkt/201903/rawData/dmel'
FASTAS='/data/shared/dgn'

Required libraries

In [3]:
import os
import re
import sys
import numpy as np
import pandas as pd
import pyfaidx as px

Ad-hoc scripts

In [5]:
sys.path.insert(0, SRC)
from reverseComplement import reverseComplement
from degenerancy import degenerate

### Drosophila melanogaster genes 

To execute bash code we created a snippets through nbextensions containing the following paths in order to avoid copy and paste cells. %%bash magic do not recognize previous variables. Adding mkdir command to create necesary paths

In [None]:
%%bash
DATA='/home/jmurga/mkt/201903/rawData/dmel/annotations'
BASIC='/home/jmurga/mkt/201903/rawData/dmel/annotations/basicAnnotation'
CDS='/home/jmurga/mkt/201903/rawData/dmel/annotations/cds'
GENES='/home/jmurga/mkt/201903/rawData/dmel/annotations/genes'
TEMPORAL='/home/jmurga/mkt/201903/rawData/dmel/annotations/tmp'
ALLELEFREQ='/home/jmurga/mkt/201903/rawData/dmel/alleleFrequencies'

mkdir -p ${DATA}
mkdir -p ${BASIC}
mkdir -p ${CDS}
mkdir -p ${GENES}
mkdir -p ${TEMPORAL}
mkdir -p ${ALLELEFREQ}

#### Download Flybase annotation

In [None]:
%%bash
DATA='/home/jmurga/mkt/201903/rawData/dmel/annotations'

cd ${DATA}
# Flybase annotation
wget ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r5.57_FB2014_03/gff/dmel-all-no-analysis-r5.57.gff.gz
gunzip dmel-all-no-analysis-r5.57.gff.gz

#### Parsing and cleaning flybase annotation to execute all operations by chr
Operate by chr is faster due to grep on smaller files. Each folder contain an specific file foreach chromosome. 

In [None]:
%%bash
DATA='/home/jmurga/mkt/201903/rawData/dmel/annotations'
BASIC='/home/jmurga/mkt/201903/rawData/dmel/annotations/basicAnnotation'
CDS='/home/jmurga/mkt/201903/rawData/dmel/annotations/cds'
GENES='/home/jmurga/mkt/201903/rawData/dmel/annotations/genes'
TEMPORAL='/home/jmurga/mkt/201903/rawData/dmel/annotations/tmp'

mkdir -p ${BASIC}
mkdir -p ${CDS}
mkdir -p ${GENES}
mkdir -p ${TEMPORAL}

# Extract annotations from gff file
sed -e '/^>/,$d' ${DATA}/dmel-all-no-analysis-r5.57.gff | sed -e '/^2LHet/d' -e  '/2RHet/d' -e '/3LHet/d' -e '/3RHet/d' -e  '/^4\t/d' -e '/dmel_mitochondrion_genome/d' -e '/^U\t/d' -e '/^Uextra\t/d' -e  '/^XHet\t/d' -e '/^YHet\t/d' -e'/\tCG/d' > ${DATA}/dmelFiltered.gff
# Extract genes information
grep -P "\tgene\t" ${DATA}/dmelFiltered.gff  > ${DATA}/dmelFilteredGenes.gff
# Extract CDS information. Only protein coding genes
grep -P "\tCDS\t" ${DATA}/dmelFiltered.gff  > ${DATA}/dmelFilteredCds.gff
# Coding gene list. gene_id always on column 9, 3th field
cut -f1,9 ${DATA}/dmelFilteredCds.gff | tr ';' '\t' | cut -f1,2 | sort -u | sort -k1,1 > ${DATA}/codingGeneList.txt


CHR=( 2L 2R 3L 3R X )
# Parse gff file by chr
for nchr in "${CHR[@]}"
do
    echo ${nchr}
    grep -P "${nchr}\t"  ${DATA}/dmelFiltered.gff | sort -k4,4n > ${BASIC}/dmelFiltered${nchr}.gff3 
    grep -P "${nchr}\t"  ${DATA}/dmelFilteredCds.gff | sort -k1,1 -k4,4n > ${CDS}/dmelFilteredCdsChr${nchr}.gff3 
    grep -P "${nchr}\t"  ${DATA}/dmelFilteredGenes.gff | sort -k1,1 -k4,4n > ${GENES}/dmelFilteredGenesChr${nchr}.gff3 
done

rm ${DATA}/dmel-all-no-analysis-r5.57.gff

#### Basic cleaned gene file
This file will include information about chromosomes, start coordinates, end coordinates strand, gene id and gene name. It will be and perform calculations on gene coordinates. Kind of gff file easier to work with

In [None]:
%%bash
DATA='/home/jmurga/mkt/201903/rawData/dmel/annotations'
GENES='/home/jmurga/mkt/201903/rawData/dmel/annotations/genes'
touch ${DATA}/flybaseGenesCleaned.tab 
printf "chr\tstart\tend\tstrand\tid\tname\n" > ${DATA}/flybaseGenesCleaned.tab

time while read LINE;
do 
    CHR=$(echo ${LINE} | cut -d' ' -f1)
    GENE=$(echo ${LINE} | cut -d' ' -f2 | sed 's/-cds//g')
    # echo $GENE

    fgrep `echo "${GENE};"`  ${GENES}/dmelFilteredGenesChr${CHR}.gff3 | fgrep ${CHR} |  cut -f1,4,5,7,9 | tr ';' '\t' | cut -f1,2,3,4,5,6 
    
done < ${DATA}/codingGeneList.txt | tr ' ' '\t' | sed 's/ID=//g' | sed 's/Name=//g' | sort -k1,1 -k2,2n >> ${DATA}/flybaseGenesCleaned.tab

In [None]:
dfGenes = pd.read_csv(DATA + '/annotations/flybaseGenesCleaned.tab',header = 0,sep='\t')
dfGenes.head()

File to merge features annotated with gene name instead of gene id

In [None]:
idName = dfGenes[['id','name']]
idName.to_csv(DATA + '/annotations/idName.tab',sep='\t',index=False,header=True)

### CDS *D. melanogaster* degenerancy 

#### Raw cds coordinates

In [None]:
%%bash
DATA='/home/jmurga/mkt/201903/rawData/dmel/annotations'
CDS='/home/jmurga/mkt/201903/rawData/dmel/annotations/cds'
TEMPORAL='/home/jmurga/mkt/201903/rawData/dmel/annotations/tmp'

touch ${DATA}/cdsCoordinates.tab
printf "name\tchr\ttranscript\ttranscriptSize\tcoordinates\n" > ${DATA}/cdsCoordinates.tab

count=0
time tail -n+2 ${DATA}/flybaseGenesCleaned.tab | while read LINE;
do 
    echo '*************'    
    CHR=$(echo ${LINE} | cut -d' ' -f1)
    GENE=$(echo ${LINE} | cut -d' ' -f6)
    
    printf "${GENE} ${CHR}"
    
    fgrep `echo "Name=${GENE}-cds;"` ${CDS}/dmelFilteredCdsChr${CHR}.gff3 | grep -P "${CHR}\t" | cut -f9 | tr ';' '\n' | fgrep Parent | sort -u | tr ',' '\n' | cut -d'=' -f2 | sort -u > ${TEMPORAL}/transcriptTmp.tab

    while read transcript; do fgrep ${transcript} ${CDS}/dmelFilteredCdsChr${CHR}.gff3 | awk '{print $4,$5,$5-$4}' | awk -v gene="${GENE}" -v chr="${CHR}" -v transcript="$transcript" '{sum+=$1} {printf $1","$2","} END{print "\t"gene"\t"chr"\t"transcript"\t"sum}' | awk '{print $2,$3,$4,$5,$1}' | sed 's/,$//' |tr ' ' '\t' >> ${DATA}/cdsCoordinates.tab ;done < ${TEMPORAL}/transcriptTmp.tab 
    
    (( count++ ))

done

#### Check degenerancy by position

**Checking degenerancy by positions taking into account all transcripts and genes independently**  
Recoding CDS sequences to get 0fold, 2fold, 3fold and 4fold positions by transcript and genes, in order to estimate frequencies and divergence by type of functional sites.

In [None]:
sys.path.insert(0, SRC)
from reverseComplement import reverseComplement
from degenerancy import degenerate

In [None]:
dfGenes = pd.read_csv(DATA + '/annotations/flybaseGenesCleaned.tab',header = 0,usecols=['id','name','chr','strand'],sep='\t')
cds = pd.read_csv(DATA + '/annotations/cdsCoordinates.tab',header=0,sep='\t')
cds = pd.merge(cds, dfGenes,  how='inner', left_on=['chr','name'], right_on = ['chr','name'])

In [None]:
def foldPositions(x):
    if('0' in x):
        return('0fold')
    elif('4' in x and '0' not in x and '2' not in x and '3' not in x):
        return('4fold')
    elif('2' in x and '0' not in x and '4' not in x and '3' not in x):
        return('2fold')
    elif('2' not in x and '0' not in x and '4' not in x) :
        return('3fold')
    else:
        return('mixed')

In [None]:
import time
# for index, row in cds.iterrows():
df = pd.DataFrame()
# for geneId in np.unique(cds[(cds['id']=='FBgn0000028')].id):
for j in range(0,len(np.unique(cds.id))):
    start_time = time.time()
   
    geneId = np.unique(cds.id)[j]
#     geneId = geneId
    df = pd.DataFrame()
    print(geneId)
    
    for index, row in cds[(cds['id']==geneId)].iterrows():
        
        # Rewrite file each execution
        degen = []
        
        chrFile = px.Fasta(FASTAS + '/ref/Chr' + row['chr'] +'.fasta',sequence_always_upper=True)

        # Convert CDS list into numeric array
        coordinates = np.array(row['coordinates'].split(',')).astype(int).tolist()
        coordinates =  [coordinates[i:i+2] for i in range(0, len(coordinates), 2)]

        # Extract all CDS positions in a list in order to merge with degenerate sequences (same length -> same index)
        positions=[]
        for i in range(0,len(coordinates),1):
            positions.append(list(range(coordinates[i][0],coordinates[i][1]+1)))  
        allPositions = [item for sublist in positions for item in sublist]
        allPositions = np.asarray(allPositions)

        # Extract cds sequences
        seq = chrFile.get_spliced_seq(row['chr'], coordinates).seq
        if(row['strand'] == '-'):
            seq = reverseComplement(seq)
            allPositions = allPositions[::-1]
        if((len(seq)/3).is_integer() and seq[0:3]=='ATG'):
            # Append in list 
            m = degenerateFullPositions(seq)
            degen.append(list(m))
            nchr = np.array([row['chr']] * len(m))
            
            degen = [item for sublist in degen for item in sublist]
            degen = np.asarray(degen)

            data =  {'CHROM':nchr,'POS':allPositions,'degen':degen}
            tmp = pd.DataFrame(data)

            df = df.append(tmp)
            df = df.groupby(['CHROM','POS']).agg({'degen':','.join}).reset_index()

    #Save df
    df.to_csv(DATA + '/annotations/degeneracyDmelPositions.tab',mode='a',index=False,header=False,sep='\t')
    print(j,geneId,"--- %s seconds ---" % (time.time() - start_time))

Cleaning positions degenerancy based on most constrain posibility

In [None]:
df = pd.read_csv(DATA + '/annotations/degeneracyDmelPositions.tab',sep='\t',header=None,names=['CHROM','POS','degen'])
    
chrList = ['2L','2R','3L','3R','X']
import time
for nchr in chrList:
    start_time = time.time()
    print(nchr)
    tmp = df[df['CHROM'] == nchr]
    tmp = tmp.groupby(['CHROM','POS']).agg({'degen':','.join}).reset_index()
    tmp['type'] = np.nan

    tmp['type'] = tmp['degen'].apply(lambda row: foldPositions(row))
    tmp = tmp.sort_values('POS')
    
    tmp.to_csv(DATA + '/annotations/zeroFourFoldPositions.tab',header=False,index=False,mode='a',sep='\t')
    print("--- %s seconds ---" % (time.time() - start_time))

#### Undetermined nucleotide distributions by cds position

In [None]:
dfGenes = pd.read_csv(DATA + '/annotations/flybaseGenesCleaned.tab',header = 0,sep='\t')
cds = pd.read_csv(DATA + '/annotations/cdsCoordinates.tab',header = 0,sep='\t')
cds = pd.merge(cds, dfGenes,  how='inner', left_on=['chr','name'], right_on = ['chr','name'])
cds = cds.loc[cds.reset_index().groupby(['chr','id'])['transcriptSize'].idxmax()].reset_index(drop=True)

In [None]:
nCallsByPositions = []
for index, row in cds.iterrows():    
# for index, row in cds.iterrows():
    print(row['id'])
    # Convert CDS list into numeric array
    coordinates = array(row['coordinates'].split(',')).astype(int).tolist()
    coordinates =  [coordinates[i:i+2] for i in range(0, len(coordinates), 2)]
    # Open ref and outgroup
    ref = Fasta(DATA + '/fastas/ref/Chr' + row['chr'] +'.fasta')  
    ## Extract ref and outgroup seq
    refSeq = ref.get_spliced_seq(row['chr'],coordinates).seq.upper()    
#     refSeq = ref.get_seq(row['chr'],row['startGene'],row['endGene']).seq.upper()    
#     if(('N' not in refSeq) and (len(refSeq)/3).is_integer()):
    if('N' not in refSeq):
        # Open population multifasta
        popFasta = Fasta(DATA + '/fastas/alignments/' + 'ZI_' + 'Chr' + row['chr'] +'.seq')
        #Extract samples
        samples = list(popFasta.keys())
        matrixDna = np.empty([len(samples)+1,len(refSeq)],dtype='str')
        if(row['strand'] == '-'):            
            refSeq = reverseComplement(refSeq)
            matrixDna[0] = list(refSeq)
            for i in range(0,len(samples),1):
                tmp = popFasta.get_spliced_seq(samples[i], coordinates).seq.upper()
#                 tmp = popFasta.get_seq(samples[i], row['startGene'],row['endGene']).seq.upper()
                tmp = reverseComplement(tmp)
                matrixDna[i+1] = list(tmp)
        else:
            matrixDna[0] = list(refSeq)
            for i in range(0,len(samples),1):
#                 tmp = popFasta.get_seq(samples[i],row['startGene'],row['endGene']).seq.upper()
                tmp = popFasta.get_spliced_seq(samples[i], coordinates).seq.upper()
                matrixDna[i+1] = list(tmp)
        # Count occurences
        df = pd.DataFrame(matrixDna).transpose()
        for i,r in df.iterrows():
            if('N' in r.values):
                tmp = pd.DataFrame({'m':r.value_counts()['N']},index=[0])
                tmp.to_csv('/home/jmurga/mkt/201903/rawData/dmel/nCall/ncallzi.tab',header=False,index=False,mode='a')
            else:
                continue
    else:
        continue

In [None]:
nCalls = pd.Series(nCallsByPositions).value_counts().reset_index()
nCalls = pd.DataFrame(nCalls)
nCalls = nCalls.sort_values('index')
nCalls['freq'] = nCalls[0].apply(lambda x: x/sum(nCalls[0])*100)
nCalls.columns = ['positions','count','freq']
nCalls.to_csv('/home/jmurga/mkt/201903/rawData/dmel/nCall/ncallea.tab',sep='\t',header=True,index=False)
nCalls

In [None]:
nCalls.to_csv('/home/jmurga/mkt/201903/rawData/dmel/nCall/ByPositionsAllGenes.tab')

### Extracting Derived Allele Frequency and Divergence by population and type of site

Opening cdsCoordinates file to extract fasta sequence using pyfaidx

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --help

#### American populations

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population RAL --sampling 160 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population USI --sampling 15 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population USW --sampling 27 --outgroup dsim

#### African populations

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population CO --sampling 9 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population EA --sampling 10 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population EF --sampling 25 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population EG --sampling 10 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population GA --sampling 7 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population RG --sampling 21 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population SP --sampling 20 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population SD --sampling 30 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population ZI --sampling 154 --outgroup dsim

#### Asia

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population CHB --sampling 12 --outgroup dsim

#### Europe

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population FR --sampling 70 --outgroup dsim

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population NTH --sampling 11 --outgroup dsim

#### Oceania

In [None]:
!python /home/jmurga/mkt/201903/scripts/src/sfsDivergence.py --genes flybaseGenesCleaned.tab --cds cdsCoordinates.tab --population AUS --sampling 14 --outgroup dsim

### Estimating metrics *D. simulans*

#### Extracting mi and m0

In [None]:
dfGenes = pd.read_csv('/home/jmurga/mkt/201903/rawData/dmel/annotations/flybaseGenesCleaned.tab',header = 0,usecols=['id','name','chr','strand'],sep='\t')
cds = pd.read_csv('/home/jmurga/mkt/201903/rawData/dmel/annotations/cdsLargest.tab',header=0,sep='\t')
cds = pd.merge(cds, dfGenes,  how='inner', left_on=['chr','name'], right_on = ['chr','name'])
degeneratePositions = pd.read_csv('/home/jmurga/mkt/201903/rawData/dmel/annotations/zeroFourFoldPositions.tab',header=None,names=['CHROM','POS','degen','type'],sep='\t')

In [None]:
# Rewrite file each execution
columns = ['id','mi','m0']
totalFoldPositions = pd.DataFrame(columns=columns)
for j in cds['chr'].unique():
    print(j)
    chrPositions = degeneratePositions[degeneratePositions['chr']==j]
    for index, row in cds[cds['chr']==j].iterrows():
        print(index,row['id'])
        chrFile = Fasta('/data/shared/dgn/ref/Chr' + row['chr'] +'.fasta')
    #     Convert CDS list into numeric array
        coordinates = np.array(row['coordinates'].split(',')).astype(int).tolist()
        coordinates =  [coordinates[i:i+2] for i in range(0, len(coordinates), 2)]
        # Extract all CDS positions in a list in order to merge with degenerate sequences (same length -> same index)
        positions=[]
        for i in range(0,len(coordinates),1):
            positions.append(list(range(coordinates[i][0],coordinates[i][1]+1)))  
        allPositions = [item for sublist in positions for item in sublist]
        # Extract cds sequences
        seq = chrFile.get_spliced_seq(row['chr'].replace('chr',''), coordinates).seq.upper()
    #     print(len(seq))
        if(row['strand'] == '-'):
            seq = reverseComplement(seq)
            allPositions = allPositions[::-1]
        if((len(seq)/3).is_integer() and seq[0:3]=='ATG'):
            m = degenerate(seq)
            tmp = pd.DataFrame({'POS':allPositions,'m':list(m)})
            tmp['chr']=row['chr']
            tmp = pd.merge(chrPositions,tmp,on=['chr','POS'],how='right')              
            tmp['m'] = tmp.apply(lambda x: '0' if x['newType']=='0fold' else '4' if x['newType']=='4fold' else x['m'],axis=1)

            counts = tmp['m'].value_counts()
            if(counts.shape[0]<4 or '4' not in counts.index):
                m0=0
                mi=counts['0']
            else:
                m0=counts['4']
                mi=counts['0']
            data = pd.DataFrame({'id':row['id'],'mi':mi,'m0':m0},index=[0])
            totalFoldPositions = totalFoldPositions.append(data,ignore_index=True)
        else:
            data = pd.DataFrame({'id':row['id'],'mi':0,'m0':0},index=[0])
            totalFoldPositions = totalFoldPositions.append(data,ignore_index=True)

#### Extract by population Derived Allele Frequency and Divergence in a file to format by functional class and populations

In [6]:
degeneratePositions = pd.read_csv('/home/jmurga/mkt/201903/rawData/dmel/annotations/zeroFourFoldPositions.tab',header=None,names=['CHROM','POS','degen','type'],sep='\t')

In [7]:
pops=['AUS','CHB','CO','EA','EF','EG','FR','GA','NTH','RAL','RG','SD','SP','USI','USW','ZI']    

for p in pops:
    print(p)
    popSites = pd.read_csv(DATA + '/alleleFrequencies/dsim/' + 'dsimDmelSites' + p + '.tab',sep='\t',header=None,names=['id','chr','POS','div','rawDerivedAllele','type','pop'])
        
    popSites.to_csv(DATA+ '/alleleFrequencies/dsim/dsimDmelSites.tab',sep='\t',index=False,header=False,mode='a')

AUS
CHB
CO
EA
EF
EG
FR
GA
NTH
RAL
RG
SD
SP
USI
USW
ZI


In [42]:
dmelSites = pd.read_csv(DATA+ '/alleleFrequencies/dsim/dsimDmelSites.tab', sep='\t', header=None)
dmelSites.columns = ['id','CHROM','POS','div','rawDerivedAllele','type','pop']
dmelSites = dmelSites.drop('type',axis=1)
dmelSites.head()

Unnamed: 0,id,CHROM,POS,div,rawDerivedAllele,pop
0,FBgn0000018,2L,10975207,1,0.0,AUS
1,FBgn0000018,2L,10975203,1,0.0,AUS
2,FBgn0000018,2L,10975191,1,0.0,AUS
3,FBgn0000018,2L,10975141,0,0.428571,AUS
4,FBgn0000018,2L,10975091,1,0.0,AUS


In [43]:
dmelSites = pd.merge(dmelSites,degeneratePositions,how='left',left_on=['CHROM','POS'],right_on=['CHROM','POS'])
dmelSites = dmelSites[dmelSites['type']!='mixed']
dmelSites.head()

Unnamed: 0,id,CHROM,POS,div,rawDerivedAllele,pop,degen,type
0,FBgn0000018,2L,10975207,1,0.0,AUS,4,4fold
1,FBgn0000018,2L,10975203,1,0.0,AUS,0,0fold
2,FBgn0000018,2L,10975191,1,0.0,AUS,0,0fold
3,FBgn0000018,2L,10975141,0,0.428571,AUS,4,4fold
4,FBgn0000018,2L,10975091,1,0.0,AUS,0,0fold


#### Divergence

In [45]:
div = dmelSites.groupby(['id','type','pop'])['div'].sum().reset_index()
div = div.pivot_table(index=['id','pop'],columns=['type'],values='div').reset_index()
div.columns = ['id','pop','di','d0']

In [46]:
div.head()

Unnamed: 0,id,pop,di,d0
0,FBgn0000008,CHB,27.0,34.0
1,FBgn0000008,CO,25.0,33.0
2,FBgn0000008,EA,27.0,36.0
3,FBgn0000008,EF,27.0,33.0
4,FBgn0000008,EG,27.0,36.0


In [47]:
print(div[div['pop']=='RAL'].di.sum())
print(div[div['pop']=='RAL'].d0.sum())

253685.0
236315.0


#### Derived Allele Frequency

In [49]:
daf = dmelSites[['id','rawDerivedAllele','type','pop']][dmelSites['rawDerivedAllele']!=0]

bins = np.arange(0,1.05,0.05)
labels =  np.arange(0.05,1.05,0.05)

daf['categories'] = pd.cut(daf['rawDerivedAllele'],bins=bins,labels=labels)

sfs = daf.groupby(['id','type','categories','pop']).count().reset_index()
sfs['rawDerivedAllele'] = sfs['rawDerivedAllele'].fillna(0).astype(int)
sfs = sfs.groupby(['id','pop','type'])['rawDerivedAllele'].apply(list).reset_index()

sfs = sfs.pivot_table(index=['id','pop'], columns=['type'],values='rawDerivedAllele',aggfunc=lambda x:x).reset_index()
sfs['p0'] = sfs['4fold'].apply(lambda x: sum(x))
sfs['pi'] = sfs['0fold'].apply(lambda x: sum(x))

sfs['0fold'] = sfs['0fold'].apply(lambda x:';'.join(map(str,x)))
sfs['4fold'] = sfs['4fold'].apply(lambda x:';'.join(map(str,x)))

sfs.columns = ['id','pop','daf0f','daf4f','p0','pi']

In [50]:
sfs[(sfs['pop']=='RAL')].head()

Unnamed: 0,id,pop,daf0f,daf4f,p0,pi
9,FBgn0000008,RAL,30;7;4;0;1;0;0;0;1;1;0;0;1;0;0;0;0;0;0;0,24;5;3;3;0;0;1;1;1;0;1;1;0;1;0;0;1;0;0;0,42,45
25,FBgn0000014,RAL,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,1;0;1;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1;0,4,2
41,FBgn0000015,RAL,3;0;0;0;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0,5;0;3;1;0;1;0;0;0;0;1;0;0;1;1;0;0;0;0;0,13,4
57,FBgn0000017,RAL,12;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,14;2;0;1;0;0;0;0;0;0;0;0;0;0;2;0;0;0;0;2,21,12
73,FBgn0000018,RAL,5;0;0;0;0;1;0;0;1;1;0;0;2;1;0;0;0;0;0;0,5;3;1;0;0;0;2;0;1;1;0;0;0;0;0;0;0;0;0;0,13,11


In [51]:
print(sfs[sfs['pop']=='RAL'].pi.sum())
print(sfs[sfs['pop']=='RAL'].p0.sum())

130978
167254


#### Merge info

In [52]:
PopFlyData = pd.merge(sfs,div,on=['id','pop'],how='outer')

In [53]:
dfGenes = pd.read_csv(DATA + '/annotations/flybaseGenesCleaned.tab',sep='\t',header=0)
pops=['AUS','CHB','CO','EA','EF','EG','FR','GA','NTH','RAL','RG','SD','SP','USI','USW','ZI']                                   
genesByPop = pd.DataFrame()

for p in pops:
    
    dfGenes['pop'] = p
    genesByPop = genesByPop.append(dfGenes)
    
genesByPop = genesByPop.reset_index(drop=True)

In [54]:
pops=['AUS','CHB','CO','EA','EF','EG','FR','GA','NTH','RAL','RG','SD','SP','USI','USW','ZI']                                   
refAnalizableSites = pd.read_csv(DATA+'/refAnalyzableSites.tab',sep='\t',header=0)

totalFoldPositionsByPop = pd.DataFrame()
for p in pops:
    print(p)
    refAnalizableSites['pop'] = p
    
    totalFoldPositionsByPop = totalFoldPositionsByPop.append(refAnalizableSites)

totalFoldPositionsByPop = totalFoldPositionsByPop.reset_index(drop=True)

AUS
CHB
CO
EA
EF
EG
FR
GA
NTH
RAL
RG
SD
SP
USI
USW
ZI


In [55]:
genesByPop = pd.merge(genesByPop,totalFoldPositionsByPop,on=['id','pop'],how='outer')

In [56]:
PopFlyData = pd.merge(PopFlyData,genesByPop,on=['id','pop'],how='outer')
PopFlyData[PopFlyData['daf0f'].isna()].loc[:,'daf0f'] = '0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0'
PopFlyData[PopFlyData['daf4f'].isna()].loc[:,'daf4f'] = '0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0'
PopFlyData['daf0f'] = PopFlyData['daf0f'].fillna('0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0')
PopFlyData['daf4f'] = PopFlyData['daf4f'].fillna('0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0')
PopFlyData = PopFlyData.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [57]:
dfRecomb = pd.read_csv(DATA + '/genesRecombination.tab',sep='\t')
dfRecomb.columns = ['id','recomb']
PopFlyData = pd.merge(PopFlyData,dfRecomb,on=['id'])

In [58]:
PopFlyData.head()

Unnamed: 0,id,pop,daf0f,daf4f,p0,pi,di,d0,chr,startGene,endGene,strand,name,mi,m0,recomb
0,FBgn0000008,AUS,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,0.0,0.0,0.0,0.0,2R,18024473,18060339,+,a,2539,667,2.169284
1,FBgn0000008,CHB,0;3;0;1;1;0;0;0;0;0;0;0;0;0;1;0;1;0;1;0,0;4;0;1;1;0;0;0;1;1;0;2;0;0;0;0;1;0;2;0,13.0,8.0,27.0,34.0,2R,18024473,18060339,+,a,2539,667,2.169284
2,FBgn0000008,CO,0;0;11;0;0;0;0;0;1;0;0;0;0;0;0;1;0;0;0;0,0;0;6;0;6;0;3;0;6;0;0;0;0;1;0;2;0;0;0;0,24.0,13.0,25.0,33.0,2R,18024473,18060339,+,a,2539,667,2.169284
3,FBgn0000008,EA,0;11;0;1;0;0;0;1;0;1;0;0;0;0;0;0;0;1;0;0,0;10;0;6;0;3;0;2;0;0;0;1;0;1;0;1;0;0;0;0,24.0,15.0,27.0,36.0,2R,18024473,18060339,+,a,2539,667,2.169284
4,FBgn0000008,EF,8;3;3;0;1;0;0;0;0;0;0;0;0;0;0;1;0;0;0;0,11;5;1;8;1;3;1;1;2;0;0;2;0;0;0;0;1;0;0;0,36.0,16.0,27.0,33.0,2R,18024473,18060339,+,a,2539,667,2.169284


In [None]:
PopFlyData = PopFlyData[['id','pop','daf0f','daf4f','p0','pi','di','d0','chr','name','mi','m0','recomb']]

In [62]:
PopFlyData.columns = ['Name','Pop','DAF0f','DAF4f','p0','pi','di','d0','chr','symbol','mi','m0','recomb']
PopFlyData.head()

Unnamed: 0,Name,Pop,DAF0f,DAF4f,p0,pi,di,d0,chr,symbol,mi,m0,recomb
0,FBgn0000008,AUS,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,0.0,0.0,0.0,0.0,2R,a,2539,667,2.169284
1,FBgn0000008,CHB,0;3;0;1;1;0;0;0;0;0;0;0;0;0;1;0;1;0;1;0,0;4;0;1;1;0;0;0;1;1;0;2;0;0;0;0;1;0;2;0,13.0,8.0,27.0,34.0,2R,a,2539,667,2.169284
2,FBgn0000008,CO,0;0;11;0;0;0;0;0;1;0;0;0;0;0;0;1;0;0;0;0,0;0;6;0;6;0;3;0;6;0;0;0;0;1;0;2;0;0;0;0,24.0,13.0,25.0,33.0,2R,a,2539,667,2.169284
3,FBgn0000008,EA,0;11;0;1;0;0;0;1;0;1;0;0;0;0;0;0;0;1;0;0,0;10;0;6;0;3;0;2;0;0;0;1;0;1;0;1;0;0;0;0,24.0,15.0,27.0,36.0,2R,a,2539,667,2.169284
4,FBgn0000008,EF,8;3;3;0;1;0;0;0;0;0;0;0;0;0;0;1;0;0;0;0,11;5;1;8;1;3;1;1;2;0;0;2;0;0;0;0;1;0;0;0,36.0,16.0,27.0,33.0,2R,a,2539,667,2.169284


In [63]:
PopFlyData.to_csv('/home/jmurga/mkt/201903/results/dsimDmelManual.tab',sep='\t',header=True,index=False)