Skip to content
Permalink
Browse files

allow opb detection of PS

  • Loading branch information
lgueguen committed Mar 4, 2021
1 parent f815d2d commit 18ca78770b839941709b0f12716ffa201f81e8b6
Showing with 215 additions and 133 deletions.
  1. +2 −1 README.md
  2. +87 −5 lib/BranchAnalysis.py
  3. +8 −8 lib/Init.py
  4. +1 −6 lib/PSPFunc.py
  5. +6 −12 lib/PosSelFunc.py
  6. +111 −101 lib/SiteAnalysis.py
@@ -205,7 +205,8 @@ mixedlikelihood:
# Option for using BIO++ for the detection of branches under positive selection
# If True, parameter file will be automatically generated
# Can be used to indicate the path to a BIO++ parameter file
# Positive selection on each is assessed through LRT M2 vs M1 model in bio++.
# Parameters different from omega are shared between all branches.
opb:
```

@@ -1,6 +1,9 @@
import os, shlex, logging, subprocess, re
from AnalysisFunc import cmd
from subprocess import PIPE
from shutil import copyfile
from SiteAnalysis import getNewParfromOptim, setIgnoreParams
import PSPFunc

def bppBranch(OPBFile, outDir, baseName, alnFile, alnFormat, treeFile, logger):
### BRANCH ANALYSIS: BIO++ ONE PER BRANCH
@@ -12,10 +15,12 @@ def bppBranch(OPBFile, outDir, baseName, alnFile, alnFormat, treeFile, logger):
if not os.path.exists(outOPB):
os.makedirs(outOPB)

model = "M2"

outFileName = outOPB+baseName
outTree = outFileName+"_branch.dnd"
outParams = outFileName+"_branch.params"
outBackup = outFileName+"_branch_optimization"
outTree = outFileName+"_"+model+".dnd"
outParams = outFileName+"_"+model+".params"
outBackup = outFileName+"_optimization_"+model

# create dictionary with all elements of the two argument lists to build commands
dBppCmd = {"INPUTFILE": alnFile,
@@ -24,15 +29,92 @@ def bppBranch(OPBFile, outDir, baseName, alnFile, alnFormat, treeFile, logger):
"OUTTREE": outTree,
"OUTPARAMS": outParams,
"BACKUP": outBackup,
"param": OPBFile}

"model1":"YNGP_"+model+"(frequencies=F3X4(initFreqs=observed))",
"param": OPBFile,
"process1":"OnePerBranch(model=1, tree=1, rate=1, root_freq=1, shared_parameters=(*kappa, *Full.theta*))"}
# running bppml
logger.info("Running Branch optimization")

### look for previous M0 optim
outSite = outDir+"bpp_site/"
outSiteFileName = outSite+baseName

lModels=["M0", "M2"]
dPrevModelLog = {model:outSiteFileName+"_optimization_"+model for model in lModels}
prevmodel, dnewpar = getNewParfromOptim(model, lModels, dPrevModelLog, logger)
lignore=[]
if prevmodel!="":
fnew=open(outBackup,"w")
for k,v in dnewpar.items():
fnew.write(k+"="+v.strip()+"\n")
fnew.close()
lignore= setIgnoreParams(model, prevmodel, lModels, logger)

dBppCmd["IGNORE"]=",".join(lignore)

# join each couple of the cmd dictionary so that it reads "k1 = v1" "k2 = v2" etc...
argsOPB = "bppml \""+"\" \"".join([k+"="+v for k, v in dBppCmd.items()])+"\""
logger.debug(argsOPB)
runOPB = cmd(argsOPB, False)

# test each branch
# Scan all parameter names
fback = open(dBppCmd["BACKUP"]+".def","r")
dparam={param.split("=")[0]:float(param.split("=")[1]) for param in fback.readlines()}
fback.close()

valM2=float(dparam["f(x)"])

# cp outParams for each branch with M2 replaced with M1
# only for the where theta1 < 0.999 & theta2 < 0.999

fparam = open(outParams, "r")
lcmd=[l for l in fparam.readlines()]
fparam.close()

## Look for correspondance model_nb <-> node_id
lprocess=[l for l in lcmd if l[:7]=="process"][0]
lid=lprocess.split(".nodes_id=(")
cormodid={}
for i in range(0,len(lid),2):
mod=int(lid[i][lid[i].rfind("l")+1:])
idi=int(lid[i+1][:lid[i+1].find(")"):])
cormodid[idi]=mod

## Compute lk for each node with theta1_mod * theta2_mod < 0.999:
fresbranch=open(outFileName+"_branch.txt","w")
fresbranch.write("Id\tomega2\tprop\tM2\tM1\tLR\tp\n")
del(dBppCmd["process1"])
del(dBppCmd["model1"])
for idi,mod in cormodid.items():
if dparam["YNGP_M2.theta2_%d"%mod] * dparam["YNGP_M2.theta1_%d"%mod] >= 0.999:
continue
fback = open(outBackup+"_%d"%mod,"w")
[fback.write(key+"="+str(val)+"\n") for key,val in dparam.items() if key!="YNGP_M2.theta2_%d"%mod]
fback.write("YNGP_M2.theta2_%d=1\n"%mod)
fback.write("YNGP_M2.omega2_%d=1\n"%mod)
fback.close()

lignore2=lignore[:]+[key for key in dparam if key not in ["YNGP_M2.theta1_%d"%mod, "YNGP_M2.omega0_%d"%mod]]
dBppCmd["IGNORE"]=",".join(lignore2)
dBppCmd["params"] = outParams
dBppCmd["OUTPARAMS"] = outParams+"_%d"%idi
dBppCmd["BACKUP"] = outBackup+"_%d"%mod

argsOPB = "bppml \""+"\" \"".join([k+"="+v for k, v in dBppCmd.items()])+"\""
# logger.debug(argsOPB)
runOPB = cmd(argsOPB, False)

fback = open(dBppCmd["BACKUP"]+".def","r")
dparam2={param.split("=")[0]:float(param.split("=")[1]) for param in fback.readlines()}
fback.close()

valM1=float(dparam2["f(x)"])
LR, p = PSPFunc.LRT(valM2,valM1,2)
fresbranch.write("%d\t%f\t%f\t%f\t%f\t%f\t%f\n"%(idi,dparam["YNGP_M2.omega2_%d"%mod],(1-dparam["YNGP_M2.theta2_%d"%mod]) * (1-dparam["YNGP_M2.theta1_%d"%mod]),valM2,valM1,LR,p))
if p<0.05:
logger.info("Node {:d} is interesting (w = {:f})".format(idi, dparam["YNGP_M2.omega2_%d"%mod]))
fresbranch.close()
return(outParams)

def parseNodes(outParams, logger):
@@ -213,7 +213,7 @@ def paramDef(params, inf, queryName, outdir):
print(M + " isn't a valid model.")
else:
ltemp.append(M)
dParams[opt] = ",".join(ltemp)
dParams["models"] = ",".join(ltemp)

elif dParams["step"] == "positiveSelection":
print("Error: positiveSelection option set to false and step set to positiveSelection.")
@@ -317,13 +317,13 @@ def initLogger(args, debug, version):
logger.info("Starting {:s} (v{:s})".format(__file__, version))

mainData = DataObject.basicData(args["infile"],
args["outdir"],
args["blastdb"],
timeStamp,
args["sptree"],
args["alnfile"],
args["treefile"],
args["queryName"])
args["outdir"],
args["blastdb"],
timeStamp,
args["sptree"],
args["alnfile"],
args["treefile"],
args["queryName"])

if args["step"] == "duplication" and args["duplication"] == False:
args["duplication"] = True
@@ -8,7 +8,7 @@ def getParams(models, paml, bppml, mixed, Busted, Meme, opb, gnh):
dCtrls = {}
lModels = []
if models != "":
if bppml not in ["", "False", False] and mixed not in ["", "False", False]:
if bppml not in ["", "False", False]:# and mixed not in ["", "False", False]:
dCtrls["bppml"] = bppml
dCtrls["bppmixedlikelihood"] = mixed
if paml not in ["", "False", False]:
@@ -92,11 +92,6 @@ def pspFileCreation(path, option):
dparams["model1"] = "$(MODEL)"
dparams["process1"] = "Homogeneous(model=1, tree=1, rate=1, root_freq=1)"
dparams["scenario1"] = "split(model=1)"
elif option == "opb":
dparams["nonhomogeneous"] = "one_per_branch"
dparams["model"] = "YNGP_M0(frequencies=F3X4(initFreqs=observed))"
dparams["process1"] = "OnePerBranch(model=1, tree=1, rate=1, shared_parameters=(YN98.kappa, YN98.123_*))"
dparams["nonhomogeneous_one_per_branch.shared_parameters"] = "YN98.kappa, YN98.*theta*"
elif option == "gnh":
dparams["nonhomogeneous"] = "general"
dparams["nonhomogeneous.number_of_models"] = "2"
@@ -33,8 +33,6 @@ def pspAnalysis(data, parms, aln, tree):
logger.info("Alignement is in {:s} format.".format(data.alnFormat))
logger.info("Tree: {:s}".format(tree))

nodes = PSPFunc.nbNode(tree, logger)

### Run the different analysis as determined by control file
logger.info("Starting positive selection analyses.")
logger.info("POSITIVE SELECTION ANALYSIS: ")
@@ -85,26 +83,22 @@ def pspAnalysis(data, parms, aln, tree):

lPSNodes = []
if "OPB" in dCtrls:
try:
# try:
params = BranchAnalysis.bppBranch(dCtrls["OPB"],
outDir,
data.baseName,
aln,
data.alnFormat,
tree,
logger)
except Exception:
logger.error("Bio++ Branch Analysis encountered an unexpected error, skipping.")
try:
lPSNodes = BranchAnalysis.parseNodes(params, logger)
except Exception:
logger.error("Could not find info about positively selected branches in Bio++ results. If unexpected, check Bio++ parameter files for OPB.")
# except Exception:
# logger.error("Bio++ Branch Analysis encountered an unexpected error, skipping.")

if "OPB" and "GNH" in dCtrls and len(lPSNodes) > 1:
try:
# try:
BranchAnalysis.bppBranchSite(dCtrls["GNH"], lPSNodes, outDir, data.baseName, aln, data.alnFormat, tree, logger)
except Exception:
logger.error("Bio++ Pseudo Branch-Site Analysis encountered an unexpected error, skipping.")
# except Exception:
# logger.error("Bio++ Pseudo Branch-Site Analysis encountered an unexpected error, skipping.")

if "paml" in dCtrls:
SiteAnalysis.pamlSite(aln,

0 comments on commit 18ca787

Please sign in to comment.