In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import urllib.parse
import urllib.request
from time import sleep
import os
import subprocess
import time
from scipy import stats

In [2]:
def uniprotMapping(query, From="ACC",To="ACC",Format="fasta",Columns=""):
    url = 'https://www.uniprot.org/uploadlists/'
    params={
        "query":query,
        "from":From,
        "to":To,
        "format":Format,
        "columns":Columns,
    }
    data = urllib.parse.urlencode(params)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = str(f.read(),encoding="utf-8")
    return response

def extractIdFasta(fastaStr,Id):
    newFastaList=["",""]
    for line in (fastaStr+" ").split("\n"):
        if line.startswith(">"):
            if line.replace(">","")==Id:
                i=0
            else:
                i=1
        newFastaList[i]+=line+"\n"
    return newFastaList[0]

def clustalo(inputFasta):
    tempInputFile="{}.fasta".format(time.time())
    with open(tempInputFile,"w") as f:
        f.write(inputFasta)
    tempOutputFile="{}.fasta".format(time.time())
    
    command = "clustalo -i {} -o {}".format(tempInputFile,tempOutputFile)
    cmd = subprocess.Popen(command,
                           shell=True,
                           stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT)
    cmd.communicate()
    
    with open(tempOutputFile) as f:
        output = f.read()
    
    for file in [tempInputFile,tempOutputFile]:
        os.remove(file)
    
    return output
    

def cdhit(inputFasta, identity=0.9):
    tempInputFile="{}.fasta".format(time.time())
    with open(tempInputFile,"w") as f:
        f.write(inputFasta)
    tempOutputFile="{}.fasta".format(time.time())
    tempOutputFile2=tempOutputFile+".clstr"
    
    command = "cd-hit -i {} -o {} -c {}".format(tempInputFile,tempOutputFile,identity)
    cmd = subprocess.Popen(command,
                           shell=True,
                           stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT)
    cmd.communicate()
    
    with open(tempOutputFile) as f:
        output = f.read()
        
    for file in [tempInputFile,tempOutputFile,tempOutputFile2]:
        os.remove(file)
        
    return output

def simplifyIds(fastaStr):
    newFastaStr=""
    for line in fastaStr.split("\n"):
        if line.startswith(">"):
            newFastaStr+=">{}\n".format(line.split("|")[1])
        else:
            newFastaStr+=line+"\n"
    return newFastaStr

def cutSpFromMsa(msaStr,signalPeptideLength, Id):
    def fastaStr2Lol(fastaStr):
        lol=[entry.replace("\n","toSplit",1).replace("\n","").split("toSplit")
            for entry
            in fastaStr[1:].split(">")]
        return [(entry.split("|")[:2][-1],seq) for entry,seq in lol]
    
    def lol2dic(lol):
        return {key:value for key,value in lol}

    def lol2FastaStr(lol):
        return ">"+"\n>".join(map("\n".join,lol))+"\n"

    def dic2FastaStr(dic):
        fastaStr=""
        for key,value in dic.items():
            fastaStr+=">{}\n{}\n".format(key,value)
        return fastaStr

    def findRelPosInMsa(msaDic, Id, pos):
        seq = msaDic[Id]
        msaPos=0
        i=0
        while i<=pos:
            msaPos+=1
            if seq[msaPos]!="-":
                i+=1
        return msaPos

    def cutFromMsa(msaDic, newStart):
        return {key:value[newStart:] for key,value in msaDic.items()}
    
    lol = fastaStr2Lol(msaStr)
    dic = lol2dic(lol)
    relPos = findRelPosInMsa(dic,Id, signalPeptideLength)
    newDic=cutFromMsa(dic,relPos)
    return dic2FastaStr(newDic)

def getMsaPeridictions(msaString):
    tempInputFile="{}.MSA.fasta".format(time.time())
    with open(tempInputFile,"w") as f:
        f.write(msaString)
    tempOutputFile="{}.MSA.pred".format(time.time())
    
    command = "python2 efoldminePredictionsMsa.py {} {}".format(tempInputFile,tempOutputFile)
    cmd = subprocess.Popen(command,
                           shell=True,
                           stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT)
    cmd.communicate()
    
    with open(tempOutputFile) as f:
        msaPredictionString= f.read()
    
    for file in [tempInputFile, tempOutputFile]:
        os.remove(file)
        
    return msaPredictionString

def msaPredictions2Dic(msaPredictionString):
    msaPredictionDic=dict()
    for line in msaPredictionString.split("\n"):
        if line.startswith(">"):
            Id=line.replace(">","")
            msaPredictionDic[Id]=dict()
        else:
            if line:
                feature, featureSequence = line.split("\t")
                if feature=="sequence":
                    msaPredictionDic[Id][feature]=featureSequence.split(" ")
                else:
                    msaPredictionDic[Id][feature]=[float(elm) 
                                                   for elm 
                                                   in featureSequence.split(" ")]
    return msaPredictionDic

def msaPredictionsDic2Arrays(msaPredictionDic):
    Ids =  sorted(msaPredictionDic.keys())
    features = sorted(msaPredictionDic[Ids[0]].keys())
    cols = len(msaPredictionDic[Ids[0]][features[0]])
    rows = len(Ids)
    
    dicOfArrays=dict()
    for feature in features:
        if feature=="sequence":
            dicOfArrays[feature]=np.zeros((rows,cols),dtype=str)
            dicOfArrays[feature][:,:]="-"
        else:
            dicOfArrays[feature]=np.zeros((rows,cols),dtype=float)
            dicOfArrays[feature][:,:]=np.nan
            
    for i,Id in enumerate(Ids):
        for feature in features:
            dicOfArrays[feature][i,:]=msaPredictionDic[Id][feature]
    return dicOfArrays

def allignTwins(CYTO_ID, PERI_ID, SP_END):
    print()
    print("Compare Twins:", CYTO_ID,"and",PERI_ID)
    print("Retrieve Cytoplasmic homologues")
    CYTO_UNIREF50 = uniprotMapping(CYTO_ID, From="ACC", To="NF50", Format="list")
    CYTO_SEQS = uniprotMapping(CYTO_UNIREF50, From="NF50", To="ACC", Format="fasta")
    CYTO_SEQS = simplifyIds(CYTO_SEQS)
    CYTO_SEQS_MAIN = extractIdFasta(CYTO_SEQS,CYTO_ID)
    CYTO_IDS=[LINE.replace(">","") for LINE in CYTO_SEQS.split("\n") if LINE.startswith(">")]
    print("Retrieve Periplasmic homologues")
    PERI_UNIREF50 = uniprotMapping(PERI_ID, From="ACC", To="NF50", Format="list")
    PERI_SEQS = uniprotMapping(PERI_UNIREF50, From="NF50", To="ACC", Format="fasta")
    PERI_SEQS = simplifyIds(PERI_SEQS)
    PERI_SEQS_MAIN = extractIdFasta(PERI_SEQS,PERI_ID)
    PERI_IDS=[LINE.replace(">","") for LINE in PERI_SEQS.split("\n") if LINE.startswith(">")]
    print("reduce Cytoplasmic redundance with cdhit")
    CYTO_MAIN_PLUS_CDHIT=CYTO_SEQS_MAIN+cdhit(CYTO_SEQS)
    print("reduce Periplasmic redundance with cdhit")
    PERI_MAIN_PLUS_CDHIT=PERI_SEQS_MAIN+cdhit(PERI_SEQS)
    print("generate Cytoplasmic MSA with clustalo")
    CYTO_MSA=clustalo(CYTO_MAIN_PLUS_CDHIT)
    print("generate Periplasmic MSA with clustalo")
    PERI_MSA=clustalo(PERI_MAIN_PLUS_CDHIT)
    PERI_MSA_CUT=cutSpFromMsa(PERI_MSA,SP_END, PERI_ID)
    print("Allign cytoplasmic and periplasmic proteins to each other with clustalo")
    ALL_MSA=clustalo(PERI_MSA_CUT+CYTO_MSA)
    ALL_MSA_CUT=cutSpFromMsa(ALL_MSA,0,PERI_ID)
    print("Generate predictions")
    ALL_MSA_PREDICTIONS=getMsaPeridictions(ALL_MSA_CUT)
    ALL_MSA_PREDICTIONS_DIC = msaPredictions2Dic(ALL_MSA_PREDICTIONS)
    CYTO_MSA_PREDICTIONS_DIC = {KEY:VALUE 
                                for KEY,VALUE 
                                in ALL_MSA_PREDICTIONS_DIC.items() 
                                if KEY in CYTO_IDS}
    PERI_MSA_PREDICTIONS_DIC = {KEY:VALUE 
                                for KEY,VALUE 
                                in ALL_MSA_PREDICTIONS_DIC.items() 
                                if KEY in PERI_IDS}
    print("generate arrays")
    CYTO_MSA_PREDICTIONS_ARRAYS = msaPredictionsDic2Arrays(CYTO_MSA_PREDICTIONS_DIC)
    PERI_MSA_PREDICTIONS_ARRAYS = msaPredictionsDic2Arrays(PERI_MSA_PREDICTIONS_DIC)
    ALL_MSA_PREDICTIONS_ARRAYS=dict()
    ALL_MSA_PREDICTIONS_ARRAYS["Cytoplasm"]=CYTO_MSA_PREDICTIONS_ARRAYS
    ALL_MSA_PREDICTIONS_ARRAYS["Periplasm"]=PERI_MSA_PREDICTIONS_ARRAYS
    return ALL_MSA_PREDICTIONS_ARRAYS

def wilcoxonRanksumTest(arrays,pValue=5e-2,minDataSize=10):
    groups = sorted(arrays.keys())
    cytoplasm=groups[0]
    periplasm=groups[1]
    features = [feature for feature in arrays[groups[0]].keys() if feature!="sequence"]
    rows, cols = arrays[groups[0]][features[0]].shape
    
    wilcoxonResults=dict()
    
    for feature in features:
        wilcoxonResults[feature]=list()
        for i in range(cols):
            cytoData=[value for value in arrays[cytoplasm][feature][:,i] if not np.isnan(value)]
            periData=[value for value in arrays[periplasm][feature][:,i] if not np.isnan(value)]
            if len(cytoData)<minDataSize or len(periData)<minDataSize:
                wilcoxonResults[feature].append(float("nan"))
            else:
                statistic, pValue_observed = stats.ranksums(cytoData,periData)
                if pValue_observed > pValue:
                    wilcoxonResults[feature].append(0)
                else:
                    wilcoxonResults[feature].append(statistic)
    return wilcoxonResults
                
def writeWilcoxonResults(file,wilcoxonResults,cyto_id, peri_id):
    f=open(file,"a")
    f.write(">{}|{}\n".format(cyto_id, peri_id))
    for feature,sequence in wilcoxonResults.items():
        sequenceString=" ".join([str(elm) for elm in sequence])
        f.write("{}\t{}\n".format(feature,sequenceString))
    f.close()

In [1]:
TWINS = pd.read_csv("twins_very_strict.tab",sep="\t")
TWINS

NameError: name 'pd' is not defined

In [None]:
INPUT_DATA=list()
for i,Series in tqdm(sorted(TWINS.iterrows(),reverse=False)):
    INPUT_DATA.append((Series["Cytoplasm"],Series["Periplasm"],Series["SP_end"]))
INPUT_DATA

In [6]:
I=0
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: A0A169XQ31 and A0A2I8WC74
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [21]:
I=1
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: Q9ZGM4 and Q9WXB9
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [20]:
I=2
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: P23869 and P0AFL3
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [11]:
I=3
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: P12994 and P77368
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions


FileNotFoundError: [Errno 2] No such file or directory: '1589534633.9599469.MSA.pred'

In [12]:
I=4
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: P0A962 and P00805
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions


FileNotFoundError: [Errno 2] No such file or directory: '1589534741.9817603.MSA.pred'

In [9]:
I=5
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: A0A1C3H8R2 and A0A2V4G4C5
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [14]:
I=6
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: A0A3R0NUQ6 and P0AAL5
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [15]:
I=7
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: A0A193QGU2 and Q2NVU4
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [16]:
I=8
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: A0A4V0HDT9 and A0A4Q8ES36
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [17]:
I=9
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: Q07YY9 and Q07WU7
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [18]:
I=10
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: P0A9L3 and P45523
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [19]:
I=11
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: P21517 and P25718
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays


In [8]:
I=12
CYTO_ID, PERI_ID, SP_END =INPUT_DATA[I]
ARRAYS = allignTwins(CYTO_ID, PERI_ID, SP_END)
WILCOXON = wilcoxonRanksumTest(ARRAYS)
WILCOXON_FILE="wilcoxon_results_{}_{}.txt".format(CYTO_ID, PERI_ID)
writeWilcoxonResults(WILCOXON_FILE, WILCOXON,CYTO_ID,PERI_ID)


Compare Twins: Q7M8K1 and Q7M827
Retrieve Cytoplasmic homologues
Retrieve Periplasmic homologues
reduce Cytoplasmic redundance with cdhit
reduce Periplasmic redundance with cdhit
generate Cytoplasmic MSA with clustalo
generate Periplasmic MSA with clustalo
Allign cytoplasmic and periplasmic proteins to each other with clustalo
Generate predictions
generate arrays
