In [2]:
# import 
import shutil
import os
import datetime

# define general functions
def makeDir(name):
    """Create an empty directory at ``name``, replacing any existing tree.

    Parameters
    ----------
    name : str
        Path of the directory to (re)create.

    Returns
    -------
    str
        ``name`` unchanged, so the call can be used directly in an assignment.

    Raises
    ------
    OSError
        If an existing tree cannot be removed or the directory cannot be
        created.
    """
    try:
        shutil.rmtree(name)
    except FileNotFoundError:
        # Nothing to clear (e.g. first run). Any other failure is allowed to
        # surface here instead of being swallowed and resurfacing later as a
        # confusing error from os.makedirs.
        pass
    os.makedirs(name)
    return name

def makeBackup(name):
    """Copy the directory tree ``name`` to a timestamped sibling directory.

    The backup path is ``name + ".bu." + timestamp``, where the timestamp is
    ``str(datetime.datetime.now())`` with every ":" replaced by ".".

    The copy is best effort: if it fails, a warning is issued instead of an
    exception so the surrounding pipeline keeps running.

    Parameters
    ----------
    name : str
        Path of the directory to back up.

    Returns
    -------
    str
        The backup path. It is returned even when the copy failed, in which
        case the path may not exist.
    """
    import warnings  # local import: only needed on the failure path

    moment = str(datetime.datetime.now()).replace(":", ".")
    backup = name + ".bu." + moment
    try:
        shutil.copytree(name, backup)
    except (OSError, shutil.Error) as error:
        # Report the failure rather than hiding it; interrupts and
        # programming errors are no longer swallowed.
        warnings.warn(
            "Backup of {!r} to {!r} failed: {}".format(name, backup, error)
        )
    return backup

# 1 Download periplasmic and cytoplasmic proteins

First we have to download all the periplasmic and cytoplasmic proteins from Uniprot that potentially have a twin.

In [2]:
# Import
from uniprotRetrieve import uniprotRetrieve

# Fresh output directory for the raw UniProt downloads
DIR1 = makeDir("1.downloadProteins")

# Both downloads use the same output format and column selection
FORMAT = "tab"
COLUMNS = "id,organism,ec,sequence"

# --- Cytoplasmic proteins ---------------------------------------------------
# Taxonomic scopes listed by the original author (narrow -> broad):
#   Enterobacteriaceae, Enterobacterales, Gammaproteobacteria,
#   Proteobacteria, Bacteria
# The query below uses the broadest one (Bacteria) and asks for an
# experimentally evidenced cytoplasm/cytosol location without a signal
# annotation.
QUERY = (
    "taxonomy:Bacteria "
    "(locations:(location:cytoplasm evidence:experimental) "
    "OR locations:(location:cytosol evidence:experimental)) "
    "NOT annotation:(type:signal)"
)
FILENAME = "proteinsCytoplasmic.tab"
OUTPUT = "/".join((DIR1, FILENAME))
uniprotRetrieve(OUTPUT, format=FORMAT, query=QUERY, columns=COLUMNS)

# --- Periplasmic proteins ---------------------------------------------------
# Experimentally evidenced periplasm location together with a signal
# annotation.
QUERY = (
    "taxonomy:Bacteria "
    "locations:(location:periplasm evidence:experimental) "
    "annotation:(type:signal)"
)
FILENAME = "proteinsPeriplasmic.tab"
OUTPUT = "/".join((DIR1, FILENAME))
uniprotRetrieve(OUTPUT, format=FORMAT, query=QUERY, columns=COLUMNS)

# Snapshot the download directory
BACKUP = makeBackup(DIR1)

# 2 Extract common organisms and generate FASTAs

We want to check for structural Twins (one in the periplasm and one in the cytoplasm) in the same organism.
Therefore, the script looks at all the available proteins and checks for which organisms both cytoplasmic and periplasmic proteins are available.
Only for those, fasta files are generated.

In [3]:
# Import
import generateFastas

# Inputs: the two tab files written by the download step
CYTOPLASM_PROTEINS_FILE = "1.downloadProteins/proteinsCytoplasmic.tab"
PERIPLASM_PROTEINS_FILE = "1.downloadProteins/proteinsPeriplasmic.tab"

# Outputs: one freshly created directory per compartment
CYTODIR = makeDir("2.exctractCommonOrganismCytoplasm")
PERIDIR = makeDir("2.exctractCommonOrganismPeriplasm")

# Hand each tab file and its target directory to the FASTA generator
generateFastas.generateFastas(
    CYTOPLASM_PROTEINS_FILE,
    CYTODIR,
    PERIPLASM_PROTEINS_FILE,
    PERIDIR,
)

# Snapshot both result directories
CYTODIR_BACKUP = makeBackup(CYTODIR)
PERIDIR_BACKUP = makeBackup(PERIDIR)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




# 3 Use Blast to search for Twins

## 3.1 Generate Databases

Blast searches for sequences against a database.
Therefore, a script will turn the FASTA files of the periplasmic proteins into BLAST databases.

In [4]:
# import
import generateDatabases

# Fresh directory that will hold the periplasm databases
DIR_PERIPLASM_DATABASE = makeDir("3.1.periplasmDatabase")

# Build the databases from the periplasm FASTA directory
generateDatabases.generateDatabases(
    PERIDIR,
    DIR_PERIPLASM_DATABASE,
)

# Snapshot the database directory
DIR_PERIPLASM_DATABASE_BACKUP = makeBackup(DIR_PERIPLASM_DATABASE)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




## 3.2 Run Blast to find twins

In [5]:
# import
import runBlast

# E-value passed to runBlast
EVALUE = 1e-8

# Fresh directory for the BLAST output
DIR_BLAST_RESULTS = makeDir("3.2.blastResults")

# Run BLAST with the cytoplasm FASTA directory and the periplasm databases
runBlast.runBlast(
    CYTODIR,
    DIR_PERIPLASM_DATABASE,
    DIR_BLAST_RESULTS,
    evalue=EVALUE,
)

# Snapshot the BLAST results
DIR_BLAST_RESULTS_BACKUP = makeBackup(DIR_BLAST_RESULTS)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




## 3.3 Extract Twins from Blast Results

In [6]:
# import
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
import importlib
import getTwins
# Reload so that edits to getTwins.py take effect without a kernel restart
importlib.reload(getTwins)

# make dir for results
DIR_TWINS=makeDir("3.3.twins")

# getTwins
getTwins.getTwins(DIR_BLAST_RESULTS, DIR_TWINS)

# make backup
DIR_TWINS_BACKUP=makeBackup(DIR_TWINS)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




# 4 Generate MSA (fast way)

## 4.1 Generate lists

Make sure to provide a maximum number of entries per list, as the mapping system cannot handle lists that are too large.

In [7]:
# import
import generateLists4mapping

# Upper bound on the number of IDs written to one list file.
# NOTE(review): 1e3 is a float, not an int - confirm that
# generateLists4mapping accepts a non-integer maxLength.
MAX_ID_PER_FILE = 1e3

# Fresh directory for the ID lists
DIR_LISTS = makeDir("4.1.listsToMap")

# Write the lists of IDs to map from the twins directory
generateLists4mapping.generateLists4mapping(
    DIR_TWINS,
    DIR_LISTS,
    maxLength=MAX_ID_PER_FILE,
)

# Snapshot the list directory
DIR_LISTS_BACKUP = makeBackup(DIR_LISTS)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




## 4.2 Map proteins to UniRef groups

To generate a MSA in a very fast way, we try to avoid BLAST.
Therefore we use the predefined Uniref groups.
This will result in a less extensive MSA, but it can be run for a lot of proteins.

In [8]:
# import
import mapUniprot2Uniref

# Fresh directory for the UniProt -> UniRef mapping output
DIR_MAP_UNIPROT_2_UNIREF = makeDir("4.2.mapUniprot2Uniref")

# Map the ID lists to UniRef groups
mapUniprot2Uniref.mapUniprot2Uniref(
    DIR_LISTS,
    DIR_MAP_UNIPROT_2_UNIREF,
)

# Snapshot the mapping output
DIR_MAP_UNIPROT_2_UNIREF_BACKUP = makeBackup(DIR_MAP_UNIPROT_2_UNIREF)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




## 4.3 Map Uniref groups back to the proteins they contain

In [9]:
# import 
import mapUniref2Uniprot

# Fresh directory for the UniRef -> UniProt mapping output
DIR_MAP_UNIREF_2_UNIPROT = makeDir("4.3.mapUniRef2UniProt")

# Map the UniRef groups back to their member proteins
mapUniref2Uniprot.mapUniref2Uniprot(
    DIR_MAP_UNIPROT_2_UNIREF,
    DIR_MAP_UNIREF_2_UNIPROT,
)

# Snapshot the mapping output
DIR_MAP_UNIREF_2_UNIPROT_BACKUP = makeBackup(DIR_MAP_UNIREF_2_UNIPROT)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




## 4.4 Filter proteins and download as tab with sequence in it

We want to make sure there is evidence that the periplasmic protein homologues occur in the periplasm and the cytoplasmic ones in the cytoplasm.
To achieve this, another UniProt retrieve search is performed using the `yourlist:ID` query.

In [10]:
# import 
import filterAndDownload

# Fresh directory for the filtered tab files
DIR_FILTERED = makeDir("4.4.filteredFiles")

# Short alias for the filter-and-download entry point, then run it on the
# UniRef -> UniProt mapping output
func = filterAndDownload.filterAndDownload
func(
    DIR_MAP_UNIREF_2_UNIPROT,
    DIR_FILTERED,
)

# Snapshot the filtered files
DIR_FILTERED_BACKUP = makeBackup(DIR_FILTERED)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




## 4.5 Extract fasta files 

In [11]:
# import
import extractFastas

# Fresh directory for the extracted FASTA files
DIR_FASTAS = makeDir("4.5.fastas")

# Short alias for the extractor, then run it on the filtered tab files
# together with both mapping directories
fun = extractFastas.extractFastas
fun(
    DIR_FILTERED,
    DIR_FASTAS,
    DIR_MAP_UNIPROT_2_UNIREF,
    DIR_MAP_UNIREF_2_UNIPROT,
)

# Snapshot the FASTA directory
DIR_FASTAS_BACKUP = makeBackup(DIR_FASTAS)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




## 4.6 CDHIT

In [12]:
# import
import runCDHIT

# Fresh directory for the CD-HIT output
DIR_CDHIT = makeDir("4.6.CDHIT")

# Identity value handed to runCDHIT
IDENTITY = 0.90

# Short alias for the CD-HIT runner, then run it on the FASTA directory
func = runCDHIT.runCDHIT
func(
    DIR_FASTAS,
    DIR_CDHIT,
    identity=IDENTITY,
)

# Snapshot the CD-HIT output
DIR_CDHIT_BACKUP = makeBackup(DIR_CDHIT)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




### 4.6.1 Add self Sequence to CDHIT

In [1]:
from uniprotRetrieve import uniprotRetrieve
import os
from tqdm import tqdm

def addSelfSequence(file):
    """Prepend the sequence of a FASTA file's own entry to that file.

    The entry ID is taken from the file name: the text before the first "."
    in the last "/"-separated path component. Its sequence is fetched with
    ``uniprotRetrieve`` and written as the first record of ``file``, followed
    by the file's previous content.

    Parameters
    ----------
    file : str
        Path to the FASTA file to update in place.

    Raises
    ------
    ValueError
        If the retrieved table has no usable "id<TAB>sequence" row for the
        entry.
    """
    Id = file.split("/")[-1].split(".")[0]

    query = "id:{}".format(Id)
    # NOTE(review): the scratch path is hard-coded, so the "4.6.CDHIT"
    # directory must exist relative to the working directory.
    tmp = "4.6.CDHIT/tmp.csv"

    # Retrieve the sequence into a temporary file and read the first data row.
    # The temporary file is removed even when retrieval or parsing fails.
    try:
        uniprotRetrieve(tmp, format="tab", query=query, columns="id,sequence")
        with open(tmp) as f:
            f.readline()  # first line is discarded (presumably the column header)
            fields = f.readline().strip().split("\t")
    finally:
        if os.path.exists(tmp):
            os.remove(tmp)

    if len(fields) != 2:
        raise ValueError(
            "No id/sequence row retrieved for query {!r}".format(query)
        )
    # The ID reported in the retrieved row is the one written to the header
    Id, seq = fields

    # Read the existing records before the file is truncated for rewriting
    with open(file) as f:
        lines = f.readlines()

    # Rewrite: own sequence first, then the previous content
    with open(file, "w") as f:
        f.write(">{}\n".format(Id))
        f.write("{}\n".format(seq))
        f.writelines(lines)

# Apply addSelfSequence to every .fasta file found in the CD-HIT directory
DIR = "4.6.CDHIT"
for FILE in tqdm([entry for entry in os.listdir(DIR) if entry.endswith(".fasta")]):
    FILE_PATH = "/".join((DIR, FILE))
    addSelfSequence(FILE_PATH)

100%|██████████| 6/6 [00:03<00:00,  1.75it/s]


## 4.7 ClustalOmega

In [3]:
# import
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
import runClustalOmega
import importlib
# Reload so that edits to runClustalOmega.py take effect without a kernel restart
importlib.reload(runClustalOmega)

# Input directory, set explicitly so this cell does not need the CD-HIT cell
# to have run in the current kernel session
DIR_CDHIT="4.6.CDHIT"

# make dir
DIR_CLUSTALO=makeDir("4.7.clustalOmega")

# Run
func=runClustalOmega.runClustalOmega
func(DIR_CDHIT,DIR_CLUSTALO)

# make Backup
DIR_CLUSTALO_BACKUP = makeBackup(DIR_CLUSTALO)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


