# 1.uniprotRetrieve

In [1]:
from uniprotRetrieve import uniprotRetrieve
import pandas as pd

Based on this [API](https://www.uniprot.org/help/api%5Fqueries).

Column names are listed [here](https://www.uniprot.org/help/uniprotkb_column_names).

## Main parameters

* **fileName**: Name of the downloaded file
* **query**: Query as you would use it on the [uniprot site](https://www.uniprot.org/)
* **format**: Format of file you want to download (html | tab | xls | fasta | gff | txt | xml | rdf | list | rss)

### Example 1.1: Download the list of all known proteins of the genus betacoronavirus

In [2]:
# Fetch the list of proteins of all viruses in the genus betacoronavirus
fileName = "betacoronavirus.list"
QUERY = "taxonomy:betacoronavirus"
FORMAT = "list"

# The returned value is opened as a file path in the next cell
betacoronavirusList = uniprotRetrieve(
    fileName,
    query=QUERY,
    format=FORMAT,
)

In [3]:
# Number of proteins = number of lines in the downloaded list file
with open(betacoronavirusList) as listFile:
    # Stream the file line by line instead of loading every line into memory
    print(sum(1 for _ in listFile))

9603


### Example 1.2 Limit results to those having an experimental structure available

In [4]:
# Same taxonomy query, restricted to entries that have a PDB cross-reference
fileName = "betacoronavirusStructure.list"
QUERY = "taxonomy:betacoronavirus database:(type:pdb)"
FORMAT = "list"

betacoronavirusStructureList = uniprotRetrieve(
    fileName,
    query=QUERY,
    format=FORMAT,
)

In [5]:
# Number of proteins = number of lines in the structure-restricted list file
with open(betacoronavirusStructureList) as listFile:
    # Stream the file line by line instead of loading every line into memory
    print(sum(1 for _ in listFile))

69


### Example 1.3 Download fasta file of these sequences

In [6]:
# Same PDB-restricted query as Example 1.2, downloaded in FASTA format
fileName = "betacoronavirusStructure.fasta"
QUERY = "taxonomy:betacoronavirus database:(type:pdb)"
FORMAT = "fasta"

betacoronavirusStructureFasta = uniprotRetrieve(
    fileName,
    query=QUERY,
    format=FORMAT,
)

In [7]:
# Number of proteins = number of FASTA header lines (those starting with ">")
with open(betacoronavirusStructureFasta) as fastaFile:
    # Each True counts as 1, so the sum is the number of header lines
    print(sum(line.startswith(">") for line in fastaFile))

69


### Example 1.4 Download Tab file with id, gene name, protein name, Organism, PDB identifiers

In [8]:
# Same PDB-restricted query, downloaded as a tab-separated table that
# contains only the columns named in COLUMNS
fileName = "betacoronavirusStructure.tab"
QUERY = "taxonomy:betacoronavirus database:(type:pdb)"
FORMAT = "tab"
COLUMNS = "id,genes,protein names,organism,database(pdb)"  # no spaces after ','

betacoronavirusStructureTab = uniprotRetrieve(
    fileName,
    query=QUERY,
    format=FORMAT,
    columns=COLUMNS,
)

In [9]:
# Load the tab-separated download into a DataFrame; as the last expression
# of the cell it is rendered as a table
pd.read_csv(
    betacoronavirusStructureTab,
    sep="\t",
)

Unnamed: 0,Entry,Gene names,Protein names,Organism,Cross-reference (pdb)
0,A3EX94,S 2,Spike glycoprotein (S glycoprotein) (E2) (Pepl...,Bat coronavirus HKU4 (BtCoV) (BtCoV/HKU4/2004),4QZV;
1,P59595,N 9a,Nucleoprotein (Nucleocapsid protein) (NC) (Pro...,Human SARS coronavirus (SARS-CoV) (Severe acut...,1SSK;1X7Q;2CJR;2GIB;2JW8;2OFZ;2OG3;3I6L;
2,P33469,N 7a,Nucleoprotein (Nucleocapsid protein) (NC) (Pro...,Human coronavirus OC43 (HCoV-OC43),4J3K;4KXJ;4LI4;4LM7;4LM9;4LMC;4LMT;
3,P0C6U8,1a,Replicase polyprotein 1a (pp1a) (ORF1a polypro...,Human SARS coronavirus (SARS-CoV) (Severe acut...,1P76;1P9T;1PA5;1PUK;1Q1X;1Q2W;1QZ8;1UJ1;1UK2;1...
4,P19738,2a,Non-structural protein 2a (ns2a) (30 kDa acces...,Murine coronavirus (strain A59) (MHV-A59) (Mur...,4Z5V;
...,...,...,...,...,...
64,W6A0A7,S,Spike glycoprotein (S glycoprotein) (E2) (Pepl...,Middle East respiratory syndrome-related coron...,4ZS6;
65,Q19QX1,,Nonstructural polyprotein pp1a,Human SARS coronavirus (SARS-CoV) (Severe acut...,2FAV;
66,Q9J3E8,ORF1B,RNA-directed RNA polymerase,Murine hepatitis virus,2GTH;2GTI;
67,A0A0D3MU51,N,Nucleoprotein (Nucleocapsid protein) (NC) (Pro...,Middle East respiratory syndrome-related coron...,6G13;


# 2.Blast

API for BLAST from uniprot ([https://www.uniprot.org/blast/](https://www.uniprot.org/blast/)).

In [10]:
from blast import blast

### Example 2.1 Download UniRef90 blast results for P0DTC7

In [11]:
# Query sequence in FASTA format (UniProt entry P0DTC7)
sequence = """
>sp|P0DTC7|NS7A_WCPV Protein 7a OS=Wuhan seafood market pneumonia virus OX=2697049 GN=7a PE=3 SV=1
MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNSPFHPLADNKFALTCFS
TQFAFACPDGVKHVYQLRARSVSPKLFIRQEEVQELYSPIFLIVAAIVFITLCFTLKRKT
E
"""

# Search settings handed to blast() as keyword arguments
FILENAME = "P0DTC7UniRef90.list"
DATABASE = "UniRef90"
EVALUE = 0.0001
HITS = 1000
FORMAT = "list"

# The returned value is the input of uniprotGroupId in section 3
P0DTC7UniRef90List = blast(
    sequence,
    fileName=FILENAME,
    database=DATABASE,
    eValue=EVALUE,
    hits=HITS,
    format=FORMAT,
)

# 3.uniprotGroupId

API from [Retrieve/ID mapping](https://www.uniprot.org/uploadlists/) UniProt.

In [12]:
from uniprotGroupId import uniprotGroupId

### Example 3.1 retrieve all proteins corresponding to the UniRef90 groups

In [13]:
# Submit the UniRef90 BLAST result list to the ID-mapping helper
PROTEINLIST = P0DTC7UniRef90List
DATABASEFROM = "UniRef90"

GROUPID = uniprotGroupId(
    PROTEINLIST,
    databaseFrom=DATABASEFROM,
)
# Display the returned identifier (a "yourlist:..." string)
GROUPID

'yourlist:M20200325A94466D2655679D1FD8953E075198DA818E269H'

In [14]:
# Download, in FASTA format, the proteins of the list identified by GROUPID,
# keeping only sequences whose length is between 110 and 130.
fileName = "P0DTC7UniRef90Expaned.fasta"
# Build the query from GROUPID (already of the form "yourlist:<id>") instead
# of pasting an identifier from an earlier session: the pasted id did not
# match the one returned by uniprotGroupId, so the cell ignored the value
# just computed and could not be reproduced on a fresh run.
QUERY = GROUPID + " length:[110 TO 130]"
FORMAT = "fasta"

P0DTC7UniRef90ExpanedFasta = uniprotRetrieve(fileName, query=QUERY, format=FORMAT)

# 4.cdhit 

API made for this [server](http://weizhong-lab.ucsd.edu/cdhit-web-server/cgi-bin/index.cgi?cmd=cd-hit).

In [15]:
from cdhit import cdhit

### Example 4.1 make clusters of fasta file

In [16]:
# Cluster the sequences downloaded in Example 3.1; CUTOFF is forwarded to
# cdhit() as its `cutoff` argument
FASTA = P0DTC7UniRef90ExpanedFasta
CUTOFF = 0.8

# cdhit() returns two values, unpacked here into the FASTA and cluster results
cdHitFasta, cdHitClstr = cdhit(
    FASTA,
    cutoff=CUTOFF,
)

# 5. clustalomega

API made for this [server](https://www.ebi.ac.uk/Tools/msa/clustalo/)

In [17]:
from clustalOmega import clustalOmega

### Example 5.1 generate multiple sequence alignment 

In [18]:
# Align the cd-hit FASTA output from Example 4.1 with Clustal Omega
FASTA = cdHitFasta
OUTPUTFORMAT = "Pearson/FASTA"

# clustalOmega() returns two values, unpacked here
clustalOmegaFasta, clustalOmegaPim = clustalOmega(
    FASTA,
    outputFormat=OUTPUTFORMAT,
)