# Protein Similarity Matrix (ProSiM)

BLAST+ on Spark



In [1]:
# imports
from pyspark import SparkContext, SparkConf
import pprint
import re
import os
import subprocess
import shutil
import shlex
from hdfs import InsecureClient

In [2]:
# initialize Spark
# Only build a new context when this session has no `sc` yet; the new
# context is configured with 'yarn' as its master.
if 'sc' not in globals():
    conf = SparkConf().setMaster('yarn')
    sc = SparkContext(conf=conf)

# Applied on every run of the cell, whether `sc` was just created or reused.
sc.setLogLevel("INFO")

In [3]:
# configuration
# Keyword arguments handed to mysql.connector.connect() when exporting the
# protein table from the `chembl_22` database.
# NOTE(review): the credentials are stored in plain text in this cell.
mySqlConfig = dict(
    user='joseph',
    password='password',
    host='192.168.151.11',
    database='chembl_22',
)

# HDFS client (from the `hdfs` package) used to check for and upload the TSV.
hdfsClient = InsecureClient("http://hadoop1:50070")

In [4]:
# Locations of the protein data set used by the cells below.
fastaFile = "/home/hduser/Lab/prosim/proteins.fasta"   # local FASTA file / BLAST db input
hdfsProteinsTsv = "/user/hduser/prosim/proteins.tsv"   # HDFS path the TSV is uploaded to
localProteinsTsv = "proteins.tsv"                      # local TSV export of the query result

In [5]:
# Export the protein rows with tax_id 9606 from MySQL into a local TSV file
# (columns: row_id, accession, sequence, pref_name, short_name), unless the
# file is already present.
if not os.path.isfile(localProteinsTsv):
    # mysql.connector and errorcode are used below but are not part of the
    # imports cell, so bring them into scope here.
    import mysql.connector
    from mysql.connector import errorcode

    query = ("SELECT @rownum := @rownum + 1 AS row_id, "
             "     cs.accession, cs.sequence, "
             "     pc.pref_name pref_name,"
             "     pc.short_name "
             "FROM (select @rownum := 0) r, "
             "     component_sequences cs, "
             "     component_class cc, "
             "     protein_classification pc "
             "WHERE cs.component_id = cc.component_id AND "
             "      cc.protein_class_id = pc.protein_class_id AND "
             "      cs.tax_id = 9606;")

    # Rows are written to a scratch file that is renamed only after the whole
    # result set has been written. A failed export therefore never leaves a
    # partial TSV behind, which the isfile() guard above would otherwise
    # mistake for a finished export on the next run.
    partialTsv = localProteinsTsv + ".part"
    cnx = None
    try:
        cnx = mysql.connector.connect(**mySqlConfig)
        cursor = cnx.cursor()
        cursor.execute(query)

        with open(partialTsv, 'w') as tsv:
            for (row_id, accession, sequence, pref_name, short_name) in cursor:
                tsv.write("%s\t%s\t%s\t%s\t%s\n" %
                          (row_id, accession, sequence, pref_name, short_name))

        os.rename(partialTsv, localProteinsTsv)

    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(err)
    finally:
        # Release the connection on every path, not only after a clean run.
        if cnx is not None:
            cnx.close()
        # Still present only when the export did not reach the rename.
        if os.path.isfile(partialTsv):
            os.remove(partialTsv)

In [6]:
# save file in hdfs
# Upload the local TSV only when the content lookup for the HDFS path comes
# back as None (strict=False is passed so a missing path is reported that way).
if hdfsClient.content(hdfsProteinsTsv, strict=False) is None:
    hdfsClient.upload(hdfs_path=hdfsProteinsTsv, local_path=localProteinsTsv)

In [7]:
# Read the TSV from HDFS line by line, then turn each line into the list of
# its tab-separated fields.
rawdata = sc.textFile(hdfsProteinsTsv)
rawdata = rawdata.map(lambda record: record.split("\t"))

In [8]:
# manipulate raw data rdd and create FASTA file
def _asFastaEntry(pair):
    """Render one (accession, sequence) pair as a FASTA record string.

    The header line is ">ebl|<accession>|"; the sequence follows, cut into
    pieces of at most 80 characters. All lines are joined with CRLF.
    """
    accession, sequence = pair
    pieces = re.findall(".{1,80}", sequence)
    return str(">ebl|" + accession + "|\r\n" + "\r\n".join(pieces))


# Fields 1 and 2 of each TSV row are the accession and the sequence; duplicate
# pairs are dropped before formatting.
fastaRdd = rawdata.map(lambda fields: (fields[1], fields[2])) \
                  .distinct() \
                  .map(_asFastaEntry)

In [9]:
# Materialise the FASTA record strings of the RDD as a local Python list.
proteinList = fastaRdd.collect()

In [10]:
# Write every FASTA record to the local FASTA file, each followed by CRLF.
# The `with` block closes the file even when a write fails part-way through.
with open(fastaFile, 'w') as fFile:
    for item in proteinList:
        fFile.write("%s\r\n" % item)

In [35]:
# create local blast db
# The argument list is built directly rather than formatting a string and
# re-splitting it with shlex, so a file name containing spaces or quotes still
# reaches makeblastdb as a single argument.
makeblastdbCmd = ["makeblastdb",
                  "-in", os.path.basename(fastaFile),
                  "-parse_seqids",
                  "-dbtype", "prot"]

# Run from the FASTA file's directory, since only its base name is passed.
process = subprocess.Popen(makeblastdbCmd,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           cwd=os.path.dirname(fastaFile),
                           shell=False)

out, err = process.communicate()

print(out)
print(err)

# Stop here on failure instead of continuing towards the BLAST step
# without a usable database.
if process.returncode != 0:
    raise subprocess.CalledProcessError(process.returncode, makeblastdbCmd, out)



Building a new DB, current time: 02/09/2017 21:31:10
New DB name:   /home/hduser/Lab/prosim/proteins.fasta
New DB title:  proteins.fasta
Sequence type: Protein
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 3130 sequences in 0.0848458 seconds.




## BLAST

Using the Biopython library (http://biopython.org/DIST/docs/tutorial/Tutorial.html)

In [36]:
from Bio.Blast.Applications import NcbiblastxCommandline
from Bio.Blast import NCBIXML

In [45]:
# Distinct (accession, sequence) pairs taken from fields 1 and 2 of each row.
proteinSeqs = rawdata.map(lambda fields: (fields[1], fields[2]))
proteinSeqs = proteinSeqs.distinct()
# Last expression of the cell: the number of distinct pairs.
proteinSeqs.count()

3130

In [46]:
# Run blastp once per protein against the local BLAST database built from
# fastaFile, writing one XML report (outfmt 5) per query accession.

# Use the blastp wrapper for a blastp run; the blastx wrapper imported by the
# notebook is meant for a different program.
from Bio.Blast.Applications import NcbiblastpCommandline

# The reports are written into this directory, so make sure it exists before
# the first query is run.
simResultDir = "/home/hduser/Lab/prosim/simresult"
if not os.path.isdir(simResultDir):
    os.makedirs(simResultDir)

# NOTE: proteinSeqs is rebound here from an RDD to a plain list of
# (accession, sequence) tuples.
proteinSeqs = proteinSeqs.collect()
for (query_acc, query_seq) in proteinSeqs:
    outFile = os.path.join(simResultDir, query_acc + ".xml")
    blastp_cline = NcbiblastpCommandline(cmd="blastp",
                                         db=fastaFile,
                                         evalue=0.05,
                                         outfmt=5,
                                         remote=False,
                                         out=outFile)
    # The query sequence is fed to blastp on standard input.
    (out, err) = blastp_cline(stdin=query_seq)
    print("%s, out: %s, err: %s" % (query_acc, out, err))
    

O75909, out: , err: 
P50993, out: , err: 
P23946, out: , err: 
P04233, out: , err: 
P62942, out: , err: 
O95180, out: , err: 
Q9HBH9, out: , err: 
P55268, out: , err: 
Q52WX2, out: , err: 
P29074, out: , err: 
P11586, out: , err: 
P53778, out: , err: 
P06133, out: , err: 
Q11201, out: , err: 
O00204, out: , err: 
Q8NG68, out: , err: 
Q9Y4L1, out: , err: 
P08246, out: , err: 
P29274, out: , err: 
O60706, out: , err: 
P15692, out: , err: 
P31040, out: , err: 
Q92847, out: , err: 
P43005, out: , err: 
Q99816, out: , err: 
Q14694, out: , err: 
Q9UBU3, out: , err: 
Q92835, out: , err: 
O15229, out: , err: 
O15228, out: , err: 
P60903, out: , err: 
Q15208, out: , err: 
Q13332, out: , err: 
P19404, out: , err: 
Q8TCW9, out: , err: 
Q16772, out: , err: 
Q9NQS7, out: , err: 
Q99808, out: , err: 
Q9C000, out: , err: 
O43772, out: , err: 
O75493, out: , err: 
Q5NUL3, out: , err: 
Q15349, out: , err: 
O75914, out: , err: 
Q9H3N8, out: , err: 
Q9UBF8, out: , err: 
Q8NER5, out: , err: 
P04183, out: 