In [1]:
# Copyright 2019 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Analysis of PPV_ATP_composite_nonbacteria_list
# Author: Hyunjin Shim
# Date created: 20191217
# Email: jinenstar@gmail.com

## Composite proteins of ATPase annotations from PPV
- Total ATPase from PPV: 1003
- Bacteria: 441
- Nonbacteria (from contigs annotated <50% bacteria -> higher chance of mobilome): 562

In [3]:
# Data
import os
import pandas as pd
import numpy as np
import csv

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Regular expression
import re

# Plot
from pylab import *
import pylab as pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import matplotlib.patches as mpatches
%matplotlib inline
import matplotlib.pyplot as plt

In [56]:
# Location of the ATPase ID list (user-specific absolute path; adjust for your machine).
datapath = '/Users/jinenstar/Desktop/Pro_ATP/'
os.chdir(datapath)

# Text file with one protein ID per line.
file_in ='PPV_ATP_composite_nonbacteria_list.txt'

# Read the IDs as plain lines. The previous csv.reader(f, delimiter="\n") pushed a
# one-column list through the CSV parser, which applies quote handling to the IDs
# and turns blank lines into rows holding None. Surrounding whitespace is stripped
# and blank lines are skipped so every row is a usable ID.
with open(file_in) as f:
    atp_ids = [line.strip() for line in f if line.strip()]

# Same layout as before: a single column labelled 0, one row per ID.
dat_ATP = pd.DataFrame({0: atp_ids})

In [61]:
# First ID in the ATPase list (row label 0, column label 0)
dat_ATP.loc[0, 0]

'BML_08182015_6_5m_scaffold_1_346|BML_08182015_6_5m_Completed_Jumbo_Phage_40_15|BML_08182015_6_5m'

In [9]:
# Switch to the directory that holds the protein FASTA file
datapath = '/Users/jinenstar/Desktop/Data/Progress'
os.chdir(datapath)

# Protein sequences to be parsed with Biopython
file_in ='phage-plasmid-virus-protein-families.proteins.faa'

with open(file_in) as fasta_file:  # handle is closed when the block exits
    # Parallel lists: entry i of each list belongs to the i-th FASTA record
    all_ID, all_sequence, all_length, all_des = [], [], [], []
    # seq_record stays bound to the last record once the loop has finished
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        all_ID.append(seq_record.id)
        all_des.append(seq_record.description)
        all_sequence.append(seq_record.seq)
        all_length.append(len(seq_record.seq))

# Longest and shortest sequence lengths over all records
max_length, min_length = max(all_length), min(all_length)

In [10]:
# seq_record is the variable left over from the FASTA parsing loop,
# so this prints the last record that was read from the file.
print(seq_record)

ID: C1_074_025G2_scaffold_104_54|C1_074_025G2_Ecoli_41_128|C1_074_025G2
Name: C1_074_025G2_scaffold_104_54|C1_074_025G2_Ecoli_41_128|C1_074_025G2
Description: C1_074_025G2_scaffold_104_54|C1_074_025G2_Ecoli_41_128|C1_074_025G2 # 43775 # 44248 # -1
Number of features: 0
Seq('FVGVDRSVAYKQMKDAADFFSRNKKLITHCDYISNEGLLRVMFSSKTINYITAI...MR*', SingleLetterAlphabet())


In [11]:
# Shortest sequence: its length, its position in the parallel lists, and its ID.
# list.index returns the first position holding that length.
shortest_pos = all_length.index(min_length)
print(min_length, shortest_pos, sep='\n')
all_ID[shortest_pos]

20
4286


'93_004_scaffold_569_5|93_004_Viruses_35_44|93_004'

In [12]:
# Longest sequence: its length, its position in the parallel lists, and its ID.
# list.index returns the first position holding that length.
longest_pos = all_length.index(max_length)
print(max_length, longest_pos, sep='\n')
all_ID[longest_pos]

11904
31368


'RTP_09252017_15_scaffold_7_116|RTP_09252017_15_Complete_588kb_Eukaryotes_Virus_29_22|RTP_09252017_15'

In [13]:
# Show the last ten record IDs collected from the FASTA file
all_ID[-10:]

['C1_074_025G2_scaffold_104_45|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_46|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_47|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_48|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_49|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_50|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_51|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_52|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_53|C1_074_025G2_Ecoli_41_128|C1_074_025G2',
 'C1_074_025G2_scaffold_104_54|C1_074_025G2_Ecoli_41_128|C1_074_025G2']

In [14]:
# Show the last ten sequences; same positions as the last ten entries of all_ID
all_sequence[-10:]

[Seq('MSGRITTLCTAFGVVIAAVGLYLPYKNELNAALYQREFLTGKWSTDAEYIINSG...HD*', SingleLetterAlphabet()),
 Seq('MRDKTREAMRMFLGGRCYTAENLERDFLSELTRYSDERWEAPQRGARLAAAVKR...ES*', SingleLetterAlphabet()),
 Seq('VSDNKNSDHKKILDIFSDNAFLSMMNERQGRESRARGSGLNQPFTHNGTRKDTF...RV*', SingleLetterAlphabet()),
 Seq('MTSNAPTNFVRAYLRASTAEQDASRALETIEAFARERGLIICSYYIENESGSRL...SK*', SingleLetterAlphabet()),
 Seq('MGKILLVVSDKGGVGKSTYVANTGSMLVNKGKSVIILKTDKNHDLLSWNEKRTD...IL*', SingleLetterAlphabet()),
 Seq('MHTVQGEIVMALKLNRPNIDESQQPATNAETARFISGATRAPVVGKPKLMNFRL...KF*', SingleLetterAlphabet()),
 Seq('MRYLTELICQLYRLTDHHKLTAATFKNLADIKLVEPTADADTILQLENIFSEYS...DS*', SingleLetterAlphabet()),
 Seq('MKYQVREFITDKYAKAVNILKDNLKENYHVFYGVRLSEILFPASEYGTDAFFKE...KS*', SingleLetterAlphabet()),
 Seq('MAKIYQFPQGDERGKFREEIARERKKRFAVKTGSTFVKWLGWTWFYLRLLVASV...SQ*', SingleLetterAlphabet()),
 Seq('FVGVDRSVAYKQMKDAADFFSRNKKLITHCDYISNEGLLRVMFSSKTINYITAI...MR*', SingleLetterAlphabet())]

In [38]:
# Combine the parallel ID and sequence lists into one table, one row per FASTA record
d = dict(ID=all_ID, Seq=all_sequence)
all_data = pd.DataFrame(data=d)

In [62]:
# First ID in the ATPase list (row label 0, column label 0)
dat_ATP.loc[0, 0]

'BML_08182015_6_5m_scaffold_1_346|BML_08182015_6_5m_Completed_Jumbo_Phage_40_15|BML_08182015_6_5m'

In [79]:
# Pull out the ATPase sequences: the rows of all_data whose ID appears in dat_ATP.
#
# This used to call DataFrame.append once per ID. That method was removed in
# pandas 2.0, and it re-copied the growing frame on every iteration on top of a
# full scan of all_data per ID. One inner merge selects the same rows:
#   * rows come out in the order the IDs are listed in dat_ATP,
#   * an ID listed twice is returned twice, an ID with no match is skipped,
#   * the original all_data row labels are kept as the index.
# Missing IDs (None/NaN) are dropped first because merge would otherwise pair
# them with any missing ID in all_data; the old == comparison never matched them.
atp_ids = dat_ATP[0].dropna().tolist() if len(dat_ATP) else []
atp_query = pd.DataFrame({'ID': pd.Series(atp_ids, dtype=object)})

seq_ATP = (
    atp_query
    # carry the row labels of all_data through the merge as a temporary column
    .merge(all_data.rename_axis('_row').reset_index(), on='ID', how='inner')
    .set_index('_row')
    .rename_axis(None)
)



In [81]:
# Number of rows pulled out of all_data for the ATPase IDs
len(seq_ATP)

562

In [83]:
# Sequences of the selected ATPase rows; the index shows each row's label in all_data
seq_ATP['Seq']

313108    (M, A, V, T, E, T, R, T, V, T, P, E, E, A, R, ...
54371     (L, P, D, K, P, Y, I, N, S, K, V, F, V, D, T, ...
313930    (M, S, V, E, H, R, T, V, T, A, V, G, A, R, K, ...
872       (M, A, S, S, A, P, S, A, P, T, P, A, S, A, P, ...
318280    (M, S, V, E, H, R, T, V, T, A, V, G, A, R, K, ...
862692    (L, T, T, A, F, R, D, A, V, F, G, F, D, N, D, ...
30129     (M, D, N, L, A, N, S, L, P, N, V, M, T, S, Q, ...
30231     (M, D, G, M, T, G, V, S, G, F, G, N, F, N, M, ...
323772    (M, S, H, E, T, R, T, V, T, A, I, G, A, R, R, ...
323175    (M, S, H, E, T, R, T, V, T, A, I, G, A, R, R, ...
322366    (M, D, N, L, A, N, S, L, P, N, V, M, T, S, Q, ...
322468    (M, D, G, M, T, G, V, S, G, F, G, N, F, N, M, ...
315165    (M, S, V, E, H, R, T, V, T, A, V, G, A, R, K, ...
320409    (M, S, N, I, N, N, S, I, V, W, T, C, D, N, N, ...
314186    (M, S, V, E, H, R, T, V, T, A, V, G, A, R, K, ...
319085    (M, S, N, I, N, N, S, I, V, W, T, C, D, N, N, ...
321779    (M, S, V, E, H, R, T, V, T, A,