In [1]:
# Copyright 2019 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Process results from Composite Search of PPV_ATPase data
# Author: Hyunjin Shim
# Date created: 20191003
# Email: jinenstar@gmail.com

# Dataset description (downloaded on 20190830)
Original Data
- Organisms: 7,029
- Contigs: 26,331
- Features: 938,182

ATPase annotated
- Total: 5,497
- Virus: 361
- Phage: 2,654
- Plasmid: 2,216
- Other: 266

# Composite proteins of ATPase annotations from PPV

- to analyze proteins with ATPase annotations from https://ggkbase.berkeley.edu/ after Composite Search to find fusion proteins (database as PPV and NCBI_nr)
- to prioritize composite proteins of interest

In [1]:
# Data
import os
import pandas as pd
import numpy as np
import csv

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Regular expression
import re

# Plot
from pylab import *
import pylab as pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import matplotlib.patches as mpatches
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# location of raw data file
datapath = '/Users/jinenstar/Desktop/Pro_ATP/'
os.chdir(datapath)

# open dictionary (geneID and ggKbase names) from cleanNetwork
file_in ='PPV_ATP.out.cleanNetwork.dico'

# Parse the tab-separated dictionary a single time: `all_dat` keeps the raw
# rows (list of lists) and `all_dat_df` is the same rows as a DataFrame.
# Building the frame from `all_dat` avoids reading the file a second time.
with open(file_in) as f:
    all_dat = list(csv.reader(f, delimiter="\t"))

all_dat_df = pd.DataFrame(all_dat)

# quick look at the first ten rows
print(all_dat_df[:10])

                                                   0        1
0  0106W_scaffold_128_8|0106W_Staphylococcus_plas...  1427709
1  0125A1030_scaffold_106_5|0125A1030_Klebsiella_...  1426845
2  0205A1024_scaffold_1808_1|0205A1024_Enterobact...  1428416
3  0205A1024_scaffold_1808_4|0205A1024_Enterobact...  1424765
4  0205A1024_scaffold_1808_5|0205A1024_Enterobact...   946011
5  0205A1024_scaffold_28_21|0205A1024_Enterobacte...   942202
6  0205A1024_scaffold_407_13|0205A1024_Enterobact...   942203
7  0212A1031_scaffold_120_10|0212A1031_Clostridiu...  1424872
8  0212A1031_scaffold_160_19|0212A1031_Staphyloco...  1424052
9  0212A1031_scaffold_160_3|0212A1031_Staphylococ...  1424769


In [3]:
all_dat_df
#len(all_dat)

Unnamed: 0,0,1
0,0106W_scaffold_128_8|0106W_Staphylococcus_plas...,1427709
1,0125A1030_scaffold_106_5|0125A1030_Klebsiella_...,1426845
2,0205A1024_scaffold_1808_1|0205A1024_Enterobact...,1428416
3,0205A1024_scaffold_1808_4|0205A1024_Enterobact...,1424765
4,0205A1024_scaffold_1808_5|0205A1024_Enterobact...,946011
5,0205A1024_scaffold_28_21|0205A1024_Enterobacte...,942202
6,0205A1024_scaffold_407_13|0205A1024_Enterobact...,942203
7,0212A1031_scaffold_120_10|0212A1031_Clostridiu...,1424872
8,0212A1031_scaffold_160_19|0212A1031_Staphyloco...,1424052
9,0212A1031_scaffold_160_3|0212A1031_Staphylococ...,1424769


In [4]:
# Label every dictionary row as 'NR' or 'PPV'.
# A row is 'NR' when its text contains 'GI|', '|REF|' or '|GB|' (in any
# letter case); every other row is 'PPV'. The three markers are combined into
# one precompiled alternation instead of three copy-pasted branches.
# NOTE(review): the match is case-insensitive and runs on the text of the whole
# row, so any ID fragment that merely ends in 'gi|' is also labelled 'NR' —
# confirm that this is intended.
NR_MARKER = re.compile(r'GI\||\|REF\||\|GB\|', flags=re.I)

all_keyword = ['NR' if NR_MARKER.search(str(row)) else 'PPV' for row in all_dat]

# tallies of each label, derived from the list so they cannot drift from it
nr = all_keyword.count('NR')
ppv = len(all_keyword) - nr

In [5]:
print(all_keyword[:10000])
print(nr)
print(ppv)
len(all_keyword)

['PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV', 'PPV'

1433097

In [6]:
# dictionary to DataFrame
d = {'ID':all_dat_df[:][0], 'Composite':all_dat_df[:][1], 'Keyword':all_keyword}
all_combined = pd.DataFrame(d)

In [7]:
all_combined

Unnamed: 0,ID,Composite,Keyword
0,0106W_scaffold_128_8|0106W_Staphylococcus_plas...,1427709,PPV
1,0125A1030_scaffold_106_5|0125A1030_Klebsiella_...,1426845,PPV
2,0205A1024_scaffold_1808_1|0205A1024_Enterobact...,1428416,PPV
3,0205A1024_scaffold_1808_4|0205A1024_Enterobact...,1424765,PPV
4,0205A1024_scaffold_1808_5|0205A1024_Enterobact...,946011,PPV
5,0205A1024_scaffold_28_21|0205A1024_Enterobacte...,942202,PPV
6,0205A1024_scaffold_407_13|0205A1024_Enterobact...,942203,PPV
7,0212A1031_scaffold_120_10|0212A1031_Clostridiu...,1424872,PPV
8,0212A1031_scaffold_160_19|0212A1031_Staphyloco...,1424052,PPV
9,0212A1031_scaffold_160_3|0212A1031_Staphylococ...,1424769,PPV


In [8]:
# extract all items (geneID and ggKbase names) of PPV
PPV_data = all_combined[all_combined['Keyword']=='PPV']
print(PPV_data)
len(PPV_data)

                                                        ID Composite Keyword
0        0106W_scaffold_128_8|0106W_Staphylococcus_plas...   1427709     PPV
1        0125A1030_scaffold_106_5|0125A1030_Klebsiella_...   1426845     PPV
2        0205A1024_scaffold_1808_1|0205A1024_Enterobact...   1428416     PPV
3        0205A1024_scaffold_1808_4|0205A1024_Enterobact...   1424765     PPV
4        0205A1024_scaffold_1808_5|0205A1024_Enterobact...    946011     PPV
5        0205A1024_scaffold_28_21|0205A1024_Enterobacte...    942202     PPV
6        0205A1024_scaffold_407_13|0205A1024_Enterobact...    942203     PPV
7        0212A1031_scaffold_120_10|0212A1031_Clostridiu...   1424872     PPV
8        0212A1031_scaffold_160_19|0212A1031_Staphyloco...   1424052     PPV
9        0212A1031_scaffold_160_3|0212A1031_Staphylococ...   1424769     PPV
10       0212A1031_scaffold_79_12|0212A1031_Clostridium...   1427622     PPV
11       0212A1031_scaffold_79_24|0212A1031_Clostridium...    960867     PPV

12218

In [9]:
# switch the working directory to the Composite Search results folder
# (the timestamped output directory under Pro_ATP/results); files opened or
# written with relative names after this point resolve against it
datapath = '/Users/jinenstar/Desktop/Pro_ATP/results/PPV_ATP_out_cleanNetwork_composites_Thu_Oct__3_00_44_18_2019'
os.chdir(datapath)

In [10]:
# open results from Composite Search
file_in ='PPV_ATP_out_cleanNetwork.compositesinfo'

with open(file_in) as f:
    reader = csv.DictReader(f, delimiter="\t")
    all_comp_df = pd.DataFrame(reader)

In [11]:
all_comp_df["#geneID"]

0          C46285
1          C93343
2         C121224
3         C121275
4         C121324
5         C121378
6         C121477
7         C121532
8         C121580
9         C121628
10        C121676
11        C121724
12        C121776
13        C121824
14        C121872
15        C121920
16        C121968
17        C122017
18        C122065
19        C122113
20        C122161
21        C122209
22        C122257
23        C122305
24        C122354
25        C122402
26        C122451
27        C122499
28        C122547
29        C122595
           ...   
63056     C271545
63057    C1425520
63058     C274040
63059     C276101
63060     C257295
63061     C273900
63062    C1055481
63063     C257844
63064     C257925
63065     C257995
63066     C258057
63067     C258133
63068     C258257
63069     C258571
63070     C258704
63071     C256728
63072     C257185
63073     C257457
63074     C257595
63075     C257770
63076    C1427295
63077    C1432786
63078     C980681
63079    C1424623
63080    C

In [12]:
# strip the leading letter from every "#geneID" entry and keep the rest as an int
res_comp = [int(gene_label[1:]) for gene_label in all_comp_df["#geneID"]]

In [13]:
len(res_comp)

63086

In [14]:
# extract geneID from PPV
res_PPV = pd.to_numeric(PPV_data['Composite']).tolist()
len(res_PPV)

12218

In [15]:
def Intersection(lst1, lst2):
    """Return the set of elements that occur in both ``lst1`` and ``lst2``."""
    first_unique = set(lst1)
    second_unique = set(lst2)
    return first_unique & second_unique
      
# intersection between composite geneID and PPV gene ID: find PPV within composite
intersect = Intersection(res_comp, res_PPV)

In [16]:
intersect_index = list(intersect)
print(intersect_index)

[1431552, 1425510, 1425411, 548868, 1431557, 1425412, 1425415, 903175, 1431561, 1167369, 1431566, 1431567, 1425424, 903185, 1425425, 1431571, 737300, 1431573, 1181717, 1425514, 1081368, 739352, 1425429, 1431579, 1431580, 1425515, 1425432, 1431583, 479263, 1431586, 1431589, 1431591, 1423399, 903210, 1431596, 1423406, 1431602, 1425520, 1431605, 862262, 1429560, 1431611, 1431612, 1429592, 1431615, 1431616, 1431617, 1425475, 1042503, 1431626, 1431627, 739404, 1431629, 1425486, 1431631, 1425488, 1431633, 1429585, 1425491, 1429588, 1425492, 1429587, 1425495, 1425496, 317528, 1429586, 1429595, 1425499, 559196, 1425500, 1429591, 1425504, 1425505, 1144930, 1425506, 1425508, 1425509, 1429601, 1425511, 1425512, 1241184, 903274, 1429610, 1425516, 1425517, 1425518, 1425519, 1065072, 1425521, 1429612, 952435, 952436, 1429613, 1425523, 1425527, 430191, 430193, 1429604, 1429627, 1065084, 1425532, 952446, 952447, 1425531, 1429628, 1065094, 1065097, 1132682, 1065109, 1065110, 1425560, 1425561, 1429611, 

In [17]:
# For every shared gene number, pull the matching PPV_data row(s).
# 'Composite' holds strings, so each number is compared in its text form.
composite_text = PPV_data['Composite']
PPV_comp = [PPV_data.loc[composite_text == str(gene_id)] for gene_id in intersect_index]

In [18]:
PPV_comp

[                                                     ID Composite Keyword
 1496  BML_08182015_6_5m_scaffold_1_346|BML_08182015_...   1431552     PPV,
                                                         ID Composite Keyword
 1432360  sample2_BioR1_scaffold_257_19|BioR1_s2_plankto...   1425510     PPV,
                                                     ID Composite Keyword
 186  04302015_21_scaffold_144_21|04302015_21_Plasmi...   1425411     PPV,
                                                  ID Composite Keyword
 1429589  gwc1_scaffold_739_20|GWC1_Phage_28_17|GWC1    548868     PPV,
                                                      ID Composite Keyword
 1491  BML_08182015_6_5m_scaffold_18_224|BML_08182015...   1431557     PPV,
                                                         ID Composite Keyword
 1431737  sample16_BioR1_112_29|BioR1_s16_temp_40_alphap...   1425412     PPV,
                                                      ID Composite Keyword
 7757  YH_S30_sca

In [19]:
type(PPV_comp) #list of pandas dataframe
PPV_comp

[                                                     ID Composite Keyword
 1496  BML_08182015_6_5m_scaffold_1_346|BML_08182015_...   1431552     PPV,
                                                         ID Composite Keyword
 1432360  sample2_BioR1_scaffold_257_19|BioR1_s2_plankto...   1425510     PPV,
                                                     ID Composite Keyword
 186  04302015_21_scaffold_144_21|04302015_21_Plasmi...   1425411     PPV,
                                                  ID Composite Keyword
 1429589  gwc1_scaffold_739_20|GWC1_Phage_28_17|GWC1    548868     PPV,
                                                      ID Composite Keyword
 1491  BML_08182015_6_5m_scaffold_18_224|BML_08182015...   1431557     PPV,
                                                         ID Composite Keyword
 1431737  sample16_BioR1_112_29|BioR1_s16_temp_40_alphap...   1425412     PPV,
                                                      ID Composite Keyword
 7757  YH_S30_sca

In [20]:
# dump the text rendering of each per-gene DataFrame, one after another
with open('PPV_ATP_composite.txt', 'w') as f:
    f.writelines("%s\n" % item for item in PPV_comp)

In [88]:
import pickle

# serialise the list of per-gene DataFrames (PPV_comp) to a binary pickle file
with open('PPV_ATP_composite', 'wb') as fp:
    pickle.dump(PPV_comp, fp)

In [122]:
# write each per-gene DataFrame as text, followed by a newline
with open('PPV_ATP_composite.txt', 'w') as f:
    for place in PPV_comp:
        f.write(str(place) + "\n")

In [130]:
# Append every per-gene DataFrame (each with its own header row) to a single
# tab-separated file.
# - `sep` is passed by keyword: positional arguments after the path are
#   deprecated in recent pandas releases.
# - the `with` block closes the file even if one of the writes raises.
# - newline='' is what pandas recommends for a file object handed to to_csv.
with open('PPV_ATP_composite.csv', 'w', newline='') as f:
    for df in PPV_comp:
        df.to_csv(f, sep='\t')

In [20]:
# For every shared gene number, list the ggKbase IDs of the matching PPV rows
# (one inner list per gene number).
PPV_comp2 = [
    PPV_data.loc[PPV_data['Composite'] == str(gene_id), 'ID'].tolist()
    for gene_id in intersect_index
]
    #PPV_comp_list[i]

In [21]:
PPV_comp2

[['BML_08182015_6_5m_scaffold_1_346|BML_08182015_6_5m_Completed_Jumbo_Phage_40_15|BML_08182015_6_5m'],
 ['sample2_BioR1_scaffold_257_19|BioR1_s2_planktonic1_rhizobiales_plasmid_60_23|BioR1_s2_planktonic1'],
 ['04302015_21_scaffold_144_21|04302015_21_Plasmid_64_31|04302015_21'],
 ['gwc1_scaffold_739_20|GWC1_Phage_28_17|GWC1'],
 ['BML_08182015_6_5m_scaffold_18_224|BML_08182015_6_5m_Fragmented_Jumbo_Phage_47_8|BML_08182015_6_5m'],
 ['sample16_BioR1_112_29|BioR1_s16_temp_40_alphaproteobacteria_plasmid_55_33|BioR1_s16_temp_40'],
 ['YH_S30_scaffold_88_20|YH_S30_plasmid_68_28|YH_S30'],
 ['scnpilot_expt_750_p_scaffold_5758_1|scnpilot_dereplicated_Virus_unknown_3|SCNPILOT_EXPT_750_P'],
 ['BML_02172017_0m_scaffold_41_93|BML_02172017_0m_Fragmented_Jumbo_Phage_47_9|BML_02172017_0m'],
 ['Ig7659_scaffold_8_6|bjp_ig7659_COMPLETE_MegaPlasmid_w-CRISPR_60_13|08E140C01_z1_2015'],
 ['BML_02172017_0m_scaffold_9_170|BML_02172017_0m_virus_32_13|BML_02172017_0m'],
 ['BML_02172017_0m_scaffold_9_272|BML_0217201

In [79]:
# one line per gene number: the text form of its ID list
with open('PPV_ATP_composite_list.txt', 'w') as f:
    f.writelines(str(item) + "\n" for item in PPV_comp2)

In [84]:
# Overwrite the list file with one line per gene number (no trailing newline).
# The `with` block guarantees the handle is closed even if the write fails.
# NOTE(review): when a gene number maps to several IDs they are glued together
# with two single quotes ('') on the same line — confirm that separator is
# intended rather than one ID per line.
with open('PPV_ATP_composite_list.txt', 'w') as result:
    result.write("\n".join("''".join(str(x) for x in item) for item in PPV_comp2))

# Eliminate all sequences of domain > 50% Bacteria

In [22]:
# directory holding the contig taxonomy table
datapath = '/Users/jinenstar/Desktop/Data/Progress'
os.chdir(datapath)

file_in = 'phage-plasmid-virus-protein-families.contig-taxonomy.tsv'

# tab-separated table; the header row supplies the column names
with open(file_in) as handle:
    contig_list = pd.DataFrame(csv.DictReader(handle, delimiter="\t"))

# number of contigs in the table (original note: "contig count: ggKbase 91205")
len(contig_list)

26327

In [23]:
contig_list

Unnamed: 0,Contig name,Size (bp),Coverage,GC %,Taxonomy winner,Winner %,Species winner,Species winner %,Genus winner,Genus winner %,Order winner,Order winner %,Class winner,Class winner %,Phylum winner,Phylum winner %,Domain winner,Domain winner %
0,ACD33_1,5920,1.0,39.78,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
1,ACD33_2,14760,1.0,24.18,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
2,ACD33_3,8105,1.0,26.07,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
3,ACD33_4,4479,1.0,24.72,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
4,ACD33_5,35638,1.0,27.82,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
5,ACD33_6,5947,1.0,25.05,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
6,ACD33_7,11993,1.0,26.51,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
7,ACD33_8,11061,1.0,25.78,uncultured bacterium,0.94,uncultured bacterium,0.94,unknown,1.0,unknown,0.94,unknown,1.0,unknown,1.0,Bacteria,0.94
8,ACD33_9,7319,1.0,26.37,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
9,ACD33_10,2545,1.0,26.52,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0


In [24]:
cols = contig_list.columns
print(cols[17])

Domain winner %


In [25]:
# Parse the 'Domain winner %' column (read from the TSV as text) into numbers.
# The column is addressed by its label rather than by position 17, so the cell
# does not depend on the column order or on `cols` from an earlier cell.
# pd.to_numeric is applied to the whole Series at once; values that cannot be
# parsed become NaN (errors='coerce').
domain_winner_per = pd.to_numeric(contig_list['Domain winner %'], errors='coerce')
domain_winner_per

0        1.00
1        1.00
2        1.00
3        1.00
4        1.00
5        1.00
6        1.00
7        0.94
8        1.00
9        1.00
10       1.00
11       1.00
12       0.50
13       1.00
14       1.00
15       1.00
16       1.00
17       1.00
18       1.00
19       1.00
20       1.00
21       1.00
22       0.67
23       1.00
24       0.50
25       0.89
26       1.00
27       1.00
28       1.00
29       1.00
         ... 
26297    0.60
26298    0.80
26299    0.62
26300    0.62
26301    0.62
26302    0.69
26303    0.96
26304    0.80
26305    0.62
26306    0.43
26307    0.77
26308    0.89
26309    0.82
26310    0.80
26311    0.85
26312    0.71
26313    0.70
26314    0.75
26315    0.64
26316    0.91
26317    0.63
26318    0.57
26319    0.87
26320    0.75
26321    0.56
26322    0.68
26323    0.53
26324    0.83
26325    0.62
26326    0.60
Name: Domain winner %, Length: 26327, dtype: float64

In [26]:
domain_winner = contig_list[domain_winner_per > 0.5]
domain_winner

Unnamed: 0,Contig name,Size (bp),Coverage,GC %,Taxonomy winner,Winner %,Species winner,Species winner %,Genus winner,Genus winner %,Order winner,Order winner %,Class winner,Class winner %,Phylum winner,Phylum winner %,Domain winner,Domain winner %
0,ACD33_1,5920,1.0,39.78,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
1,ACD33_2,14760,1.0,24.18,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
2,ACD33_3,8105,1.0,26.07,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
3,ACD33_4,4479,1.0,24.72,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
4,ACD33_5,35638,1.0,27.82,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
5,ACD33_6,5947,1.0,25.05,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
6,ACD33_7,11993,1.0,26.51,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
7,ACD33_8,11061,1.0,25.78,uncultured bacterium,0.94,uncultured bacterium,0.94,unknown,1.0,unknown,0.94,unknown,1.0,unknown,1.0,Bacteria,0.94
8,ACD33_9,7319,1.0,26.37,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
9,ACD33_10,2545,1.0,26.52,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0


In [27]:
bacteria_list = domain_winner.loc[domain_winner['Domain winner'] == 'Bacteria']
bacteria_list

Unnamed: 0,Contig name,Size (bp),Coverage,GC %,Taxonomy winner,Winner %,Species winner,Species winner %,Genus winner,Genus winner %,Order winner,Order winner %,Class winner,Class winner %,Phylum winner,Phylum winner %,Domain winner,Domain winner %
0,ACD33_1,5920,1.0,39.78,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
1,ACD33_2,14760,1.0,24.18,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
2,ACD33_3,8105,1.0,26.07,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
3,ACD33_4,4479,1.0,24.72,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
4,ACD33_5,35638,1.0,27.82,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
5,ACD33_6,5947,1.0,25.05,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
6,ACD33_7,11993,1.0,26.51,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
7,ACD33_8,11061,1.0,25.78,uncultured bacterium,0.94,uncultured bacterium,0.94,unknown,1.0,unknown,0.94,unknown,1.0,unknown,1.0,Bacteria,0.94
8,ACD33_9,7319,1.0,26.37,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0
9,ACD33_10,2545,1.0,26.52,uncultured bacterium,1.0,uncultured bacterium,1.0,unknown,1.0,unknown,1.0,unknown,1.0,unknown,1.0,Bacteria,1.0


In [28]:
len(bacteria_list)

9037

In [29]:
bacteria_contig = bacteria_list['Contig name'].tolist()
type(bacteria_contig)

list

In [30]:
bacteria_contig_list = [bc + "_" for bc in bacteria_contig]
bacteria_contig_list

['ACD33_1_',
 'ACD33_2_',
 'ACD33_3_',
 'ACD33_4_',
 'ACD33_5_',
 'ACD33_6_',
 'ACD33_7_',
 'ACD33_8_',
 'ACD33_9_',
 'ACD33_10_',
 'ACD33_11_',
 'ACD33_12_',
 'ACD33_14_',
 'ACD33_15_',
 'ACD33_16_',
 'ACD33_17_',
 'ACD33_18_',
 'ACD33_19_',
 'ACD33_20_',
 'ACD33_21_',
 'ACD33_22_',
 'ACD33_23_',
 'ACD33_24_',
 'ACD33_26_',
 'ACD71_1_',
 'ACD71_2_',
 'ACD71_3_',
 'ACD71_4_',
 'ACD71_5_',
 'ACD71_6_',
 'ACD71_7_',
 'ACD71_8_',
 'ACD71_9_',
 'ACD71_10_',
 'ACD71_11_',
 'ACD71_12_',
 'ACD71_13_',
 'ACD71_14_',
 'ACD71_15_',
 'ACD71_16_',
 'ACD71_17_',
 'ACD71_18_',
 'ACD71_19_',
 'ACD71_20_',
 'ACD71_21_',
 'ACD71_22_',
 'ACD71_23_',
 'ACD71_24_',
 'ACD71_25_',
 'ACD71_26_',
 'ACD71_27_',
 'ACD71_28_',
 'ACD71_29_',
 'ACD71_30_',
 'ACD71_31_',
 'ACD71_32_',
 'ACD71_33_',
 'ACD71_34_',
 'ACD71_35_',
 'ACD71_36_',
 'ACD71_37_',
 'ACD71_38_',
 'ACD71_40_',
 'ACD71_41_',
 'ACD71_42_',
 'ACD71_43_',
 'ACD71_44_',
 'ACD71_45_',
 'ACD71_46_',
 'ACD71_47_',
 'ACD71_48_',
 'ACD71_49_',
 'ACD71_50

In [48]:
# flatten the list of per-gene ID lists into one flat list of IDs
PPV_comp2_flat = []
for id_group in PPV_comp2:
    PPV_comp2_flat.extend(id_group)
PPV_comp2_flat

['BML_08182015_6_5m_scaffold_1_346|BML_08182015_6_5m_Completed_Jumbo_Phage_40_15|BML_08182015_6_5m',
 'sample2_BioR1_scaffold_257_19|BioR1_s2_planktonic1_rhizobiales_plasmid_60_23|BioR1_s2_planktonic1',
 '04302015_21_scaffold_144_21|04302015_21_Plasmid_64_31|04302015_21',
 'gwc1_scaffold_739_20|GWC1_Phage_28_17|GWC1',
 'BML_08182015_6_5m_scaffold_18_224|BML_08182015_6_5m_Fragmented_Jumbo_Phage_47_8|BML_08182015_6_5m',
 'sample16_BioR1_112_29|BioR1_s16_temp_40_alphaproteobacteria_plasmid_55_33|BioR1_s16_temp_40',
 'YH_S30_scaffold_88_20|YH_S30_plasmid_68_28|YH_S30',
 'scnpilot_expt_750_p_scaffold_5758_1|scnpilot_dereplicated_Virus_unknown_3|SCNPILOT_EXPT_750_P',
 'BML_02172017_0m_scaffold_41_93|BML_02172017_0m_Fragmented_Jumbo_Phage_47_9|BML_02172017_0m',
 'Ig7659_scaffold_8_6|bjp_ig7659_COMPLETE_MegaPlasmid_w-CRISPR_60_13|08E140C01_z1_2015',
 'BML_02172017_0m_scaffold_9_170|BML_02172017_0m_virus_32_13|BML_02172017_0m',
 'BML_02172017_0m_scaffold_9_272|BML_02172017_0m_virus_32_13|BML_02

In [78]:
# Collect the composite protein IDs whose name starts with one of the
# Bacteria contig prefixes (contig name + "_").
# - The match is anchored at the start of the ID: a plain substring test would
#   also hit a prefix that happens to occur later in the ID.
# - Each ID is tested once against all prefixes, so it appears at most once per
#   occurrence in PPV_comp2_flat (the nested loop appended it once per matching
#   contig) and the list keeps the order of PPV_comp2_flat.
# NOTE(review): assumes every protein ID begins with its contig name followed
# by "_" — confirm against the ggKbase naming scheme.
# NOTE(review): this rebinds `bacteria_list`, which an earlier cell uses for the
# Bacteria contig DataFrame; the name is kept because the following cell reads it.
bacteria_prefixes = tuple(bacteria_contig_list)
bacteria_list = [item for item in PPV_comp2_flat if item.startswith(bacteria_prefixes)]

In [79]:
len(bacteria_list)

441

In [81]:
def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The order of ``first`` is preserved and repeated items are kept.
    """
    excluded = frozenset(second)
    kept = []
    for candidate in first:
        if candidate in excluded:
            continue
        kept.append(candidate)
    return kept

nonbacteria_list = diff(PPV_comp2_flat, bacteria_list)
len(nonbacteria_list)

562

In [82]:
nonbacteria_list

['BML_08182015_6_5m_scaffold_1_346|BML_08182015_6_5m_Completed_Jumbo_Phage_40_15|BML_08182015_6_5m',
 'gwc1_scaffold_739_20|GWC1_Phage_28_17|GWC1',
 'BML_08182015_6_5m_scaffold_18_224|BML_08182015_6_5m_Fragmented_Jumbo_Phage_47_8|BML_08182015_6_5m',
 'scnpilot_expt_750_p_scaffold_5758_1|scnpilot_dereplicated_Virus_unknown_3|SCNPILOT_EXPT_750_P',
 'BML_02172017_0m_scaffold_41_93|BML_02172017_0m_Fragmented_Jumbo_Phage_47_9|BML_02172017_0m',
 'Ig7659_scaffold_8_6|bjp_ig7659_COMPLETE_MegaPlasmid_w-CRISPR_60_13|08E140C01_z1_2015',
 'BML_02172017_0m_scaffold_9_170|BML_02172017_0m_virus_32_13|BML_02172017_0m',
 'BML_02172017_0m_scaffold_9_272|BML_02172017_0m_virus_32_13|BML_02172017_0m',
 'BML_02172017_0m_scaffold_615_5|BML_02172017_0m_Fragmented_Jumbo_Phage_51_8|BML_02172017_0m',
 'BML_02172017_6_5m_scaffold_12_106|BML_02172017_6_5m_Fragmented_Jumbo_Phage_51_16|BML_02172017_6_5m',
 'BML_02172017_6_5m_scaffold_30_18|BML_02172017_6_5m_Fragmented_Jumbo_Phage_32_11|BML_02172017_6_5m',
 'BML_0217

In [83]:
# one non-bacterial protein ID per line
with open('PPV_ATP_composite_nonbacteria_list.txt', 'w') as f:
    f.writelines(str(item) + "\n" for item in nonbacteria_list)

In [89]:
# def unique(list1): 
  
#     # intilize a null list 
#     unique_list = [] 
      
#     # traverse for all elements 
#     for x in list1: 
#         # check if exists in unique_list or not 
#         if x not in unique_list: 
#             unique_list.append(x) 
#     # print list 
#     for x in unique_list: 
#         print(x)

# nonbacteria_list_unique = unique(nonbacteria_list)
# len(nonbacteria_list_unique)

BML_08182015_6_5m_scaffold_1_346|BML_08182015_6_5m_Completed_Jumbo_Phage_40_15|BML_08182015_6_5m
gwc1_scaffold_739_20|GWC1_Phage_28_17|GWC1
BML_08182015_6_5m_scaffold_18_224|BML_08182015_6_5m_Fragmented_Jumbo_Phage_47_8|BML_08182015_6_5m
scnpilot_expt_750_p_scaffold_5758_1|scnpilot_dereplicated_Virus_unknown_3|SCNPILOT_EXPT_750_P
BML_02172017_0m_scaffold_41_93|BML_02172017_0m_Fragmented_Jumbo_Phage_47_9|BML_02172017_0m
Ig7659_scaffold_8_6|bjp_ig7659_COMPLETE_MegaPlasmid_w-CRISPR_60_13|08E140C01_z1_2015
BML_02172017_0m_scaffold_9_170|BML_02172017_0m_virus_32_13|BML_02172017_0m
BML_02172017_0m_scaffold_9_272|BML_02172017_0m_virus_32_13|BML_02172017_0m
BML_02172017_0m_scaffold_615_5|BML_02172017_0m_Fragmented_Jumbo_Phage_51_8|BML_02172017_0m
BML_02172017_6_5m_scaffold_12_106|BML_02172017_6_5m_Fragmented_Jumbo_Phage_51_16|BML_02172017_6_5m
BML_02172017_6_5m_scaffold_30_18|BML_02172017_6_5m_Fragmented_Jumbo_Phage_32_11|BML_02172017_6_5m
BML_02172017_6_5m_scaffold_30_120|BML_02172017_6_5m_Fr

TypeError: object of type 'NoneType' has no len()

# Using the output processed by SF (obsolete)

In [135]:
# directory the composite ID list is read from
# NOTE(review): the cells that write 'PPV_ATP_composite_list.txt' come after an
# os.chdir to a different directory — confirm the file is present here.
datapath = '/Users/jinenstar/Desktop/Pro_ATP/'
os.chdir(datapath)

# read back the composite protein ID list written earlier in this notebook
file_in ='PPV_ATP_composite_list.txt'

# csv.reader returns every row as a list, so dat_ATP is a list of lists;
# the next cell flattens it into a flat list of IDs
with open(file_in) as f:
    reader = csv.reader(f, delimiter="\n")
    dat_ATP = list(reader)

In [157]:
dat_ATP_f = [val for sublist in dat_ATP for val in sublist]
dat_ATP_f

['BML_08182015_6_5m_scaffold_1_346|BML_08182015_6_5m_Completed_Jumbo_Phage_40_15|BML_08182015_6_5m',
 'sample2_BioR1_scaffold_257_19|BioR1_s2_planktonic1_rhizobiales_plasmid_60_23|BioR1_s2_planktonic1',
 '04302015_21_scaffold_144_21|04302015_21_Plasmid_64_31|04302015_21',
 'gwc1_scaffold_739_20|GWC1_Phage_28_17|GWC1',
 'BML_08182015_6_5m_scaffold_18_224|BML_08182015_6_5m_Fragmented_Jumbo_Phage_47_8|BML_08182015_6_5m',
 'sample16_BioR1_112_29|BioR1_s16_temp_40_alphaproteobacteria_plasmid_55_33|BioR1_s16_temp_40',
 'YH_S30_scaffold_88_20|YH_S30_plasmid_68_28|YH_S30',
 'scnpilot_expt_750_p_scaffold_5758_1|scnpilot_dereplicated_Virus_unknown_3|SCNPILOT_EXPT_750_P',
 'BML_02172017_0m_scaffold_41_93|BML_02172017_0m_Fragmented_Jumbo_Phage_47_9|BML_02172017_0m',
 'Ig7659_scaffold_8_6|bjp_ig7659_COMPLETE_MegaPlasmid_w-CRISPR_60_13|08E140C01_z1_2015',
 'BML_02172017_0m_scaffold_9_170|BML_02172017_0m_virus_32_13|BML_02172017_0m',
 'BML_02172017_0m_scaffold_9_272|BML_02172017_0m_virus_32_13|BML_02

In [137]:
# This loader was commented out, but the next cell reads `dat_Bacteria`; on a
# fresh kernel (Restart & Run All) that name would be undefined, so the loader
# is restored.

# location of raw data file
datapath = '/Users/jinenstar/Desktop/Pro_ATP/'
os.chdir(datapath)

# list of Bacteria-filtered proteins
# NOTE(review): presumably one protein ID per line — confirm the file format.
file_in ='jin_bacteria_filtered_proteins.txt'

# csv.reader returns every row as a list, so dat_Bacteria is a list of lists
with open(file_in) as f:
    reader = csv.reader(f, delimiter="\n")
    dat_Bacteria = list(reader)

In [158]:
dat_Bacteria_f = [val for sublist in dat_Bacteria for val in sublist]
dat_Bacteria_f

['0227A1115_scaffold_78_53|0227A1115_Clostridium_perfringens_plasmid__196|0227A1115',
 '0227A1115_scaffold_78_54|0227A1115_Clostridium_perfringens_plasmid__196|0227A1115',
 '0305A1203_scaffold_65_54|0305A1203_Clostridium_perfringens_phage__302|0305A1203',
 '0305A1203_scaffold_65_55|0305A1203_Clostridium_perfringens_phage__302|0305A1203',
 '0307A1205_scaffold_68_21|0307A1205_Clostridium_perfringens_plasmid__916|0307A1205',
 '0307A1205_scaffold_68_22|0307A1205_Clostridium_perfringens_plasmid__916|0307A1205',
 '0312A1210_scaffold_66_21|0312A1210_Clostridium_perfringens_plasmid__695|0312A1210',
 '0312A1210_scaffold_66_22|0312A1210_Clostridium_perfringens_plasmid__695|0312A1210',
 '0315A1213_scaffold_60_21|0315A1213_Clostridium_perfringens_plasmid__1291|0315A1213',
 '0315A1213_scaffold_60_22|0315A1213_Clostridium_perfringens_plasmid__1291|0315A1213',
 '0319A1217_scaffold_83_60|0319A1217_Clostridium_perfringens_phage__256|0319A1217',
 '0319A1217_scaffold_83_61|0319A1217_Clostridium_perfringe

In [139]:
final_list = list(set(dat_ATP_f) & set(dat_Bacteria_f))

In [147]:
len(final_list)

272

In [160]:
def Diff(li1, li2):
    """Return the distinct items of ``li1`` that are absent from ``li2``.

    Duplicates are collapsed and the order of the returned list is unspecified.
    """
    unique_first = set(li1)
    unique_second = set(li2)
    return list(unique_first.difference(unique_second))
len(Diff(dat_ATP_f, dat_Bacteria_f))

731