In [None]:
# Copyright 2019 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Process results from Composite Search of PPV_ATPase data
# Author: Hyunjin Shim
# Date created: 20191003
# Email: jinenstar@gmail.com

# Dataset description (downloaded on 20190830)
Original Data
- Organisms: 7,029
- Contigs: 26,331
- Features: 938,182

ATPase annotated
- Total: 5,497
- Virus: 361
- Phage: 2,654
- Plasmid: 2,216
- Other: 266

# Composite proteins of ATPase annotations from PPV

- to analyze proteins with ATPase annotations from https://ggkbase.berkeley.edu/ after Composite Search to find fusion proteins (database as PPV and NCBI_nr)
- to prioritize composite proteins of interest

In [None]:
# Data
import os
import pandas as pd
import numpy as np
import csv

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Regular expression
import re

# Plot
from pylab import *
import pylab as pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import matplotlib.patches as mpatches
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# location of raw data file
# NOTE(review): absolute, machine-specific path; relative file names below resolve against it
datapath = '/Users/jinenstar/Desktop/Pro_ATP/'
os.chdir(datapath)

# open dictionary (geneID and ggKbase names) from cleanNetwork
file_in ='PPV_ATP.out.cleanNetwork.dico'

# Parse the tab-separated file once and reuse the rows for both views
# (previously the file was opened and parsed twice). newline='' is the
# mode the csv module expects for file objects.
with open(file_in, newline='') as f:
    all_dat = list(csv.reader(f, delimiter="\t"))

# Same rows as a DataFrame with default integer column labels
all_dat_df = pd.DataFrame(all_dat)

print(all_dat_df[:10])

In [None]:
# Show the parsed dictionary as a table (notebook rich display)
all_dat_df
#len(all_dat)

In [None]:
# Label every dictionary row by source: a row is 'NR' when its text contains
# "GI|", "|REF|" or "|GB|" (case-insensitive); every other row is 'PPV'.
nr_marker = re.compile(r'GI\||\|REF\||\|GB\|', flags=re.I)

all_keyword = ['NR' if nr_marker.search(str(row)) else 'PPV' for row in all_dat]

# Per-source tallies, used for the sanity check in the next cell
nr = all_keyword.count('NR')
ppv = all_keyword.count('PPV')

In [None]:
# Sanity check: preview of the labels (capped at 10000), the NR and PPV
# tallies on separate lines, and the total number of labels as cell output
print(all_keyword[:10000])
print(nr, ppv, sep='\n')
len(all_keyword)

In [None]:
# Combine the first two dictionary columns with the per-row source label
all_combined = pd.DataFrame({
    'ID': all_dat_df[0],
    'Composite': all_dat_df[1],
    'Keyword': all_keyword,
})

In [None]:
# Inspect the combined ID / Composite / Keyword table
all_combined

In [None]:
# Keep only the rows labelled 'PPV' (geneID and ggKbase names)
is_ppv = all_combined['Keyword'] == 'PPV'
PPV_data = all_combined[is_ppv]
print(PPV_data)
len(PPV_data)

In [None]:
# Switch the working directory to the Composite Search results folder;
# the relative file names used in the following cells resolve against it.
# NOTE(review): absolute, machine-specific path.
datapath = '/Users/jinenstar/Desktop/Pro_ATP/results/PPV_ATP_out_cleanNetwork_composites_Thu_Oct__3_00_44_18_2019'
os.chdir(datapath)

In [None]:
# Read the Composite Search results (tab-separated, header row gives the column names)
file_in ='PPV_ATP_out_cleanNetwork.compositesinfo'

with open(file_in) as handle:
    composite_records = csv.DictReader(handle, delimiter="\t")
    # One DataFrame row per record; must be built while the file is still open
    all_comp_df = pd.DataFrame(composite_records)

In [None]:
# Peek at the "#geneID" column of the Composite Search results
all_comp_df["#geneID"]

In [None]:
# Turn each composite geneID into an integer by dropping its first
# character (the leading letter C) and parsing the remainder
res_comp = [int(gene_id[1:]) for gene_id in all_comp_df["#geneID"]]

In [None]:
# Number of composite gene IDs parsed
len(res_comp)

In [None]:
# Numeric gene IDs of the PPV rows, as a plain Python list
ppv_gene_ids = pd.to_numeric(PPV_data['Composite'])
res_PPV = ppv_gene_ids.tolist()
len(res_PPV)

In [None]:
def Intersection(lst1, lst2):
    """Return the set of values that occur in both ``lst1`` and ``lst2``.

    Duplicates collapse because the result is a ``set``; element order is
    not preserved.
    """
    first_values = set(lst1)
    return first_values.intersection(lst2)
      
# Gene IDs present in both the Composite Search results (res_comp) and the
# PPV rows (res_PPV), i.e. the PPV genes that appear among the composites
intersect = Intersection(res_comp, res_PPV)

In [None]:
# Freeze the shared gene IDs into a list so they can be iterated by position
intersect_index = [*intersect]
print(intersect_index)

In [None]:
# For every shared gene ID, collect the PPV rows whose 'Composite' value
# equals that ID written as text (one DataFrame slice per gene ID)
PPV_comp = [
    PPV_data.loc[PPV_data['Composite']==str(gene_id)]
    for gene_id in intersect_index
]

In [None]:
# Inspect the per-gene PPV sub-tables
PPV_comp

In [None]:
# PPV_comp is a list holding one DataFrame slice per shared gene ID.
# Only the last expression of a cell is displayed, so the type() result is not shown.
type(PPV_comp) #list of pandas dataframe
PPV_comp

In [None]:
# Dump the text rendering of each per-gene PPV sub-table, one after another
with open('PPV_ATP_composite.txt', 'w') as out_txt:
    out_txt.writelines("%s\n" % table for table in PPV_comp)

In [None]:
import pickle

# Serialize the list of per-gene sub-tables so it can be reloaded later
with open('PPV_ATP_composite', 'wb') as pickle_out:
    pickle.dump(PPV_comp, pickle_out)

In [None]:
# Write each per-gene sub-table as text followed by a newline
# (targets PPV_ATP_composite.txt, truncating any existing content)
with open('PPV_ATP_composite.txt', 'w') as out_txt:
    for place in PPV_comp:
        out_txt.write("%s\n" % place)

In [None]:
# Append every per-gene sub-table to one tab-separated file (each table
# writes its own header row). The context manager guarantees the handle is
# closed even if a write fails; newline='' is what pandas expects when it is
# handed an already-open file object.
with open('PPV_ATP_composite.csv', 'w', newline='') as f:
    for df in PPV_comp:
        # sep must be passed by keyword: newer pandas versions deprecate
        # positional arguments to to_csv other than the target
        df.to_csv(f, sep='\t')

In [None]:
# For every shared gene ID, list the 'ID' values (PPV names) of the PPV rows
# whose 'Composite' value equals that ID written as text
PPV_comp2 = [
    PPV_data.loc[PPV_data['Composite']==str(gene_id), 'ID'].tolist()
    for gene_id in intersect_index
]

In [None]:
# Inspect the per-gene lists of PPV names
PPV_comp2

In [None]:
# One line per gene ID: the Python list repr of its PPV names
with open('PPV_ATP_composite_list.txt', 'w') as out_list:
    out_list.writelines(str(names) + "\n" for names in PPV_comp2)

In [None]:
# Rewrite PPV_ATP_composite_list.txt (the same file name the preceding cell
# writes) in a flat text form: one line per gene ID, with that gene's PPV
# names joined by two single-quote characters, and no trailing newline.
# The context manager closes the handle even if the write raises.
with open('PPV_ATP_composite_list.txt', 'w') as result:
    result.write("\n".join("''".join(str(x) for x in item) for item in PPV_comp2))

# Eliminate all sequences from contigs whose domain winner is Bacteria (> 50%)

In [None]:
# location of raw data file
datapath = '/Users/jinenstar/Desktop/Data/Progress'
os.chdir(datapath)

# Contig taxonomy table (tab-separated, header row gives the column names)
file_in = 'phage-plasmid-virus-protein-families.contig-taxonomy.tsv'

with open(file_in) as handle:
    taxonomy_records = csv.DictReader(handle, delimiter="\t")
    # Build the table while the file is still open
    contig_list = pd.DataFrame(taxonomy_records)

# contig count: ggKbase 91205
len(contig_list)

In [None]:
# Inspect the contig taxonomy table
contig_list

In [None]:
# Column labels of the contig taxonomy table; print the label at position 17,
# which is the column converted to numbers and thresholded in the next cells.
# NOTE(review): position 17 is hard-coded — presumably the domain-winner
# percentage column; confirm against the printed name.
cols = contig_list.columns
print(cols[17])

In [None]:
# Convert the selected column to numbers in a single vectorized call instead
# of calling pd.to_numeric once per cell; values that cannot be parsed become NaN
domain_winner_per = pd.to_numeric(contig_list[cols[17]], errors='coerce')
domain_winner_per

In [None]:
# Keep the contigs whose numeric value exceeds 0.5 (NaN rows never pass).
# NOTE(review): assumes the column is a 0-1 fraction; if it is expressed on a
# 0-100 scale this threshold keeps nearly every row — confirm.
above_half = domain_winner_per > 0.5
domain_winner = contig_list.loc[above_half]
domain_winner

In [None]:
# Of those contigs, keep the ones whose 'Domain winner' is exactly 'Bacteria'
is_bacteria = domain_winner['Domain winner'] == 'Bacteria'
bacteria_list = domain_winner[is_bacteria]
bacteria_list

In [None]:
# Number of contigs retained as Bacteria
len(bacteria_list)

In [None]:
# Names of the Bacteria contigs as a plain Python list
contig_names = bacteria_list['Contig name']
bacteria_contig = contig_names.tolist()
type(bacteria_contig)

In [None]:
# Append "_" to every contig name; these strings are used as substrings to
# look for inside the PPV names in a later cell
bacteria_contig_list = [contig_name + "_" for contig_name in bacteria_contig]
bacteria_contig_list

In [None]:
# Flatten the per-gene lists of PPV names into a single list, keeping order
PPV_comp2_flat = []
for names in PPV_comp2:
    PPV_comp2_flat.extend(names)
PPV_comp2_flat

In [None]:
# Collect every PPV name that contains one of the "<contig>_" strings.
# The scan is contig-major, so a name matching several contigs is listed
# once per matching contig.
# NOTE(review): this rebinds bacteria_list (previously a DataFrame) to a list
# of names, so the earlier cells that index it by column cannot be re-run
# after this one.
bacteria_list = [
    item
    for contig_prefix in bacteria_contig_list
    for item in PPV_comp2_flat
    if contig_prefix in item
]

In [None]:
# Number of matches found (a name is counted once per contig it matched)
len(bacteria_list)

In [None]:
def diff(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    The order of ``first`` is kept and repeated items are kept as well;
    ``second`` is only used for membership tests.
    """
    excluded = set(second)
    kept = []
    for item in first:
        if item not in excluded:
            kept.append(item)
    return kept

# PPV names that did not match any Bacteria contig, in their original order
nonbacteria_list = diff(PPV_comp2_flat, bacteria_list)
len(nonbacteria_list)

In [None]:
# Inspect the non-Bacteria PPV names
nonbacteria_list

In [None]:
# Save the non-Bacteria PPV names, one per line
with open('PPV_ATP_composite_nonbacteria_list.txt', 'w') as out_list:
    out_list.writelines(str(name) + "\n" for name in nonbacteria_list)

In [None]:
# def unique(list1): 
  
#     # intilize a null list 
#     unique_list = [] 
      
#     # traverse for all elements 
#     for x in list1: 
#         # check if exists in unique_list or not 
#         if x not in unique_list: 
#             unique_list.append(x) 
#     # print list 
#     for x in unique_list: 
#         print(x)

# nonbacteria_list_unique = unique(nonbacteria_list)
# len(nonbacteria_list_unique)

# Using the output processed by SF (obsolete)

In [None]:
# location of raw data file
datapath = '/Users/jinenstar/Desktop/Pro_ATP/'
os.chdir(datapath)

# Read back the list of composite PPV names saved as text
file_in ='PPV_ATP_composite_list.txt'

# NOTE(review): delimiter="\n" is an unusual choice for csv.reader; each row
# presumably ends up as a list holding the whole line — confirm.
with open(file_in) as handle:
    dat_ATP = list(csv.reader(handle, delimiter="\n"))

In [None]:
# Flatten the parsed rows into one list of values
dat_ATP_f = []
for row in dat_ATP:
    dat_ATP_f.extend(row)
dat_ATP_f

In [None]:
# # location of raw data file
# datapath = '/Users/jinenstar/Desktop/Pro_ATP/'
# os.chdir(datapath)

# # open dictionary (geneID and ggKbase names) from cleanNetwork
# file_in ='jin_bacteria_filtered_proteins.txt'

# with open(file_in) as f:
#     reader = csv.reader(f, delimiter="\n")
#     dat_Bacteria = list(reader)

In [None]:
# Flatten the parsed Bacteria rows into one list of values.
# NOTE(review): dat_Bacteria is only assigned in the cell above, which is
# entirely commented out, so on a fresh kernel this cell raises NameError.
dat_Bacteria_f = [val for sublist in dat_Bacteria for val in sublist]
dat_Bacteria_f

In [None]:
# Values present in both flattened lists (duplicates removed, order not preserved)
atp_values = set(dat_ATP_f)
bacteria_values = set(dat_Bacteria_f)
final_list = list(atp_values & bacteria_values)

In [None]:
# Number of values shared by the two lists
len(final_list)

In [None]:
def Diff(li1, li2):
    """Return the distinct items of ``li1`` that are absent from ``li2``.

    Works on sets, so duplicates collapse and the order of the returned
    list is not defined.
    """
    only_in_first = set(li1) - set(li2)
    return list(only_in_first)
# Number of distinct values in dat_ATP_f that are absent from dat_Bacteria_f
len(Diff(dat_ATP_f, dat_Bacteria_f))