In [2]:
import pandas as pd
import numpy as np
import urllib.parse
import urllib.request
from tqdm import tqdm
import os
import time
import subprocess
from scipy import stats
import matplotlib.pyplot as plt

def uniprotMapping(fileName,query, From="ACC",To="ACC",Format="fasta",Columns="",outputDir=""):
    for i in range(10):
        try:
            url = 'https://www.uniprot.org/uploadlists/'
            params={
                "query":query,
                "from":From,
                "to":To,
                "format":Format,
                "columns":Columns,
            }
            data = urllib.parse.urlencode(params)
            data = data.encode('utf-8')
            req = urllib.request.Request(url, data)
            with urllib.request.urlopen(req) as f:
                response = str(f.read(),encoding="utf-8")
            outputPath="{}{}".format(outputDir,fileName)
            if outputDir and not os.path.exists(outputDir):
                os.makedirs(outputDir)
            with open(outputPath,"w") as f:
                f.write(response)
            return fileName
        except:
            print("request failed, wait for", i*5,"seconds and try again")
            time.sleep(i*5)

"""Get Info of all the Twins"""
TWINS = pd.read_csv("../2020-05-12.wilcoxon-ranksum-test/twins_info.tab",sep="\t",index_col=0).reset_index(drop=True)

"""Keep Twins with two different Uniref50 groups"""
TWINS = TWINS[TWINS["Cytoplasm_uniref50"]!=TWINS["Periplasm_uniref50"]].reset_index(drop=True).reset_index(drop=True)

"""Keep Twins with length ratio > ratio_thr"""
ratio_thr=0.75
TWINS = TWINS[TWINS["length_ratio"]>ratio_thr].reset_index(drop=True)

"""Keep Twins abs diff<abs_thr"""
#abs_thr=50
#TWINS = TWINS[TWINS["length_abs_diff"]<abs_thr].reset_index(drop=True)

"""Reduce to unique groups"""
print("Unique cytoplasm uniref50 groups:", len(TWINS["Cytoplasm_uniref50"].unique()))
print("Unique periplasm uniref50 groups:", len(TWINS["Periplasm_uniref50"].unique()))
#TWINS.sort_values("length_ratio",ascending=False).groupby(["Cytoplasm_uniref50","Periplasm_uniref50"],as_index=True).first()
TWINS=TWINS.sort_values(["Periplasm_uniref50_size"],ascending=False).groupby("Cytoplasm_uniref50",as_index=False).first()
TWINS=TWINS.sort_values(["Cytoplasm_uniref50_size"],ascending=False).groupby("Periplasm_uniref50",as_index=False).first()

Unique cytoplasm uniref50 groups: 20
Unique periplasm uniref50 groups: 21


In [3]:
TWINS

Unnamed: 0,Periplasm_uniref50,Cytoplasm_uniref50,Cytoplasm,Periplasm,Organism,SP_start,SP_end,Cytoplasm_uniref50_size,Periplasm_uniref50_size,Cytoplasm_length,Periplasm_length,length_ratio,length_abs_diff
0,UniRef50_A0A1X1D1X1,UniRef50_Q2NVU4,A0A484WTL8,A0A0A3ZPT1,Enterobacter_cancerogenus,1,35,124,148,171,166,0.77193,39
1,UniRef50_O53021,UniRef50_P83221,P23869,P0AFL3,Escherichia_coli_strain_K12,1,24,4189,2118,164,190,0.982036,3
2,UniRef50_O59651,UniRef50_Q96VT4,Q9ZGM4,Q9WXB9,Legionella_pneumophila,1,27,1678,9906,721,749,0.997234,2
3,UniRef50_P00805,UniRef50_P0A963,P0A962,P00805,Escherichia_coli_strain_K12,1,22,3836,2537,338,348,0.967456,11
4,UniRef50_P0AAL4,UniRef50_P0AAL2,A0A3T8GC16,P0AAL5,Shigella_flexneri,1,41,1270,492,164,231,0.858639,27
5,UniRef50_P0C278,UniRef50_P10902,Q8EH88,P83223,Shewanella_oneidensis_strain_MR-1,1,25,6730,160,537,596,0.938811,35
6,UniRef50_P25718,UniRef50_P21517,P21517,P25718,Escherichia_coli_strain_K12,1,17,4496,5447,604,676,0.915152,56
7,UniRef50_P39185,UniRef50_O33732,O33732,Q080C0,Shewanella_frigidimarina_strain_NCIMB_400,1,27,29,6749,938,829,0.856077,135
8,UniRef50_P44652,UniRef50_P44650,P44650,P44652,Haemophilus_influenzae_strain_ATCC_51907__DSM_...,1,50,243,304,176,279,0.765217,54
9,UniRef50_P45523,UniRef50_P0A9L3,P0A9L3,P45523,Escherichia_coli_strain_K12,1,25,1427,2225,206,270,0.837398,40


In [18]:
""" Add structures of Cytoplasm """
PROTEINS = list(TWINS["Cytoplasm"].unique())+list(TWINS["Periplasm"].unique())
FILENAME="cytoplasm_structures.tab"
QUERY=" ".join(PROTEINS)
FROM="ACC"
TO=" PDB_ID"
FORMAT="tab"
COLUMNS="id"
uniprotMapping(FILENAME,QUERY,From=FROM,To=TO,Format=FORMAT,Columns=COLUMNS)
STRUCTURES=pd.DataFrame()
STRUCTURES[["ACC","PDB"]]=pd.read_csv(FILENAME,sep="\t")
STRUCTURES=STRUCTURES.groupby("ACC").agg(list).reset_index()
STRUCTURES

Unnamed: 0,ACC,PDB
0,P00805,"[1HO3, 1IHD, 1JAZ, 1JJA, 1NNS, 3ECA, 4ECA, 5MQ..."
1,P0A962,"[2HIM, 2P2D, 2P2N, 6NXC, 6NXD]"
2,P0AFL3,"[1CLH, 1J2A, 1V9T, 1VAI]"
3,P12994,"[1FJJ, 1VI3]"
4,P21517,[5BN7]
5,P23869,"[1LOP, 2NUL, 2RS4]"
6,P45523,"[1Q6H, 1Q6I, 1Q6U, 4QCC]"
7,P77368,[1FUX]
8,P83223,"[1D4C, 1D4D, 1D4E]"
9,Q72EC8,"[2XVX, 2XVY, 2XVZ]"


In [25]:
STRUCTURES_CYTOPLASM[["Cytoplasm","PDB"]] = STRUCTURES[STRUCTURES["ACC"].apply(lambda x:x in TWINS["Cytoplasm"].unique())]
STRUCTURES_CYTOPLASM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,ACC,PDB,Cytoplasm
1,P0A962,"[2HIM, 2P2D, 2P2N, 6NXC, 6NXD]",P0A962
3,P12994,"[1FJJ, 1VI3]",P12994
4,P21517,[5BN7],P21517
5,P23869,"[1LOP, 2NUL, 2RS4]",P23869


In [26]:
STRUCTURES_PERIPLASM[["Periplasm","PDB"]]= STRUCTURES[STRUCTURES["ACC"].apply(lambda x:x in TWINS["Periplasm"].unique())]
STRUCTURES_PERIPLASM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,ACC,PDB,Periplasm
0,P00805,"[1HO3, 1IHD, 1JAZ, 1JJA, 1NNS, 3ECA, 4ECA, 5MQ...",P00805
2,P0AFL3,"[1CLH, 1J2A, 1V9T, 1VAI]",P0AFL3
6,P45523,"[1Q6H, 1Q6I, 1Q6U, 4QCC]",P45523
7,P77368,[1FUX],P77368
8,P83223,"[1D4C, 1D4D, 1D4E]",P83223
9,Q72EC8,"[2XVX, 2XVY, 2XVZ]",Q72EC8


In [24]:
TWINS

Unnamed: 0,Periplasm_uniref50,Cytoplasm_uniref50,Cytoplasm,Periplasm,Organism,SP_start,SP_end,Cytoplasm_uniref50_size,Periplasm_uniref50_size,Cytoplasm_length,Periplasm_length,length_ratio,length_abs_diff
0,UniRef50_A0A1X1D1X1,UniRef50_Q2NVU4,A0A484WTL8,A0A0A3ZPT1,Enterobacter_cancerogenus,1,35,124,148,171,166,0.77193,39
1,UniRef50_O53021,UniRef50_P83221,P23869,P0AFL3,Escherichia_coli_strain_K12,1,24,4189,2118,164,190,0.982036,3
2,UniRef50_O59651,UniRef50_Q96VT4,Q9ZGM4,Q9WXB9,Legionella_pneumophila,1,27,1678,9906,721,749,0.997234,2
3,UniRef50_P00805,UniRef50_P0A963,P0A962,P00805,Escherichia_coli_strain_K12,1,22,3836,2537,338,348,0.967456,11
4,UniRef50_P0AAL4,UniRef50_P0AAL2,A0A3T8GC16,P0AAL5,Shigella_flexneri,1,41,1270,492,164,231,0.858639,27
5,UniRef50_P0C278,UniRef50_P10902,Q8EH88,P83223,Shewanella_oneidensis_strain_MR-1,1,25,6730,160,537,596,0.938811,35
6,UniRef50_P25718,UniRef50_P21517,P21517,P25718,Escherichia_coli_strain_K12,1,17,4496,5447,604,676,0.915152,56
7,UniRef50_P39185,UniRef50_O33732,O33732,Q080C0,Shewanella_frigidimarina_strain_NCIMB_400,1,27,29,6749,938,829,0.856077,135
8,UniRef50_P44652,UniRef50_P44650,P44650,P44652,Haemophilus_influenzae_strain_ATCC_51907__DSM_...,1,50,243,304,176,279,0.765217,54
9,UniRef50_P45523,UniRef50_P0A9L3,P0A9L3,P45523,Escherichia_coli_strain_K12,1,25,1427,2225,206,270,0.837398,40
