In [2]:
import pandas as pd
from os import listdir
import numpy as np
from os.path import isfile, join
from Bio import SeqIO
import re
import seaborn as sns

# Root of the PDB-structure data tree. Every hit directory below is derived
# from it, so relocating the data only requires changing this one value.
HOME_DIR = "../data/pdb_str/"
# One directory of hit files per dataset (trailing slash kept on purpose:
# other code builds file paths by plain string concatenation).
PB_HITS_DIR = HOME_DIR + "pb_hits/"
MK_HITS_DIR = HOME_DIR + "m32k25_hits/"
AA_HITS_DIR = HOME_DIR + "aa_hits/"
CHEM_HITS_DIR = HOME_DIR + "chem_hits/"
CS219_HITS_DIR = HOME_DIR + "cs219_hits/"

In [74]:
def clearEmpty(x):
    """Return the items of ``x`` as a list, with every empty string removed."""
    return list(filter(lambda item: item != '', x))


# Matches any single character that is not an ASCII digit; used to strip
# non-numeric characters out of interval bounds.
regex = re.compile('[^0-9]')

# Table of (x, y) hit pairs that filterByAA uses as an exclusion list.
# NOTE(review): this path is relative to the working directory, unlike the
# "../data/..." paths used elsewhere in the notebook - confirm it is intended.
aa_filter = pd.read_csv("aa_hits_sample.csv")
# Keep only hits of length >= 10. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the original (which triggers SettingWithCopyWarning).
aa_filter = aa_filter[aa_filter.hit_length >= 10].copy()
# Pair keys in both orientations ("x_y" and "y_x"). Vectorised concatenation
# replaces the row-wise apply(axis=1): it yields the same strings and also
# works when the filtered frame is empty.
aa_filter["key"] = aa_filter["x"].astype(str) + "_" + aa_filter["y"].astype(str)
aa_filter["key_revert"] = aa_filter["y"].astype(str) + "_" + aa_filter["x"].astype(str)
# Only rendered when this is the last expression of a cell.
aa_filter.shape

def parse_interval(range):
    """Parse a comma-separated list of "start-end" segments into integer pairs.

    Each segment is split on "-", empty pieces are discarded, and every
    non-digit character is stripped from the two bounds before conversion.
    Anything following a ":" in the end bound is ignored.

    Note: because "-" is the separator and empty pieces are dropped, a leading
    minus sign is lost ("-5-10" parses as [5, 10]).

    Args:
        range: string such as "1-50,60-70" or "1-50:A". The parameter name
            shadows the ``range`` builtin; it is kept so existing keyword
            callers keep working.

    Returns:
        A list of ``[start, end]`` integer pairs, one per segment.

    Raises:
        IndexError: if a segment has fewer than two non-empty pieces.
        ValueError: if a bound contains no digits.
    """
    intervals = []
    for segment in range.split(","):
        bounds = [piece for piece in segment.split("-") if piece != ""]
        start = int(re.sub("[^0-9]", "", bounds[0]))
        # Cut the ":suffix" off BEFORE stripping non-digits. The previous code
        # stripped first, which removed the colon and fused any digits after
        # it into the bound ("50:2" became 502).
        end = int(re.sub("[^0-9]", "", bounds[1].split(":")[0]))
        intervals.append([start, end])
    return intervals

def merge_range(elem):
    """Serialise a sequence of pairs as "a:b,c:d".

    The members of each inner sequence are converted with ``str`` and joined
    by ":"; the resulting pieces are then joined by ",".
    """
    pieces = []
    for pair in elem:
        pieces.append(":".join(map(str, pair)))
    return ",".join(pieces)

def readPDB(path):
    """Return the sequence of the last FASTA record in ``path`` as a string.

    Args:
        path: FASTA file (anything accepted by ``SeqIO.parse``).

    Returns:
        The record's sequence as a plain ``str``. If the file holds several
        records, each one overwrites the previous, so the last record wins.

    Raises:
        ValueError: if the file contains no FASTA records. (Previously this
            surfaced as an UnboundLocalError on the return statement.)
    """
    pdb = None
    for record in SeqIO.parse(path, "fasta"):
        pdb = str(record.seq)
    if pdb is None:
        raise ValueError("no FASTA records found in %s" % path)
    return pdb

def filterByAA(df):
    """Drop rows whose (x, y) pair appears, in either order, in ``aa_filter``.

    A "x_y" key is built for every row and left-joined twice against the
    module-level ``aa_filter`` table: once on its ``key`` column and once on
    its ``key_revert`` column. Rows that matched in either join are removed.

    Args:
        df: DataFrame with ``x`` and ``y`` columns. It is not modified.

    Returns:
        A new DataFrame with the surviving rows. It still carries the helper
        key columns produced by the merges (they are never dropped).
    """
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # "key" column as a side effect. The vectorised concatenation produces the
    # same "x_y" strings as the former row-wise apply and works on empty input.
    keyed = df.assign(key=df["x"].astype(str) + "_" + df["y"].astype(str))
    aa_keys = aa_filter[["key", "key_revert"]]
    # Forward orientation: key_revert stays NaN for rows absent from aa_filter.
    merged = pd.merge(keyed, aa_keys, on="key", how="left")
    # Reverse orientation: both sides now share column names, so pandas adds
    # the _x (left) / _y (right) suffixes referenced below.
    merged = pd.merge(merged, aa_keys, left_on="key", right_on="key_revert", how="left")
    unmatched = merged.key_revert_x.isna() & merged.key_revert_y.isna()
    return merged[unmatched]

def filterEqChain(df):
    """Drop rows whose ``x`` and ``y`` identifiers share the same prefix.

    The prefix is the identifier with its last three characters removed
    (presumably the PDB entry part of ids such as "12asA00" - confirm against
    the id format).

    Args:
        df: DataFrame with string columns ``x`` and ``y``. It is not modified.

    Returns:
        The rows of ``df`` whose two prefixes differ, with the same columns
        and index labels as the input.
    """
    # Keep the prefixes in local Series instead of writing temporary columns
    # into ``df``: the former approach mutated the caller's frame and raised
    # SettingWithCopyWarning when ``df`` was itself a filtered slice.
    prefix_x = df.x.apply(lambda r: r[0:-3])
    prefix_y = df.y.apply(lambda r: r[0:-3])
    return df[prefix_x != prefix_y]
    

def countFileBy(i, F, key):
    """Count the filtered rows of one hit file per value of ``key``.

    The file is read with ``pd.read_csv``, passed through ``filterByAA`` and
    ``filterEqChain``, and grouped by ``key``. The counts are also written to
    the statistics temp directory as "<file name>.csv".

    Args:
        i: value printed next to the path as a progress marker.
        F: path of the hit file; the text after the last "/" names the output.
        key: column (or columns) to group by.

    Returns:
        DataFrame with the group key column(s) and a count column ``N``.
    """
    print(i, F)
    hits = pd.read_csv(F)

    print("filter by AA")
    hits = filterByAA(hits)

    print("filter eq Chain")
    hits = filterEqChain(hits)

    per_key = hits.groupby(key).size().reset_index(name="N")
    file_name = F.split("/")[-1]
    per_key.to_csv("../data/pdb_str/statistics/temp/%s.csv" % file_name, index=False)
    return per_key

In [83]:
def histogram_hit_length(source, target):
    """Aggregate per-hit row counts over every file in ``source``.

    Each regular file in ``source`` is processed by ``countFileBy`` (grouped
    by the ``hit`` column); the per-file counts are summed per hit and written
    to "../data/pdb_str/statistics/<target>".

    Args:
        source: directory holding the hit files, with or without a trailing
            slash.
        target: file name of the CSV written to the statistics directory.

    Raises:
        ValueError: from ``pd.concat`` when ``source`` contains no files.
    """
    # Sorted listing gives a reproducible processing order. Non-file entries
    # (e.g. checkpoint sub-directories) are skipped instead of being handed to
    # read_csv.
    candidates = [join(source, name) for name in sorted(listdir(source))]
    files = [path for path in candidates if isfile(path)]
    counts = [countFileBy(i, path, "hit") for i, path in enumerate(files)]
    df = pd.concat(counts)
    # The same hit can occur in several files, so sum the partial counts.
    df = df.groupby("hit").N.sum().reset_index(name="N")
    df.to_csv("../data/pdb_str/statistics/%s" % target, index=False)

In [84]:
# Build the per-hit histogram for one dataset at a time: uncomment the call
# for the dataset to process and leave the others commented out.
# NOTE(review): the log saved under this cell lists files from cs219_hits, so
# it comes from a CS219 run, not from the MK call enabled here - re-run the
# cell to refresh the output.
# histogram_hit_length(PB_HITS_DIR, "pb_histogram_count_by_hit.csv")
histogram_hit_length(MK_HITS_DIR, "m32k25_histogram_count_by_hit.csv")
# histogram_hit_length(AA_HITS_DIR, "aa_histogram_count_by_hit.csv")
# histogram_hit_length(CHEM_HITS_DIR, "chem_histogram_count_by_hit.csv")
# histogram_hit_length(CS219_HITS_DIR, "cs219_histogram_count_by_hit.csv")

0 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_ah.gz
filter by AA
filter eq Chain
1 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_ai.gz
filter by AA
filter eq Chain
2 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_aj.gz
filter by AA
filter eq Chain
3 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_ak.gz
filter by AA
filter eq Chain
4 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_al.gz
filter by AA
filter eq Chain
5 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_am.gz
filter by AA
filter eq Chain
6 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_an.gz
filter by AA
filter eq Chain
7 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_ao.gz
filter by AA
filter eq Chain
8 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_ap.gz
filter by AA
filter eq Chain
9 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_aq.gz
filter by AA
filter eq Chain
10 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_ar.gz
filter by AA
filter eq Chain
11 ../data/pdb_str/cs219_hits/hits_xaz_cath_pairs_as.gz
filter by AA
filter

filter by AA
filter eq Chain
97 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_aa.gz
filter by AA
filter eq Chain
98 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ab.gz
filter by AA
filter eq Chain
99 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ac.gz
filter by AA
filter eq Chain
100 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ad.gz
filter by AA
filter eq Chain
101 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ae.gz
filter by AA
filter eq Chain
102 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_af.gz
filter by AA
filter eq Chain
103 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ag.gz
filter by AA
filter eq Chain
104 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ah.gz
filter by AA
filter eq Chain
105 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ai.gz
filter by AA
filter eq Chain
106 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_aj.gz
filter by AA
filter eq Chain
107 ../data/pdb_str/cs219_hits/hits_xbd_cath_pairs_ak.gz
filter by AA
filter eq Chain
108 ../data/pdb_str/cs219_hi

192 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_ar.gz
filter by AA
filter eq Chain
193 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_as.gz
filter by AA
filter eq Chain
194 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_at.gz
filter by AA
filter eq Chain
195 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_au.gz
filter by AA
filter eq Chain
196 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_av.gz
filter by AA
filter eq Chain
197 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_aw.gz
filter by AA
filter eq Chain
198 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_ax.gz
filter by AA
filter eq Chain
199 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_ay.gz
filter by AA
filter eq Chain
200 ../data/pdb_str/cs219_hits/hits_xbg_cath_pairs_az.gz
filter by AA
filter eq Chain
201 ../data/pdb_str/cs219_hits/hits_xbh_cath_pairs_aa.gz
filter by AA
filter eq Chain
202 ../data/pdb_str/cs219_hits/hits_xbh_cath_pairs_ab.gz
filter by AA
filter eq Chain
203 ../data/pdb_str/cs219_hits/hits_xbh_cath_pairs_ac.

filter by AA
filter eq Chain
288 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_aj.gz
filter by AA
filter eq Chain
289 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_ak.gz
filter by AA
filter eq Chain
290 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_al.gz
filter by AA
filter eq Chain
291 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_am.gz
filter by AA
filter eq Chain
292 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_an.gz
filter by AA
filter eq Chain
293 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_ao.gz
filter by AA
filter eq Chain
294 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_ap.gz
filter by AA
filter eq Chain
295 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_aq.gz
filter by AA
filter eq Chain
296 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_ar.gz
filter by AA
filter eq Chain
297 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_as.gz
filter by AA
filter eq Chain
298 ../data/pdb_str/cs219_hits/hits_xbk_cath_pairs_at.gz
filter by AA
filter eq Chain
299 ../data/pdb_str/cs219

In [103]:
# Re-aggregate the per-file count tables stored in the statistics temp
# directory into one count per hit.
source = "../data/pdb_str/statistics/temp/"
# List the inputs from ``source`` itself. The cell used to read a ``files``
# variable defined in a different cell, so its result depended on execution
# order and ``source`` was never used.
temp_files = [join(source, name) for name in sorted(listdir(source))]
temp_files = [path for path in temp_files if isfile(path)]
dfs = [pd.read_csv(path) for path in temp_files]
df = pd.concat(dfs)
# The same hit can occur in several per-file tables, so sum the counts.
df = df.groupby("hit").N.sum().reset_index(name="N")
df.head()

Unnamed: 0,hit,N
0,!,23381223
1,!!,300564
2,!!!,2135
3,!!!!,39
4,!!!!/,1


In [105]:
# Persist the aggregated per-hit counts without the index column. The output
# file name is hard-coded for the cs219 dataset.
df.to_csv("../data/pdb_str/statistics/cs219_histogram_count_by_hit.csv", index=False)

In [8]:
# Directory of the m32k25 hit files (same value as the constant defined in
# the first cell).
MK_HITS_DIR = "../data/pdb_str/m32k25_hits/"
# Path of every directory entry, built by prefixing the directory string.
files = [MK_HITS_DIR + entry for entry in listdir(MK_HITS_DIR)]

Unnamed: 0,x,y,hit_length,hit
0,12asA00,132lA00,7,UUUUUUV
1,12asA00,153lA00,20,UUUUUUUUUUUUUUUUUUUU
2,12asA00,155cA00,9,KVUUUUUUU
3,12asA00,16pkA02,14,UUUUUUUUUUUUUU
4,12asA00,16vpA00,21,VUUUUUUUUUUUUUUUUUUUU


In [11]:
# Scan the hit files in order and stop at the first one that contains the
# exact hit string "AABLPWNAA". ``_df`` is left holding the matching rows of
# that file, or an empty frame from the last file when nothing matched.
for f in files:
    _df = pd.read_csv(f)
    _df = _df.loc[_df["hit"] == "AABLPWNAA"]
    if len(_df) > 0:
        break

In [12]:
# Display the rows left in ``_df`` by the search loop in the previous cell.
_df

Unnamed: 0,x,y,hit_length,hit
212035,1iwmA00,1u14A00,9,AABLPWNAA
