In [1]:
from glob import glob
import os
from pathlib import Path
import random

import sidechainnet as scn

In [2]:
# Fetch the list of ProteinNet IDs that the on-disk results are checked against.
# NOTE(review): 12 is presumably the CASP version, and "all" / thinning=100 presumably
# select the complete, unthinned set — confirm against the sidechainnet documentation.
pl = scn.get_proteinnet_ids(12, "all", thinning=100)
len(pl)

104323

In [3]:
def do_glob(pth):
    """Return the set of file stems for every path matching the glob pattern ``pth``.

    The stem is the final path component with its last suffix removed
    (``Path(p).stem``).

    Raises:
        AssertionError: if two matched paths reduce to the same stem.
    """
    stems = []
    for matched_path in glob(pth):
        stems.append(Path(matched_path).stem)

    unique_stems = set(stems)
    # Equal sizes mean no stem occurred more than once among the matches.
    assert len(unique_stems) == len(stems), "Duplicates found."
    return unique_stems

In [4]:
# IDs with an unminimized pickle (only the 220511 run directory is inspected).
unmin = do_glob("/net/pulsar/home/koes/jok120/scnmin220511/unmin/*pkl")

# Run 220511: IDs found in min/ and IDs found in failed/.
min1, failed1 = map(
    do_glob,
    (
        "/net/pulsar/home/koes/jok120/scnmin220511/min/*pkl",
        "/net/pulsar/home/koes/jok120/scnmin220511/failed/*",
    ),
)

# Run 220512: same two directories.
min2, failed2 = map(
    do_glob,
    (
        "/net/pulsar/home/koes/jok120/scnmin220512/min/*pkl",
        "/net/pulsar/home/koes/jok120/scnmin220512/failed/*",
    ),
)

In [5]:
# Turn the ID list into a set so it can be compared with the on-disk ID sets
# using set difference/union in the cells below.
all_proteins = set(pl)

In [7]:
# IDs from the full list that have no entry in each run's min/ directory.
missing1 = all_proteins.difference(min1)
missing2 = all_proteins.difference(min2)

In [8]:
# IDs that are absent from min/ and also absent from failed/ for the same run.
should_have_failed1 = missing1.difference(failed1)
should_have_failed2 = missing2.difference(failed2)

In [9]:
# Sizes of the ID sets for the first run, in the order listed.
[
    len(id_set)
    for id_set in (all_proteins, unmin, min1, failed1, missing1, should_have_failed1)
]

[104323, 103969, 46988, 44019, 57336, 13318]

These proteins failed to generate unminimized versions.

In [10]:
# IDs in the full list with no pickle in the unmin/ directory.
did_not_make_unmin = all_proteins.difference(unmin)
len(did_not_make_unmin)

356

These proteins failed to minimize and reported their failure.

In [11]:
# Show how many IDs reported a failure in the first run, plus five random examples.
# random.sample() needs a sequence: passing a set (as failed1 is) has been deprecated
# since Python 3.9 and raises TypeError from 3.11, so sort the IDs into a list first.
len(failed1), random.sample(sorted(failed1), 5)

(44019, ['4ESF_1_A', '4WZ7_11_G', '4IZE_1_A', '3O9O_1_A', '1P5W_2_A'])

These proteins failed to minimize but also did not report their failure.

In [14]:
# First run: IDs with an unminimized pickle that appear in neither min/ nor failed/.
unknown_failures1 = unmin.difference(min1, failed1)
len(unknown_failures1), sorted(unknown_failures1)[:5]

(12962, ['122L_1_A', '123L_1_A', '125L_1_A', '156L_1_A', '157L_1_A'])

In [15]:
# Second run: IDs with an unminimized pickle that appear in neither min/ nor failed/.
unknown_failures2 = unmin.difference(min2, failed2)
len(unknown_failures2), sorted(unknown_failures2)[:5]

(13236, ['120L_1_A', '122L_1_A', '123L_1_A', '191L_1_A', '192L_1_A'])

In [16]:
# Check whether both runs left exactly the same IDs without a min/ or failed/ entry.
unknown_failures1 == unknown_failures2

False

These proteins successfully minimized.

In [17]:
# Minimized counts: first run, second run, and the two runs combined.
len(min1), len(min2), len(min1 | min2)

(46988, 46910, 52873)

In [18]:
# Every ID that was minimized in at least one of the two runs.
all_minimized = min1 | min2

In [19]:
# IDs from the full list that neither run minimized.
remaining = all_proteins.difference(all_minimized)

In [20]:
# Number of IDs still lacking a minimized structure after both runs.
len(remaining)

51451

In [21]:
# Peek at the first five remaining IDs in sorted order.
sorted(remaining)[:5]

['10#1HF2_1_A', '10#2IJR_1_A', '10#3DB9_1_A', '10#3LM3_1_A', '10#3MK8_2_B']

## Find the reason for the failures

In [23]:
import os

In [None]:
# Collect the ProteinNet IDs that have no pickle file in the unminimized directory.
#
# The directory is listed once and reduced to bare IDs. os.listdir() returns file names
# such as "3QO0_3_C.pkl", so testing an ID against the raw listing would never match,
# and re-listing the directory for every protein would repeat the same I/O each time.
unmin_dir = "/net/pulsar/home/koes/jok120/scnmin220511/unmin/"
pickled_ids = {
    Path(fname).stem
    for fname in os.listdir(unmin_dir)
    if fname.endswith("pkl")  # same file selection as the "*pkl" glob used for `unmin`
}

# Sorted so the resulting list is deterministic (all_proteins is an unordered set).
did_not_pickle = sorted(all_proteins - pickled_ids)
len(did_not_pickle)

In [24]:
# Inspect the raw entries of the unminimized directory; the names carry a ".pkl" suffix,
# unlike the suffix-free IDs produced by do_glob().
# NOTE(review): this displays the entire directory listing — consider slicing the result
# (e.g. [:10]) to keep the cell output short.
os.listdir("/net/pulsar/home/koes/jok120/scnmin220511/unmin/")

['3QO0_3_C.pkl',
 '3N56_2_C.pkl',
 '5FGB_3_F.pkl',
 '1T3E_2_P.pkl',
 '3N57_2_C.pkl',
 '4M38_2_E.pkl',
 '1A38_2_P.pkl',
 '1HTM_1_A.pkl',
 '3OA6_4_G.pkl',
 '4BOH_3_M.pkl',
 '4V2W_2_C.pkl',
 '4N4F_2_C.pkl',
 '1Q1S_1_A.pkl',
 '4ZJ7_2_B.pkl',
 '1O6O_2_D.pkl',
 '1TZS_3_X.pkl',
 '5HDT_2_E.pkl',
 '3E50_2_C.pkl',
 '3IFN_3_P.pkl',
 '4WNN_3_T.pkl',
 '4HDQ_3_C.pkl',
 '2B2W_3_D.pkl',
 '1YTV_2_M.pkl',
 '1KO6_2_B.pkl',
 '4EZN_2_C.pkl',
 '3IUR_2_B.pkl',
 '3N9N_2_B.pkl',
 '10#4USL_2_D.pkl',
 '5EKF_1_B.pkl',
 '3BAE_3_A.pkl',
 '4X2H_3_C.pkl',
 '2YVC_2_D.pkl',
 '3EMW_2_B.pkl',
 '3WGX_2_C.pkl',
 '4BJT_2_D.pkl',
 '4EZP_2_C.pkl',
 '3P6Z_3_C.pkl',
 '4HTV_2_B.pkl',
 '1UM2_2_C.pkl',
 '4GGD_2_C.pkl',
 '4FT4_2_P.pkl',
 '4N1A_2_G.pkl',
 '3GZ2_2_P.pkl',
 '3LGE_2_E.pkl',
 '4J6S_2_E.pkl',
 '2UXN_3_E.pkl',
 '4MMY_3_C.pkl',
 '3EGS_3_C.pkl',
 '2ATP_2_E.pkl',
 '4JLQ_2_B.pkl',
 '4GQB_3_C.pkl',
 '4WNL_2_E.pkl',
 '3ZMS_3_C.pkl',
 '1RDT_4_E.pkl',
 '4LCD_2_C.pkl',
 '4KA3_2_B.pkl',
 '4HKC_2_B.pkl',
 '10#4DT7_3_E.pkl',
 '1T7F_2