# Does ssRNA bind the protein via the base more than dsRNA does?

In [14]:
# Load the JSON description of each fragment (PDB structure, 2D, sequence...)
# and the .npy coordinates of each fragment.
%run ../search_frag_library/load_data.py

In [21]:
# Define the query functions.
# NOTE(review): the output recorded for this cell is a SyntaxError at query.py line 42,
# yet later cells call query(); re-run on a fresh kernel to confirm both scripts load.
%run ../search_frag_library/query.py
%run ../search_frag_library/requests.py

SyntaxError: invalid syntax (query.py, line 42)

In [16]:
# Define the data schema.
# presumably this provides the `chainschema` passed to query() in later cells -- TODO confirm
%run ../search_frag_library/make_chainschema.py

In [17]:
# Total number of entries in `fragments`.
len(fragments)

464495

In [19]:
# Mask all fully single-stranded ("ss") fragments.
# NOTE(review): the output recorded for this cell is a KeyError: 'pos', but `ss` is used by
# later cells, so the saved output looks stale -- re-run to confirm the query succeeds.
ss = query(chaindata, chainschema, fragments, is_ss, "ss", part=None)

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 4))



KeyError: 'pos'

In [92]:
# Percentage of all fragments flagged by the `ss` mask; %i prints it as a whole number.
percent_ss = 100*sum(ss)/len(fragments)
print('%i %% fragments are fully single-stranded'%percent_ss)

9 % fragments are fully single-stranded


In [5]:
# Mask all fully double-stranded ("ds") fragments.
# NOTE(review): the field argument is "ss" here too; only the predicate (is_ds) differs
# from the single-stranded query -- confirm that this is the intended field.
ds = query(chaindata, chainschema, fragments, is_ds, "ss", part=None)

In [91]:
# Percentage of all fragments flagged by the `ds` mask; %i prints it as a whole number.
percent_ds = 100*sum(ds)/len(fragments)
print('%i %% fragments are fully double-stranded'%percent_ds)

52 % fragments are fully double-stranded


In [81]:
# Select the ss/ds fragments whose central nucleotide binds via the phosphate ("ph").
# NOTE(review): ss_ph / ds_ph are not referenced again in the visible cells; the loop in
# the next cell issues the same "ph" queries for both strand states.
ss_ph = query(chaindata, chainschema, fragments[ss], contact_parts, "interface_protein", part="ph")
ds_ph = query(chaindata, chainschema, fragments[ds], contact_parts, "interface_protein", part="ph")

In [52]:
# Run the contact query for every (strand state, nucleotide part) pair.
# masks[statename, part]  -> NumPy array built from the query result for that pair
# counts[statename, part] -> sum over that array
masks = {}
counts = {}
for part in ["ph", "sug", "base"]:
    for statename, state in [("ss", ss), ("ds", ds)]:
        # Wrap the query result in an array so the per-part masks can later be combined with `|`.
        masks[statename, part] = np.array(
            query(chaindata, chainschema, fragments[state], contact_parts, "interface_protein", part)
        )
        # Vectorised sum: same value as the builtin sum() over the array,
        # without iterating element by element in Python.
        counts[statename, part] = masks[statename, part].sum()
        print(part, statename, counts[statename, part])

('ph', 'ss', 25065)
('ph', 'ds', 102747)
('sug', 'ss', 23699)
('sug', 'ds', 86156)
('base', 'ss', 22043)
('base', 'ds', 68126)


In [49]:
# Count number of fragments binding the protein with any nucleotidic part
def sum_bool(masks, statename):
    """Count the entries of one strand state flagged by at least one part mask.

    Parameters
    ----------
    masks : dict
        Maps ``(statename, part)`` tuples to NumPy boolean arrays; the parts
        "sug", "ph" and "base" must all be present for `statename`.
    statename : str
        First element of the keys to combine (e.g. "ss" or "ds").

    Returns
    -------
    The number of positions at which the "sug", "ph" or "base" mask is true.
    """
    mask = masks[statename, "sug"] | masks[statename, "ph"] | masks[statename, "base"]
    # ndarray.sum() counts the True entries in compiled code; the builtin sum()
    # would walk the array one element at a time in Python.
    return mask.sum()

In [50]:
# Number of ds / ss fragments for which at least one of the sug/ph/base masks is set.
ds_bound = sum_bool(masks, "ds")
ss_bound = sum_bool(masks, "ss")

244526
45960


In [96]:
# Percentage of the ss/ds bound fragments that bind via each part (ph/sug/base).
bound = {"ds": ds_bound, "ss": ss_bound}
percent = {}
for part in ["ph", "sug", "base"]:
    for statename in ["ss", "ds"]:
        # counts[...] already holds the sum of the matching mask, so it is reused here
        # rather than re-summing the mask. `//` makes the whole-number percentage
        # explicit instead of relying on Python 2's integer `/`.
        percent[statename, part] = 100 * counts[statename, part] // bound[statename]
        print('%i%% of %s bound fragments bind via the %s'%(percent[statename, part], statename, part))

85% of ss bound fragments bind via the ph
87% of ds bound fragments bind via the ph
80% of ss bound fragments bind via the sug
73% of ds bound fragments bind via the sug
75% of ss bound fragments bind via the base
58% of ds bound fragments bind via the base


# Are the differences between ssRNA and dsRNA significant?

In [59]:
from scipy.stats import fisher_exact
def fisher(part):
    """Run Fisher's exact test comparing ss and ds bound fragments for one part.

    Builds a 2x2 table with one row per strand state (ss first, then ds);
    each row is [bound fragments counted for `part`, remaining bound fragments].
    Reads the notebook-level `counts`, `ss_bound` and `ds_bound`.

    Returns whatever `fisher_exact` returns for that table.
    """
    table = []
    for hits, total in ((counts["ss", part], ss_bound), (counts["ds", part], ds_bound)):
        # Row layout: [counted for this part, bound but not counted for this part]
        table.append([hits, total - hits])
    return fisher_exact(table)

In [66]:
# Base-contact percentages for ss and ds, shown side by side.
percent["ss","base"], percent["ds","base"]

(75, 58)

In [112]:
# Report, for each part, the ss/ds percentages and the result of Fisher's exact test.
for part in ["ph", "sug", "base"]:
    # fisher() forwards the two values returned by scipy's fisher_exact;
    # they are labelled "ratio" and "p-value" in the printout.
    fi, p = fisher(part)
    perc_ss = percent["ss",part]
    perc_ds = percent["ds",part]
    # %.3g instead of %f: %f prints every p-value below 5e-7 as 0.000000,
    # which hides how small it actually is.
    print("%s: %s%% ssRNA, %s%% dsRNA, ratio %.1f, p-value %.3g"%(part, perc_ss, perc_ds, fi, p))

ph: 85% ssRNA, 87% dsRNA, ratio 0.8, p-value 0.000000
sug: 80% ssRNA, 73% dsRNA, ratio 1.5, p-value 0.000000
base: 75% ssRNA, 58% dsRNA, ratio 2.2, p-value 0.000000
