In [16]:
import numpy as np, pandas as pd

from modules.gkm_to_kmers import *
from modules.feat_imp import *

In [5]:
# Load the per-sequence importance scores. The file is tab-separated with no
# header row, so the three column names are supplied explicitly.
# As displayed below, `raw_contrib` is a ';'-separated string with four
# comma-separated values per group (presumably one group per position with
# one value per base -- TODO confirm ordering).
exp = pd.read_csv(
    'ctcf_ml_datasets/ls-gkm/gkmexp_pos_impscores.txt',
    sep='\t',
    header=None,
    names=['seq_id', 'total_score', 'raw_contrib'],
)
exp

Unnamed: 0,seq_id,total_score,raw_contrib
0,Chr20:209112-209162,0.687623,"0,0,-0.00430696,0;0,0,-0.0058605,0;0,-0.001097..."
1,Chr20:267130-267180,0.329949,"0,0,-0.00485222,0;0,0,0,-0.00224563;0,0,-0.008..."
2,Chr20:290750-290800,0.217206,"0,0,0.00285889,0;0,0.00208094,0,0;0,0.00659347..."
3,Chr20:330740-330790,1.162030,"0,0,0,0.00193958;0,0,0.00279447,0;0,0,0.004610..."
4,Chr20:354618-354668,2.034350,"0,0,0.000939664,0;0.00139033,0,0,0;0,0,0.01349..."
...,...,...,...
395,ChrX:14039481-14039531,0.764577,"-0.00271244,0,0,0;0,0,-0.00227391,0;0,-0.00491..."
396,ChrX:14095821-14095871,0.996715,"0,0,0,-0.000562098;0,0,0,0.00166636;0,0,0.0084..."
397,ChrX:14528864-14528914,-0.248347,"0,0.00541292,0,0;0,0,0,0.00487365;0,0,0,0.0026..."
398,ChrX:14697644-14697694,0.885247,"0.000541073,0,0,0;0,0.00856637,0,0;0,0,0,0.027..."


In [6]:
# Read the positive test-set FASTA. `fa_dict` comes from one of the star-imported
# project modules; presumably it returns a mapping of sequence id -> sequence
# (TODO confirm against the helper's definition).
seqs = fa_dict('ctcf_ml_datasets/ls-gkm/test_pos.fa')

In [7]:
# Collapse the per-sequence contribution scores into per-k-mer values for k=6.
# The helper returns two objects that are used later as mappings keyed by
# k-mer string (`kmer_imp[k]`, `kmer_counts[k]`): a score per k-mer and a
# matching count per k-mer (presumably summed importance and number of
# occurrences -- TODO confirm in modules).
kmer_imp, kmer_counts = exp_to_kmer_imp(exp, seqs, k=6)

In [8]:
# Number of distinct k-mers that have an entry in `kmer_imp`.
len(kmer_imp)

3652

In [10]:
# Show the 20 k-mers with the largest-magnitude score in `kmer_imp`.
# Ranking ignores the sign, but the displayed value keeps it.
sorted(
    kmer_imp.items(),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)[:20]

[('AGGGGG', 33.275970336),
 ('AGAGGG', 32.40303419999999),
 ('CCCTCT', 28.917012713000005),
 ('GCCACC', 28.676624480000005),
 ('CACCAG', 25.547688471999997),
 ('CTAGTG', 24.345899271500002),
 ('CAGGGG', 22.984863435440005),
 ('GGGGGC', 22.603668166000002),
 ('CCCCCT', 22.467795230999997),
 ('CCACCA', 21.4249725107),
 ('CCACCT', 20.8471203),
 ('GAGGGC', 20.833836049000002),
 ('GCCCTC', 19.544821442999996),
 ('CAGCAG', 19.472080291000005),
 ('GGTGGC', 19.127339582999998),
 ('CTGGTG', 18.673929915000002),
 ('TAGTGG', 17.92034275),
 ('CAGAGG', 17.600743198000004),
 ('AGGTGG', 17.274458185670003),
 ('CTGCTG', 16.497807183000006)]

In [12]:
# Normalise each k-mer's score by its entry in `kmer_counts`, so that k-mers
# with large counts do not dominate the ranking purely through frequency.
norm_kmer_imp = {
    kmer: total / kmer_counts[kmer]
    for kmer, total in kmer_imp.items()
}
len(norm_kmer_imp)

3652

In [13]:
# Rank every k-mer by the magnitude of its normalised score, largest first.
# Kept as a full list of (kmer, score) pairs because later cells slice it.
top_norm_kmers = list(norm_kmer_imp.items())
top_norm_kmers.sort(key=lambda pair: abs(pair[1]), reverse=True)
top_norm_kmers[:20]

[('ACTAGG', 0.8087205472727272),
 ('ACCAGG', 0.7596865585714285),
 ('CACCTA', 0.7521879350000001),
 ('CACCAG', 0.751402602117647),
 ('AGGGGG', 0.7394660074666667),
 ('AGGTGG', 0.7197690910695834),
 ('CCACCT', 0.7188662172413793),
 ('TCTAGT', 0.718581720225),
 ('CTAGTG', 0.7160558609264707),
 ('ATCTAG', 0.7036906010000001),
 ('GCCACC', 0.6827767733333334),
 ('ACGAGG', 0.6757165),
 ('AGAGGG', 0.6750632124999999),
 ('ACCAGA', 0.6710164707142857),
 ('CCACCA', 0.669530390959375),
 ('CACTAG', 0.6684117790833333),
 ('TAGAGG', 0.6663178534666668),
 ('CACGAG', 0.6636408666666668),
 ('ACCTAG', 0.663441890357143),
 ('CCCCCT', 0.6608175067941175)]

## Alignment to ENCODE PWM

In [15]:
# Every k-mer that has a score, in the mapping's own iteration order.
all_kmers = [kmer for kmer in kmer_imp]
all_kmers[:5]

['GGCCTG', 'GCCTGC', 'CCTGCA', 'CTGCAA', 'TGCAAA']

In [17]:
# Load the motif matrix from a JASPAR-format file and transpose it; the
# displayed result has one row per motif position and four columns
# (presumably A, C, G, T -- TODO confirm column order in load_jaspar_pwm).
# NOTE(review): the section header says "ENCODE PWM" but the file name is a
# JASPAR matrix id (MA0139.1) -- confirm which source is intended.
pwm = load_jaspar_pwm('MA0139.1.jaspar').T
pwm

array([[0.18291347, 0.15881709, 0.45345016, 0.20481928],
       [0.30777656, 0.05366922, 0.49178532, 0.14676889],
       [0.06133625, 0.8762322 , 0.0230011 , 0.03943045],
       [0.00876232, 0.9890471 , 0.        , 0.00219058],
       [0.81489595, 0.01423877, 0.07119387, 0.09967141],
       [0.04381161, 0.57831325, 0.36582694, 0.01204819],
       [0.11732456, 0.4747807 , 0.05263158, 0.35526316],
       [0.93311404, 0.0120614 , 0.03508772, 0.01973684],
       [0.00548847, 0.        , 0.99121844, 0.00329308],
       [0.36553238, 0.00329308, 0.62129528, 0.00987925],
       [0.05927552, 0.01317234, 0.5532382 , 0.37431394],
       [0.01318681, 0.        , 0.97802198, 0.00879121],
       [0.06153846, 0.00879121, 0.85164835, 0.07802198],
       [0.11441144, 0.80638064, 0.00550055, 0.07370737],
       [0.40924092, 0.01430143, 0.55775578, 0.01870187],
       [0.09030837, 0.530837  , 0.33810573, 0.0407489 ],
       [0.12885463, 0.35462555, 0.08039648, 0.43612335],
       [0.44273128, 0.19933921,

In [19]:
# Score every k-mer against the motif matrix. `best_pwm_score` is a project
# helper; presumably it returns the best match over all alignments of the
# k-mer along the motif (TODO confirm).
kmer_scores = {
    kmer: best_pwm_score(kmer, pwm)
    for kmer in all_kmers
}
len(kmer_scores)

3652

In [20]:
# Build the null distribution of PWM scores from the full k-mer set.
# The second argument (50) presumably is the number of k-mers drawn per
# null sample and must match the top-50 slice used for the observed
# statistic in a later cell -- TODO confirm against get_background_dist.
# The result is a 1-D array (10000 entries in the recorded output).
# NOTE(review): if the helper samples randomly, no seed is set anywhere in
# the visible notebook, so the null distribution may not be reproducible.
bg_scores = get_background_dist(all_kmers, 50, kmer_scores)
bg_scores.shape

(10000,)

In [31]:
# PWM scores of the 50 k-mers with the highest-magnitude normalised score.
# `top_norm_kmers` holds (kmer, score) pairs; only the k-mer is needed here.
top_kmer_pwm_scores = [
    kmer_scores[kmer]
    for kmer, _norm_imp in top_norm_kmers[:50]
]
top_kmer_pwm_scores[:20]

[4.294100114606082,
 4.413617658465731,
 2.9583928616381256,
 4.78136947603438,
 4.92853628572402,
 4.749612026668038,
 3.9309094327244334,
 2.9674595537597632,
 3.421026340615799,
 3.0194419569780653,
 4.225054524317365,
 3.9914685356587145,
 4.672773387809641,
 4.157854760551352,
 4.66638323629446,
 4.661851932174731,
 4.176388194056026,
 4.359220353227363,
 2.880909982558082,
 4.109833691780415]

In [41]:
# One-sided empirical test: is the mean PWM score of the top k-mers higher
# than expected under the null distribution in `bg_scores`?
observed = np.mean(top_kmer_pwm_scores)

# Derive the number of null draws from the array itself instead of
# hard-coding 10000, so the p-value stays correct if the size of the
# background distribution changes.
n_null = bg_scores.size

# The +1 in numerator and denominator counts the observed statistic as one
# of the draws, so the estimate can never be exactly zero.
pval = (np.sum(bg_scores >= observed) + 1) / (n_null + 1)

print(f"Observed mean score: {observed:.3f}")
print(f"Null mean ± std: {bg_scores.mean():.3f} ± {bg_scores.std():.3f}")
print(f"p-value: {pval:.3e}")

Observed mean score: 3.990
Null mean ± std: 3.126 ± 0.071
p-value: 9.999e-05
