In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict, Counter
from torch import tensor
import torch
import pickle

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.frozen import FrozenEstimator

from modules.one_hot_svm import *
from modules.kmer_svm import *
from modules.feat_imp import *

from joblib import dump, load

In [2]:
# Training split: tab-separated with no header row, so pandas labels the columns 0..4.
# Judging by the rendered rows: 0 = sequence, 1 = binary label, 2-4 = chrom/start/end.
train = pd.read_csv(
    'ctcf_ml_datasets/train_dinuc.tsv',
    sep='\t',
    header=None,
)
train

Unnamed: 0,0,1,2,3,4
0,CTCCTCAGTTTCTCTGTGCAGCACCAGGTGGCAGCAGAGGTCAGCA...,1,chr1,267981,268031
1,CCCTCCCGCGGCTCCGGAGCCGGCTGCCACCAGGGGGCGCGCCCGC...,1,chr1,869894,869944
2,CACTCCGCCACCAGGGGGCGCCACAGCTCCTCGCGCCGCCGCCTCC...,1,chr1,904763,904813
3,TCCTGCTTGGGGCAGCGTGGAGGCCAGCGGAGGAACTGCAGGAGCC...,1,chr1,913001,913051
4,TCCTCCCTCTGGCGGCGGGAGGcaggctccagcctcagcccagcGG...,1,chr1,921192,921242
...,...,...,...,...,...
3795,GTGCACACAGTGTGCGGCTGTGGTGACGCCCTTATTCTCCTCCAGA...,0,chr19,2951162,2951212
3796,atgaaCACAGGGCAGACCACTATGACCATGCGTCAATTTTTGGAAA...,0,chr19,2956589,2956639
3797,CATCAATTCCCTACATTAGATTGACAAGGAGGCTGGTAGGTTATAC...,0,chr19,2962235,2962285
3798,GGTTTGGAGGGGCGGAGAGTCCAGGGGCCCTCCAGCTGCCCGATCC...,0,chr19,3057552,3057602


In [3]:
# Some rows contain lower-case bases (presumably soft-masked sequence), so everything
# is upper-cased before k-mers are extracted.
train_seqs = list(map(str.upper, train[0]))
len(train_seqs)

3800

In [4]:
# Build the k-mer vocabulary from the training sequences only; the same kmer_idx is
# reused later to featurise the test split, so both matrices share one column layout.
# presumably vocab is the ordered k-mer list and kmer_idx maps k-mer -> column -- TODO confirm in modules.kmer_svm
vocab, kmer_idx = build_kmer_vocab(train_seqs)

In [5]:
# Peek at the first ten vocabulary entries to sanity-check the k-mer length and ordering.
vocab[:10]

['AAAAAA',
 'AAAAAC',
 'AAAAAG',
 'AAAAAT',
 'AAAACA',
 'AAAACC',
 'AAAACG',
 'AAAACT',
 'AAAAGA',
 'AAAAGC']

In [6]:
# Featurise the training sequences against the k-mer vocabulary and pull the labels
# (column 1) out as a standalone array; the shapes are shown as a sanity check.
x_train = kmer_matrix(train_seqs, kmer_idx)
y_train = train[1].to_numpy(copy=True)
x_train.shape, y_train.shape

((3800, 4093), (3800,))

In [7]:
# Largest single entry anywhere in the feature matrix (gives a feel for the feature scale).
x_train.max()

15.0

In [8]:
# Shuffle the training rows. The file appears to be ordered (positives first, grouped
# by chromosome) and GridSearchCV(cv=5) below builds its folds without shuffling, so
# the row order directly affects the cross-validation splits.
# A fixed seed makes the permutation -- and every downstream score -- reproducible
# across Restart & Run All; the previous torch.randperm call was unseeded.
# A NumPy permutation is used so the NumPy arrays are indexed with a NumPy index array.
# NOTE: this rebinds x_train / y_train, so run it exactly once after the feature cell.
SHUFFLE_SEED = 0
idx = np.random.default_rng(SHUFFLE_SEED).permutation(x_train.shape[0])
x_train, y_train = x_train[idx], y_train[idx]
y_train[:20]

array([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0])

In [9]:
# Hyper-parameter search over the LinearSVC regularisation parameter C only,
# ranked by AUROC across 5 cross-validation folds.
# NOTE(review): the recorded run selected C=0.01, the smallest candidate, so the
# optimum may lie below this grid -- consider adding smaller values.
param_grid = {'C': [0.01, 0.1, 1, 2]}
grid = GridSearchCV(
    estimator=LinearSVC(max_iter=5000, dual=False),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)

In [10]:
# Run the grid search on the shuffled training data. refit is left at its default
# (True in the rendered parameters), so the best setting is refitted on all of x_train.
grid.fit(x_train, y_train)

0,1,2
,estimator,LinearSVC(dua...max_iter=5000)
,param_grid,"{'C': [0.01, 0.1, ...]}"
,scoring,'roc_auc'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [11]:
# Mean cross-validated score (AUROC, per scoring='roc_auc') of the winning setting,
# together with the setting itself.
grid.best_score_, grid.best_params_

(0.8015360110803323, {'C': 0.01})

In [12]:
# Wrap the already-fitted grid search in FrozenEstimator so that calibration does not
# refit it, then learn a sigmoid (Platt) mapping from its decision scores to probabilities.
# NOTE(review): the calibrator is fitted on the same rows the SVM was trained on;
# scikit-learn's calibration guide recommends disjoint data for the two steps, so the
# resulting probabilities may be over-confident. Consider a held-out calibration split.
frozen_grid = FrozenEstimator(grid)
cal_svc = CalibratedClassifierCV(frozen_grid, method='sigmoid')
cal_svc.fit(x_train, y_train)

0,1,2
,estimator,FrozenEstimat...ng='roc_auc'))
,method,'sigmoid'
,cv,
,n_jobs,
,ensemble,'auto'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [13]:
# Held-out test split, same headerless TSV layout as the training file
# (integer column labels 0..4).
test = pd.read_csv(
    'ctcf_ml_datasets/test_dinuc.tsv',
    sep='\t',
    header=None,
)
test

Unnamed: 0,0,1,2,3,4
0,GGCCTGCAAAATATCCTGTAACTCCTCTAGAGGGTGCTCACTAGCA...,1,chr20,209112,209162
1,GTGCCAGTCTCTACCGCCATCTGCCGGCAGCACTCGGTACACCCGC...,1,chr20,267130,267180
2,GCCAGCGTCTCGGTCTCCAAGGAAACGCGACGCCGCCTACCGAGCG...,1,chr20,290750,290800
3,TGGGGTCTCTGGGGCCACCTAGTGGCTAAGAGGAGTAGAACCAAGG...,1,chr20,330740,330790
4,gAGGCGTCATCTCTCTTCTGTACCACTAGAGGGAGCTCTGATGCAG...,1,chr20,354618,354668
...,...,...,...,...,...
795,AAGCCAAGGTGTAGCACCCCGGGTCAATATGCTGATAACCAATTGC...,0,chrX,14039481,14039531
796,TCAGACGTGCATGGCACCAGTGAGCCGCAGGTGAACAGATGTTCTG...,0,chrX,14095821,14095871
797,CTCCCGCTGTTTGAGAATTGGAGCCATGAACAGGAGCAAAGAAGGC...,0,chrX,14528864,14528914
798,ACCTTTACAGGCTGTCTGCACCCTCCTATGTGATACTGCCTGGGCA...,0,chrX,14697644,14697694


In [14]:
# Featurise the test sequences with the vocabulary built from the training data
# (kmer_idx), so the columns line up with x_train; labels come from column 1.
test_seqs = list(map(str.upper, test[0]))
x_test = kmer_matrix(test_seqs, kmer_idx)
y_test = test[1].to_numpy(copy=True)
x_test.shape, y_test.shape

((800, 4093), (800,))

In [15]:
# Evaluate the calibrated model on the held-out test split. get_metrics comes from one
# of the star-imported project modules; per the rendered output it reports accuracy,
# AUROC and AUPRC.
get_metrics(cal_svc, x_test, y_test)

Accuracy: 0.75625
AUROC: 0.83080625
AUPRC: 0.8381132219078224


In [16]:
# Extract per-feature weights from the calibrated model (get_weights is a project helper;
# how it unwraps the calibration / grid-search layers is not visible here -- TODO confirm).
# The shape check should show one weight per vocabulary k-mer.
weights = get_weights(cal_svc)
weights.shape

(4093,)

In [17]:
# Rank features by absolute weight, largest first (ascending argsort, then reversed).
weight_magnitudes = np.abs(weights)
imp_idx = np.flip(weight_magnitudes.argsort())

In [18]:
# Re-order the weights and their k-mers by importance, pair them up, and show the top 20.
imp_weights = weights[imp_idx]
imp_kmers = [vocab[col] for col in imp_idx]
imp_kmer_list = [(kmer, weight) for kmer, weight in zip(imp_kmers, imp_weights)]
imp_kmer_list[:20]

[('AGGGGG', 0.36290740516052866),
 ('GGGGGG', -0.2847403910838223),
 ('CCCTCT', 0.26601101688362905),
 ('GGTGGC', 0.25494988394491686),
 ('CCACCT', 0.2525180239765296),
 ('CCCCCT', 0.21861121428552888),
 ('CACCAG', 0.2017559796685214),
 ('TAGAGG', 0.20174456160694956),
 ('TAGGTG', 0.19389207366186728),
 ('CAGAAG', 0.19328882221547322),
 ('CCTCTC', -0.19217616958308312),
 ('CCCCCC', -0.19121569022276727),
 ('GAGGGC', 0.18820266361374755),
 ('AGGTGG', 0.18309693033978333),
 ('CTAGAG', 0.17997420187018887),
 ('CCATCT', 0.1777382159890629),
 ('CTAGTG', 0.17357907179178006),
 ('AGAGGG', 0.17276786103887412),
 ('TAGTGG', 0.17244806132666232),
 ('GGAGCG', -0.1637493072921417)]

In [19]:
# Load the JASPAR matrix MA0139.1 and transpose it into the orientation that
# best_pwm_score is called with below (which axis is position vs. base is not
# visible here -- TODO confirm against load_jaspar_pwm).
pwm = load_jaspar_pwm('MA0139.1.jaspar').T

In [20]:
# Score every vocabulary k-mer against the PWM once, keyed by the k-mer string,
# so later cells can look scores up instead of recomputing them.
kmer_scores = {
    kmer: best_pwm_score(kmer, pwm)
    for kmer in imp_kmers
}

In [23]:
# The 50 highest-|weight| k-mers; the same count (50) is passed to get_background_dist below.
top_kmers = imp_kmers[:50]

In [24]:
# Test statistic: mean PWM score over the top k-mers.
top_scores = [kmer_scores[kmer] for kmer in top_kmers]
observed = np.mean(top_scores)
observed

3.85455656922008

In [25]:
# Background (null) distribution for the statistic above. The second argument, 50,
# matches the size of top_kmers.
# presumably each entry is the mean PWM score of a random draw of 50 k-mers -- TODO confirm in get_background_dist
# NOTE(review): no random seed is visible at this call site; check whether the helper seeds its sampling.
bg_scores = get_background_dist(imp_kmers, 50, kmer_scores)

In [31]:
# One-sided empirical p-value (are top kmers MORE motif-like?)
# The +1 in numerator and denominator counts the observed statistic as one of the
# draws, so the estimate can never be exactly zero.
# The denominator uses the actual number of background draws instead of a hard-coded
# 10000, so the p-value stays correct if get_background_dist returns a different count.
n_draws = len(bg_scores)
p_value = (np.sum(bg_scores >= observed) + 1) / (n_draws + 1)

print(f"Observed mean score: {observed:.3f}")
print(f"Null mean ± std: {bg_scores.mean():.3f} ± {bg_scores.std():.3f}")
print(f"P-value: {p_value:.3e}")

Observed mean score: 3.855
Null mean ± std: 3.114 ± 0.070
P-value: 9.999e-05
