In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict, Counter
from torch import tensor
import torch
import pickle

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.frozen import FrozenEstimator

from modules.one_hot_svm import *
from modules.kmer_svm import *
from modules.feat_imp import *

from joblib import dump, load

In [2]:
# Load the 30 bp training set. The TSV has no header row, so pandas labels the
# columns 0-4. Column 0 is the sequence and column 1 the 0/1 label (both are
# used below); columns 2-4 look like chromosome, start, end — TODO confirm.
train = pd.read_csv('sp1_ml_datasets/train.tsv', sep='\t', header=None)
train

Unnamed: 0,0,1,2,3,4
0,CCTGCGTCACTGGGCACAGACGCCAGTGAG,1,chr1,191486,191516
1,GAGCCAATCAGAACTCGCGGTGGGGGCTGC,1,chr1,778741,778771
2,GGCTCCGGATAATCCGTTTCCGGGTCAACA,1,chr1,827496,827526
3,CTCCCTTGGCAGCTCTCAGCTGTCTGTATC,1,chr1,830917,830947
4,TTGCAGGTCACAAGCAGGCTATCAGctcag,1,chr1,831248,831278
...,...,...,...,...,...
1895,TGAACCCTAGCTGAGCAAAAGTGAGTAAGC,0,chr9,2623828,2623858
1896,ACCATCTAGGACCTCCAGGTGGGAATCGGA,0,chr9,2663638,2663668
1897,GCCAAAACCACTGTATAACATTTCACTTAT,0,chr9,2678698,2678728
1898,TTCGATTGAGGCACATCTAGCTGTTGGAGA,0,chr9,2720435,2720465


In [3]:
# Normalise case: some rows contain lower-case bases (presumably soft-masked
# sequence — TODO confirm), and downstream k-mer handling should see a single
# upper-case alphabet.
train_seqs = [seq.upper() for seq in train[0]]
len(train_seqs)

1900

In [4]:
# Build the 5-mer vocabulary from the training sequences only.
# build_kmer_vocab is a project helper (star-imported); its result is unpacked
# as (vocab, k-mer index) — TODO confirm the exact return types.
# A vocabulary size of 1024 equals 4**5, i.e. consistent with every 5-mer over
# a four-letter alphabet being present.
vocab, train_kmer_idx = build_kmer_vocab(train_seqs, k=5)
len(vocab)

1024

In [5]:
# Featurise the training sequences with the training 5-mer index and take the
# labels from column 1 of the TSV.
# kmer_matrix is a project helper — presumably one row of k-mer counts per
# sequence; TODO confirm.
x_train = kmer_matrix(train_seqs, train_kmer_idx, k=5)
y_train = np.array(train[1])
x_train.shape, y_train.shape

((1900, 1024), (1900,))

In [6]:
# Hyper-parameter search for the linear SVM: 5-fold cross-validation over the
# regularisation strength C, ranking candidates by AUROC.
param_grid = dict(C=[0.01, 0.1, 1, 2])
grid = GridSearchCV(
    estimator=LinearSVC(dual=False, max_iter=5000),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)

In [7]:
# Run the grid search on the 5-mer features. With refit=True (the default, see
# the repr below) the best configuration is refitted on all of x_train.
grid.fit(x_train, y_train)

0,1,2
,estimator,LinearSVC(dua...max_iter=5000)
,param_grid,"{'C': [0.01, 0.1, ...]}"
,scoring,'roc_auc'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [8]:
# Cross-validated AUROC of the best candidate and the C value that achieved it.
grid.best_score_, grid.best_params_

(0.604191135734072, {'C': 0.01})

In [9]:
# Wrap the already-fitted grid search in FrozenEstimator so that
# CalibratedClassifierCV does not refit it and only learns the sigmoid
# calibration on top of its decision scores.
# NOTE(review): the calibrator is fitted on the same x_train/y_train the SVM
# was trained on. scikit-learn recommends disjoint data for fitting and for
# calibrating, so these probabilities may be over-confident — consider a
# held-out calibration split.
cal_svc = CalibratedClassifierCV(FrozenEstimator(grid), method='sigmoid')
cal_svc.fit(x_train, y_train)

0,1,2
,estimator,FrozenEstimat...ng='roc_auc'))
,method,'sigmoid'
,cv,
,n_jobs,
,ensemble,'auto'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [10]:
# Load the held-out 30 bp test set (same headerless layout as the training
# TSV). The preview shows chr20 and chrX rows.
test = pd.read_csv('sp1_ml_datasets/test.tsv', sep='\t', header=None)
test

Unnamed: 0,0,1,2,3,4
0,CTGTTTGCTCTACCTCTCCCCCTCCTAACA,1,chr20,257531,257561
1,ACCGGGGACAATAGCCCCGCGGGAGTGGGG,1,chr20,290412,290442
2,TCCGGGCGCCCCATCCCGCTGCGGACGGAG,1,chr20,297327,297357
3,CCACCAATGCGCTTCGACCCTGCCCTCCTT,1,chr20,310661,310691
4,CCCCAGGACTGGCTCCGCCCCGTGCCCCTG,1,chr20,324557,324587
...,...,...,...,...,...
395,CCCTGACGGCAAGTCCGCCTTCAATAGAAT,0,chrX,3814964,3814994
396,GATTTCAGATTGAAACGGTGATTGGTCTGT,0,chrX,3817640,3817670
397,AGGCAATTCAGGATATAagcagggtggttt,0,chrX,3817862,3817892
398,GGCGGAACCTCAACAAAGGCACTTGAGCTG,0,chrX,3872421,3872451


In [11]:
# Featurise the test set with the *training* k-mer index (train_kmer_idx) so
# the feature columns line up with the ones the model was fitted on.
test_seqs = [seq.upper() for seq in test[0]]
x_test = kmer_matrix(test_seqs, train_kmer_idx, k=5)
y_test = np.array(test[1])
x_test.shape, y_test.shape

((400, 1024), (400,))

In [12]:
# Held-out performance of the calibrated 5-mer model. get_metrics is a project
# helper; it reports the accuracy, AUROC and AUPRC shown below.
get_metrics(cal_svc, x_test, y_test)

Accuracy: 0.55
AUROC: 0.564075
AUPRC: 0.5565199148076931


## Using 200 bp windows centered at the summit as input sequences

In [13]:
# Load the 200 bp training windows.
# NOTE: this rebinds `train` (and, in the cells below, train_seqs, vocab,
# x_train, ...) — the 30 bp objects from the first half are overwritten.
train = pd.read_csv('sp1_ml_datasets/train_200bp.tsv', sep='\t', header=None)
train

Unnamed: 0,0,1,2,3,4
0,CCTCCAGTCTCTGCACACTCCCAGCTGCAGCAGAGCCGGAGGAGAG...,1,chr1,191401,191601
1,CCTCTATGGTGTCGGCGAAGACCCGCCCTTGTGACGTCACGGAAGG...,1,chr1,778656,778856
2,CCGGGTGTGGAGGACGCCGCAGGGAGGGGACTGCGTGGCTGGGTTT...,1,chr1,827411,827611
3,TCTCTGTCCTTCATGGTGCATCCTCCTGTTGACTCCTGACCATCTG...,1,chr1,830832,831032
4,GGAGGAGTTTCTCTTTCCTAGGGTACACGTGGACATGCCTATGACT...,1,chr1,831163,831363
...,...,...,...,...,...
1895,AACAACCAGCAAAAACTCTGTAATATTTATTTTGCTAAAAAAGGAT...,0,chr9,2623743,2623943
1896,AGCACGCTCAAGATCTGAATCTCCCCAGGCAAGAGTAGTGACTAAT...,0,chr9,2663553,2663753
1897,GCCACAAGCAACCCATGAAATCACTCATACTACACACCACCAAAGA...,0,chr9,2678613,2678813
1898,AGGGCTAACTCTTACAGGTCTTGAAGAGATAGCTCAATATGTTAGG...,0,chr9,2720350,2720550


In [14]:
# Upper-case the 200 bp training sequences so k-mer handling sees one alphabet.
train_seqs = [seq.upper() for seq in train[0]]
len(train_seqs)

1900

In [15]:
# Build the 6-mer vocabulary from the 200 bp training sequences.
# A size of 4096 equals 4**6, i.e. consistent with every 6-mer over a
# four-letter alphabet being present.
vocab, train_kmer_idx = build_kmer_vocab(train_seqs, k=6)
len(vocab)

4096

In [16]:
# 6-mer feature matrix for the 200 bp training windows, plus labels from
# column 1 of the TSV.
x_train = kmer_matrix(train_seqs, train_kmer_idx, k=6)
y_train = np.array(train[1])
x_train.shape, y_train.shape

((1900, 4096), (1900,))

In [17]:
# Same search as for the 30 bp model: 5-fold CV over C, scored by AUROC,
# now to be fitted on the 6-mer features of the 200 bp windows.
param_grid = dict(C=[0.01, 0.1, 1, 2])
grid = GridSearchCV(
    estimator=LinearSVC(dual=False, max_iter=5000),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)

In [18]:
# Run the grid search on the 6-mer features of the 200 bp windows.
grid.fit(x_train, y_train)

0,1,2
,estimator,LinearSVC(dua...max_iter=5000)
,param_grid,"{'C': [0.01, 0.1, ...]}"
,scoring,'roc_auc'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [19]:
# Cross-validated AUROC of the best candidate and its C for the 200 bp model.
grid.best_score_, grid.best_params_

(0.8296731301939058, {'C': 0.01})

In [20]:
# Freeze the fitted grid search and learn only the sigmoid calibration on top.
# NOTE(review): calibration is fitted on the same data the SVM was trained on.
# scikit-learn recommends disjoint data for fitting and calibrating, so the
# resulting probabilities may be over-confident.
cal_svc = CalibratedClassifierCV(FrozenEstimator(grid), method='sigmoid')
cal_svc.fit(x_train, y_train)

0,1,2
,estimator,FrozenEstimat...ng='roc_auc'))
,method,'sigmoid'
,cv,
,n_jobs,
,ensemble,'auto'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [21]:
# Load the held-out 200 bp test windows (rebinds `test` from the 30 bp section).
test = pd.read_csv('sp1_ml_datasets/test_200bp.tsv', sep='\t', header=None)
test

Unnamed: 0,0,1,2,3,4
0,CGCCAAGAGAATGCTAGTTTCCCCACATCTCACCAACACTGTCTAG...,1,chr20,257446,257646
1,AAAGAACTTCCATTATCCCCAGCCCAAGGGGCAGCAAGCAGCAATT...,1,chr20,290327,290527
2,AAATCGTCGAGCACCGCTTGCGAAACGCCAAATCCTCGAGAGCGAG...,1,chr20,297242,297442
3,CGCCAGAGGCCCCGCCCCCCAGCTCTGGGGAGTCCACCAATCAGCA...,1,chr20,310576,310776
4,GGTGCCCGAACTGGCTCACACGTGGTGAGCCGGGAGCGCGCGGCCC...,1,chr20,324472,324672
...,...,...,...,...,...
395,GTACTACGGCACGTAGATCAAGGTTGGCCACCCCCCAAGAGAAGTC...,0,chrX,3814879,3815079
396,AATGATTGATGGCCAATTGTGTTGAAGGTGGGGCTGGGAATGCTTT...,0,chrX,3817555,3817755
397,caaattgtgcaggttaAagaaacgttgacaaatgggtggaacaagc...,0,chrX,3817777,3817977
398,ACCCAGATCTGGACCTTTTAGTATGTCATTCGTCATGCTTAGTGAC...,0,chrX,3872336,3872536


In [22]:
# Upper-case the test sequences (the preview above shows lower-case bases).
test_seqs = [s.upper() for s in test[0]]
len(test_seqs)

400

In [23]:
# Featurise the test windows with the training 6-mer index so columns match
# the fitted model; labels come from column 1.
x_test = kmer_matrix(test_seqs, train_kmer_idx, k=6)
y_test = np.array(test[1])
x_test.shape, y_test.shape

((400, 4096), (400,))

In [24]:
# Held-out performance of the calibrated 6-mer / 200 bp model.
get_metrics(cal_svc, x_test, y_test)

Accuracy: 0.7275
AUROC: 0.7979999999999999
AUPRC: 0.7914013957121699


## Assessing motif learning 

In [25]:
# get_weights is a project helper; presumably it pulls the linear SVM's
# per-k-mer coefficient vector out of the calibrated wrapper — TODO confirm.
# Below, weights is indexed by the same positions as vocab.
weights = get_weights(cal_svc)

In [26]:
# Rank feature indices from largest to smallest absolute weight: sort
# ascending by |weight|, then reverse the order.
ascending_idx = np.argsort(np.abs(weights))
imp_idx = np.flip(ascending_idx, axis=0)

In [27]:
# Pair each k-mer with its weight in importance order once, then derive the
# parallel lists and the lookup dict from those pairs.
ranked_pairs = [(vocab[i], weights[i]) for i in imp_idx]
imp_kmers = [kmer for kmer, _ in ranked_pairs]
imp_weights = [weight for _, weight in ranked_pairs]
imp_dict = dict(ranked_pairs)
ranked_pairs[:20]

[('GGGGGG', -0.20114985920554856),
 ('GTTTAC', 0.13303894933648144),
 ('CCCCCC', -0.1304997866340516),
 ('TGACCT', 0.1262421061233719),
 ('CCCTCA', -0.1255865171006991),
 ('TGCTCC', -0.12435046189218325),
 ('TGGGCC', -0.11547435702907431),
 ('CAATTG', -0.11476936637930521),
 ('AGGTCA', 0.1136992045200514),
 ('CCGGCT', 0.11051155723845535),
 ('GGGCGC', -0.10831836506993675),
 ('TGATCT', 0.10806582365012259),
 ('CGCCCG', -0.10716877386962523),
 ('ATAGAG', 0.10715493505296582),
 ('GTTTTT', -0.10707618606524147),
 ('GGGGGC', -0.1053235728662247),
 ('CTCCTC', 0.10426897161266933),
 ('TTTTAG', 0.10420780290330003),
 ('AGGAAG', 0.10353400938459625),
 ('CCCGCC', 0.10264549434629298)]

In [28]:
# Load the JASPAR matrix MA0079.1 via the project helper and transpose it. In
# the output below each row is one motif position (9 rows) with four
# per-base values that sum to 1; column order is presumably A, C, G, T —
# TODO confirm against load_jaspar_pwm.
pwm = load_jaspar_pwm('MA0079.1.jaspar').T
pwm

array([[0.25 , 0.125, 0.5  , 0.125],
       [0.   , 0.   , 1.   , 0.   ],
       [0.   , 0.   , 1.   , 0.   ],
       [0.   , 0.625, 0.25 , 0.125],
       [0.25 , 0.   , 0.5  , 0.25 ],
       [0.   , 0.125, 0.625, 0.25 ],
       [0.   , 0.   , 0.75 , 0.25 ],
       [0.125, 0.125, 0.75 , 0.   ],
       [0.25 , 0.   , 0.   , 0.75 ]])

In [29]:
# Score every k-mer against the PWM. best_pwm_score is a project helper —
# presumably the best-scoring placement of the 6-mer within the 9-position
# motif; TODO confirm how offsets and strand are handled.
kmer_scores = {k:best_pwm_score(k,pwm) for k in imp_kmers}
len(kmer_scores)

4096

In [30]:
# PWM scores of the 50 k-mers with the largest absolute weight.
# NOTE(review): despite its name, top_kmers holds scores, not k-mer strings.
top_kmers = [kmer_scores[k] for k in imp_kmers[:50]]
top_kmers[:10]

[4.125, 1.75, 4.125, 2.375, 3.375, 2.625, 2.5, 1.5, 2.375, 3.25]

In [31]:
# Test statistic: mean PWM score across the top-50 k-mers.
observed = np.mean(top_kmers)
observed

2.5975

In [32]:
# Background distribution for the test statistic. get_background_dist is a
# project helper; from its arguments and the (10000,) result it presumably
# draws 10000 random sets of 50 k-mers and returns each set's mean PWM score —
# TODO confirm.
# NOTE(review): no seed is passed, so unless the helper seeds internally the
# p-value below will vary slightly between runs.
bg_scores = get_background_dist(imp_kmers, 50, kmer_scores)
bg_scores.shape

(10000,)

In [33]:
# Empirical one-sided (upper-tail) p-value: the share of background means at
# least as large as the observed mean, with the +1 correction so the estimate
# can never be exactly zero.
# The denominator is taken from bg_scores itself rather than a hard-coded
# 10000, so it stays correct if the number of background draws changes.
pval = (np.sum(bg_scores >= observed) + 1) / (bg_scores.size + 1)
pval

0.0826917308269173

With p ≈ 0.083 (above the conventional 0.05 threshold), the top 50 k-mers are not significantly enriched for sequences that align well to the canonical SP1 motif (JASPAR MA0079.1).

In [34]:
def has_cg(k):
    """Return True if the k-mer string ``k`` contains the dinucleotide "CG"."""
    return "CG" in k

def bg_cg_fraction(all_kmers, n_samp, n_iter=10000, seed=None):
    """Null distribution of the CG-containing fraction in random k-mer subsets.

    Parameters
    ----------
    all_kmers : sequence of str
        Pool of k-mers to sample from.
    n_samp : int
        Number of k-mers drawn (without replacement) per iteration.
    n_iter : int, default 10000
        Number of random subsets to draw.
    seed : int, numpy SeedSequence/Generator or None, default None
        Passed to ``np.random.default_rng``. ``None`` keeps the previous
        behaviour (fresh OS entropy each call); pass a value to make the
        null distribution reproducible.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_iter``; each entry is the fraction of sampled
        k-mers for which ``has_cg`` is True.

    Raises
    ------
    ValueError
        Raised by ``Generator.choice`` if ``n_samp`` exceeds ``len(all_kmers)``.
    """
    rng = np.random.default_rng(seed)
    # Classify every k-mer once up front. Sampling these flags is equivalent
    # to sampling the k-mers and classifying them afterwards, but avoids
    # re-converting the k-mer list to an array and re-scanning the strings on
    # every iteration.
    cg_flags = np.array([has_cg(k) for k in all_kmers], dtype=bool)
    null_stats = [
        rng.choice(cg_flags, size=n_samp, replace=False).mean()
        for _ in range(n_iter)
    ]
    return np.array(null_stats)

In [40]:
# Observed statistic: fraction of the top-50 k-mers that contain "CG",
# compared against random 50-k-mer subsets drawn from the full ranked list.
# NOTE(review): bg_cg_fraction is called without a seed, so the null
# distribution (and the p-value below) changes slightly on every run.
fraction_top = np.mean([has_cg(k) for k in imp_kmers[:50]])
fraction_nulls = bg_cg_fraction(imp_kmers, 50)

In [41]:
# Empirical upper-tail p-value for CpG enrichment among the top k-mers, with
# the +1 correction. The denominator comes from the null sample itself rather
# than a hard-coded 10000, so it stays in sync with n_iter.
(np.sum(fraction_nulls >= fraction_top) + 1) / (fraction_nulls.size + 1)

0.9991000899910009

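The top 50 k-mers are also not significantly enriched for CpG-containing sequences (upper-tail p ≈ 0.999). A one-sided p-value this close to 1 suggests they may instead be depleted of CpG relative to random k-mer sets; a lower-tail or two-sided test would be needed to confirm that.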