In [1]:
#| default_exp one_hot_svm

In [4]:
#| export
import pickle
from torch import tensor
import torch
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

In [5]:
import numpy as np, pandas as pd

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.frozen import FrozenEstimator

In [3]:
# Load the training table. It is read as tab-separated with header=None, so the
# columns get integer names; the cells below use column 0 as the sequence and
# column 1 as the label.
train = pd.read_csv('ctcf_ml_datasets/train_dinuc.tsv', sep='\t', header=None)
train

Unnamed: 0,0,1,2,3,4
0,CTCCTCAGTTTCTCTGTGCAGCACCAGGTGGCAGCAGAGGTCAGCA...,1,chr1,267981,268031
1,CCCTCCCGCGGCTCCGGAGCCGGCTGCCACCAGGGGGCGCGCCCGC...,1,chr1,869894,869944
2,CACTCCGCCACCAGGGGGCGCCACAGCTCCTCGCGCCGCCGCCTCC...,1,chr1,904763,904813
3,TCCTGCTTGGGGCAGCGTGGAGGCCAGCGGAGGAACTGCAGGAGCC...,1,chr1,913001,913051
4,TCCTCCCTCTGGCGGCGGGAGGcaggctccagcctcagcccagcGG...,1,chr1,921192,921242
...,...,...,...,...,...
3795,GTGCACACAGTGTGCGGCTGTGGTGACGCCCTTATTCTCCTCCAGA...,0,chr19,2951162,2951212
3796,atgaaCACAGGGCAGACCACTATGACCATGCGTCAATTTTTGGAAA...,0,chr19,2956589,2956639
3797,CATCAATTCCCTACATTAGATTGACAAGGAGGCTGGTAGGTTATAC...,0,chr19,2962235,2962285
3798,GGTTTGGAGGGGCGGAGAGTCCAGGGGCCCTCCAGCTGCCCGATCC...,0,chr19,3057552,3057602


In [7]:
#| export
def one_hot(seqs):
    """One-hot encode DNA sequences into a flat 2-D integer tensor.

    Each sequence is upper-cased and every base becomes a 4-vector in A, C, G, T
    order; 'N' and any other character map to all zeros. The per-sequence
    (4, length) matrix is flattened channel-major, so the row for a sequence of
    length L has 4*L entries: all A indicators, then C, then G, then T.

    Args:
        seqs: iterable of equal-length strings (list, pandas Series, generator, ...).

    Returns:
        Tensor of shape (n_sequences, 4 * length). An empty input yields an
        empty tensor of shape (0, 0) instead of raising.

    Raises:
        RuntimeError: from torch.stack if the sequences differ in length.
    """
    nt_dict = {'A': [1,0,0,0], 'C': [0,1,0,0], 'G': [0,0,1,0], 'T': [0,0,0,1], 'N': [0,0,0,0]}
    unknown = [0,0,0,0]

    # Materialise once so generators work and len() is always available.
    seqs = list(seqs)
    if not seqs:
        # torch.stack rejects an empty list; return a well-formed empty batch.
        return torch.empty((0, 0), dtype=torch.long)

    encoded_seqs = [
        # (length, 4) -> (4, length) so the flattened row is channel-major.
        tensor([nt_dict.get(nt, unknown) for nt in s.upper()]).T
        for s in seqs
    ]
    return torch.stack(encoded_seqs).reshape(len(seqs), -1)

In [8]:
# Encode column 0 (sequences) as flattened one-hot features and wrap column 1
# (labels) in a tensor.
x_train = one_hot(train[0])
y_train = tensor(train[1])
# Show both shapes as a sanity check.
x_train.shape, y_train.shape

(torch.Size([3800, 200]), torch.Size([3800]))

In [9]:
# Shuffle features and labels with the same permutation. The permutation comes
# from a locally seeded generator so the row order (and therefore the
# cross-validation splits and reported metrics) is reproducible on a fresh
# kernel, without touching torch's global RNG state.
shuffle_rng = torch.Generator().manual_seed(0)
idx = torch.randperm(x_train.shape[0], generator=shuffle_rng)
x_train, y_train = x_train[idx], y_train[idx]
x_train.shape, y_train.shape, y_train[:10]

(torch.Size([3800, 200]),
 torch.Size([3800]),
 tensor([1, 0, 0, 0, 0, 0, 1, 0, 0, 0]))

In [10]:
# Candidate regularisation strengths for the linear SVM.
param_grid = {'C': [0.01, 0.1, 1, 2]}

# 5-fold cross-validated search over C, ranking candidates by ROC AUC.
grid = GridSearchCV(
    estimator=LinearSVC(dual=False, max_iter=5000),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)

In [11]:
# Run the cross-validated search over C on the shuffled training data.
grid.fit(x_train, y_train)

0,1,2
,estimator,LinearSVC(dua...max_iter=5000)
,param_grid,"{'C': [0.01, 0.1, ...]}"
,scoring,'roc_auc'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [12]:
# Best cross-validated score (scoring='roc_auc') and the C value that produced it.
grid.best_score_, grid.best_params_

(0.5583462603878117, {'C': 0.01})

In [13]:
# Wrap the already-fitted search in FrozenEstimator and fit a sigmoid
# calibrator on top of it, which gives the model a predict_proba.
# NOTE(review): the calibrator is fitted on the same x_train/y_train that
# grid.fit used; scikit-learn's docs recommend calibrating on data disjoint
# from the training data -- consider a held-out split.
cal_svc = CalibratedClassifierCV(FrozenEstimator(grid), method='sigmoid')
cal_svc.fit(x_train, y_train)

0,1,2
,estimator,FrozenEstimat...ng='roc_auc'))
,method,'sigmoid'
,cv,
,n_jobs,
,ensemble,'auto'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [14]:
# Load the test table with the same settings as the training table
# (tab-separated, header=None, integer column names).
test = pd.read_csv('ctcf_ml_datasets/test_dinuc.tsv', sep='\t', header=None)
test

Unnamed: 0,0,1,2,3,4
0,GGCCTGCAAAATATCCTGTAACTCCTCTAGAGGGTGCTCACTAGCA...,1,chr20,209112,209162
1,GTGCCAGTCTCTACCGCCATCTGCCGGCAGCACTCGGTACACCCGC...,1,chr20,267130,267180
2,GCCAGCGTCTCGGTCTCCAAGGAAACGCGACGCCGCCTACCGAGCG...,1,chr20,290750,290800
3,TGGGGTCTCTGGGGCCACCTAGTGGCTAAGAGGAGTAGAACCAAGG...,1,chr20,330740,330790
4,gAGGCGTCATCTCTCTTCTGTACCACTAGAGGGAGCTCTGATGCAG...,1,chr20,354618,354668
...,...,...,...,...,...
795,AAGCCAAGGTGTAGCACCCCGGGTCAATATGCTGATAACCAATTGC...,0,chrX,14039481,14039531
796,TCAGACGTGCATGGCACCAGTGAGCCGCAGGTGAACAGATGTTCTG...,0,chrX,14095821,14095871
797,CTCCCGCTGTTTGAGAATTGGAGCCATGAACAGGAGCAAAGAAGGC...,0,chrX,14528864,14528914
798,ACCTTTACAGGCTGTCTGCACCCTCCTATGTGATACTGCCTGGGCA...,0,chrX,14697644,14697694


In [15]:
# Encode the test sequences (column 0) and labels (column 1) the same way as
# the training data.
x_test = one_hot(test[0])
y_test = tensor(test[1])
# Show both shapes as a sanity check.
x_test.shape, y_test.shape

(torch.Size([800, 200]), torch.Size([800]))

In [16]:
#|export
def get_metrics(model, x_test, y_test):
    """Print accuracy, AUROC and AUPRC of `model` on a test set.

    Hard labels come from `model.predict`. Ranking scores are the second
    column of `model.predict_proba`; if that raises AttributeError,
    `model.decision_function` is used instead. Nothing is returned; the three
    metrics are only printed.
    """
    hard_labels = model.predict(x_test)

    # Prefer class-1 probabilities; fall back to raw margins.
    try:
        scores = model.predict_proba(x_test)[:, 1]
    except AttributeError:
        scores = model.decision_function(x_test)

    # Compute everything before printing so a failing metric prints nothing.
    accuracy = accuracy_score(y_test, hard_labels)
    auroc = roc_auc_score(y_test, scores)
    auprc = average_precision_score(y_test, scores)

    for label, value in (("Accuracy:", accuracy), ("AUROC:", auroc), ("AUPRC:", auprc)):
        print(label, value)

In [17]:
# Evaluate the calibrated baseline model on the test set.
get_metrics(cal_svc, x_test, y_test)

Accuracy: 0.50125
AUROC: 0.5059437499999999
AUPRC: 0.516415917441171


## Reverse complement augmentation

In [18]:
#| export
def revcomp(seqs):
    """Return the reverse complement of each DNA sequence in `seqs`.

    Sequences are upper-cased, reversed, and complemented base by base.
    Besides A/C/G/T the IUPAC ambiguity codes are complemented (N stays N,
    R<->Y, K<->M, B<->V, D<->H, S and W map to themselves). Any other
    character is passed through unchanged rather than raising KeyError, so
    sequences that `one_hot` accepts (e.g. ones containing 'N') can be
    augmented as well.

    Args:
        seqs: iterable of strings.

    Returns:
        List of upper-case reverse-complement strings, in input order.
    """
    comp_table = str.maketrans('ACGTRYSWKMBDHVN', 'TGCAYRSWMKVHDBN')
    # str.translate leaves characters missing from the table untouched.
    return [seq.upper()[::-1].translate(comp_table) for seq in seqs]

In [19]:
# Reverse-complement every training sequence (column 0), keeping the original row order.
rc = revcomp(train[0])

In [20]:
# Append the reverse-complement encodings and a second copy of the labels.
# one_hot(rc) and train[1] are both in the original file order, so the
# appended rows and labels stay aligned with each other.
# NOTE(review): x_train/y_train are rebuilt from themselves, so re-running
# this cell appends the augmentation again.
# NOTE(review): a sequence and its reverse complement are separate rows and
# may land in different CV folds downstream -- confirm whether that is intended.
x_train = torch.vstack((x_train, one_hot(rc)))
y_train = torch.hstack((y_train, tensor(train[1])))
y_train[:10]

tensor([1, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [21]:
# Sanity check: shapes of the augmented features and labels.
x_train.shape, y_train.shape

(torch.Size([7600, 200]), torch.Size([7600]))

In [22]:
# Shuffle the augmented set so originals and reverse complements are mixed.
# The permutation comes from a locally seeded generator so the row order (and
# therefore the cross-validation splits and reported metrics) is reproducible
# on a fresh kernel, without touching torch's global RNG state.
shuffle_rng = torch.Generator().manual_seed(0)
idx = torch.randperm(x_train.shape[0], generator=shuffle_rng)
x_train, y_train = x_train[idx], y_train[idx]
x_train.shape, y_train.shape, y_train[:20]

(torch.Size([7600, 200]),
 torch.Size([7600]),
 tensor([0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0]))

In [23]:
# Same 5-fold, ROC-AUC-scored search over C as the baseline, built as a
# separate object for the reverse-complement-augmented data.
rc_grid = GridSearchCV(
    estimator=LinearSVC(dual=False, max_iter=5000),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)

In [24]:
# Run the cross-validated search over C on the augmented, shuffled training data.
rc_grid.fit(x_train, y_train)

0,1,2
,estimator,LinearSVC(dua...max_iter=5000)
,param_grid,"{'C': [0.01, 0.1, ...]}"
,scoring,'roc_auc'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [25]:
# Wrap the already-fitted augmented search in FrozenEstimator and fit a
# sigmoid calibrator on top of it.
# NOTE(review): the calibrator is fitted on the same x_train/y_train that
# rc_grid.fit used; scikit-learn's docs recommend calibrating on data disjoint
# from the training data -- consider a held-out split.
cal_rc = CalibratedClassifierCV(FrozenEstimator(rc_grid), method='sigmoid')
cal_rc.fit(x_train, y_train)

0,1,2
,estimator,FrozenEstimat...ng='roc_auc'))
,method,'sigmoid'
,cv,
,n_jobs,
,ensemble,'auto'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,False
,tol,0.0001
,C,0.01
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [26]:
# Evaluate the calibrated, augmentation-trained model on the same test set.
get_metrics(cal_rc, x_test, y_test)

Accuracy: 0.52125
AUROC: 0.516575
AUPRC: 0.5177549803540585


In [6]:
# Run nbdev's notebook export on this notebook file, with './modules' as the
# output location.
# NOTE(review): presumably this writes the cells marked `#| export` to the
# module named by `#| default_exp` -- confirm against the nbdev version in use.
import nbdev.export as nb
nb.nb_export('00_ctcf_svm.ipynb', './modules')