In [1]:
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection

In [2]:
# Load the dataset from a local CSV file (comma is pandas' default separator)
raw_data = pd.read_csv('TargerF.csv')

In [3]:
# Preview the first five rows of the unfiltered frame
raw_data.head(5)

Unnamed: 0,bin,enhancer_chrom,enhancer_distance_to_promoter,enhancer_end,enhancer_name,enhancer_start,label,promoter_chrom,promoter_end,promoter_name,...,ZNF143 (enhancer),ZNF143 (promoter),ZNF143 (window),ZNF274 (window),ZNF384 (enhancer),ZNF384 (promoter),ZNF384 (window),ZZZ3 (enhancer),ZZZ3 (promoter),ZZZ3 (window)
0,"[22271, 83570.8]",chr1,60682,9686400,GM12878|chr1:9685722-9686400,9685722,1,chr1,9749721,GM12878|chr1:9747084-9749721,...,0.0,0.016543,0.004661,0.0,0.005584,0.0,0.008941,0.0,0.0,0.0
1,"[22271, 83570.8]",chr1,56866,24136600,GM12878|chr1:24136556-24136600,24136556,1,chr1,24194871,GM12878|chr1:24193468-24194871,...,0.0,0.099907,0.00683,0.0,0.199458,0.0,0.000703,0.0,0.0,0.0
2,"[22271, 83570.8]",chr1,56534,24136932,GM12878|chr1:24136600-24136932,24136600,1,chr1,24194871,GM12878|chr1:24193468-24194871,...,0.0,0.099907,0.00687,0.0,0.026434,0.0,0.000552,0.0,0.0,0.0
3,"[22271, 83570.8]",chr1,55591,24137875,GM12878|chr1:24137625-24137875,24137625,1,chr1,24194871,GM12878|chr1:24193468-24194871,...,0.0,0.099907,0.00619,0.0,0.0,0.0,0.000561,0.0,0.0,0.0
4,"[22271, 83570.8]",chr1,54052,24139414,GM12878|chr1:24139145-24139414,24139145,1,chr1,24194871,GM12878|chr1:24193468-24194871,...,0.0,0.099907,0.006366,0.0,0.0,0.0,0.000577,0.0,0.0,0.0


In [4]:
# Columns removed before modelling. 'enhancer_chrom' and 'label' are not listed
# here because later cells still read them (chromosome split, label extraction).
nonpredictors = [
    'enhancer_name',
    'promoter_name',
    'enhancer_start',
    'enhancer_end',
    'promoter_chrom',
    'promoter_start',
    'promoter_end',
    'window_chrom',
    'window_start',
    'window_end',
    'window_name',
    'active_promoters_in_window',
    'interactions_in_window',
    'enhancer_distance_to_promoter',
    'bin',
]
# drop() returns a new frame; raw_data itself is left untouched
data = raw_data.drop(labels=nonpredictors, axis='columns')

In [5]:
# Preview the first five rows after the non-predictor columns were dropped
data.head(5)

Unnamed: 0,enhancer_chrom,label,ATF2 (enhancer),ATF2 (promoter),ATF2 (window),ATF3 (enhancer),ATF3 (promoter),ATF3 (window),BATF (enhancer),BATF (promoter),...,ZNF143 (enhancer),ZNF143 (promoter),ZNF143 (window),ZNF274 (window),ZNF384 (enhancer),ZNF384 (promoter),ZNF384 (window),ZZZ3 (enhancer),ZZZ3 (promoter),ZZZ3 (window)
0,chr1,1,0.085793,0.0,0.003873,0.0,0.0,0.0,0.032474,0.0,...,0.0,0.016543,0.004661,0.0,0.005584,0.0,0.008941,0.0,0.0,0.0
1,chr1,1,1.349429,0.027654,0.003495,0.0,0.0,0.000386,0.0,0.0,...,0.0,0.099907,0.00683,0.0,0.199458,0.0,0.000703,0.0,0.0,0.0
2,chr1,1,0.17884,0.027654,0.002465,0.0,0.0,0.000388,0.0,0.0,...,0.0,0.099907,0.00687,0.0,0.026434,0.0,0.000552,0.0,0.0,0.0
3,chr1,1,0.0,0.027654,0.002507,0.0,0.0,0.000394,0.0,0.0,...,0.0,0.099907,0.00619,0.0,0.0,0.0,0.000561,0.0,0.0,0.0
4,chr1,1,0.0,0.027654,0.002578,0.0,0.0,0.000406,0.0,0.0,...,0.0,0.099907,0.006366,0.0,0.0,0.0,0.000577,0.0,0.0,0.0


In [6]:
#split data to train and test by chromosome (default: 20 chromosomes to train, the rest to test)
import random
def train_test_split(data, n_train=20, seed=None):
    """Split rows into train/test sets so that no chromosome appears in both.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``enhancer_chrom`` column.
    n_train : int, optional
        Number of distinct chromosomes assigned to the training set
        (default 20). All remaining chromosomes form the test set.
    seed : int or None, optional
        When given, the chromosomes are drawn from a private
        ``random.Random(seed)``, so the same split is returned on every call.
        When None (default), the module-level ``random`` generator is used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data_train, data_test)``: the rows whose chromosome was drawn, and
        all other rows.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``n_train`` is larger than the number
        of distinct chromosomes.
    """
    # random.sample needs a sequence (sets are rejected on Python 3.11+), and
    # sorting gives the population a stable order so a seeded draw is repeatable.
    # key=str keeps the sort usable even if the column mixes value types.
    chromosomes = sorted(data.enhancer_chrom.unique(), key=str)
    print("chromosomes: %d" % len(chromosomes))

    rng = random if seed is None else random.Random(seed)
    train_chromosomes = rng.sample(chromosomes, n_train)

    # Compute the membership mask once and reuse it for both halves
    in_train = data.enhancer_chrom.isin(train_chromosomes)
    data_train = data[in_train]
    data_test = data[~in_train]

    # Single formatted string: prints the same under Python 2 and Python 3
    print("train rows: %d, test rows: %d" % (len(data_train), len(data_test)))
    return data_train, data_test

In [7]:
# Seed the module-level `random` generator that train_test_split draws from,
# so the chromosome draw does not start from an arbitrary state on each run.
random.seed(0)
data_train, data_test = train_test_split(data)

23
(40180, 4133)


In [8]:
# Pull the target column out of each split, then strip the columns that must
# not reach the model: the target itself and the chromosome used for splitting.
dropped_columns = ['label', 'enhancer_chrom']

label_train = data_train['label']
label_test = data_test['label']

data_train = data_train.drop(dropped_columns, axis=1)
data_test = data_test.drop(dropped_columns, axis=1)

In [9]:
# Gradient boosting classifier; hyperparameters listed one per line for easy tuning
estimator = GradientBoostingClassifier(
    n_estimators=4000,
    learning_rate=0.1,
    max_depth=5,
    max_features='log2',
    random_state=0,
)

In [10]:
# Fit on the training split, predict class labels for the test split, and report metrics
train_data, test_data, train_labels, test_labels = data_train, data_test, label_train, label_test
estimator.fit(train_data, train_labels)
p = estimator.predict(test_data)

# Each line is built as one string: print("label", value) shows a tuple repr
# when print is a statement (Python 2), while a single formatted string
# prints the same on Python 2 and Python 3.
print("accuracy: %.4f" % metrics.accuracy_score(test_labels, p))
print("precision_score: %.4f" % metrics.precision_score(test_labels, p))
print("recall_score: %.4f" % metrics.recall_score(test_labels, p))
print("f1_score: %.4f" % metrics.f1_score(test_labels, p))

('accuracy: ', 0.9528187757077183)
('precision_score: ', 0.7777777777777778)
('recall_score', 0.035)
('f1_score', 0.06698564593301436)
