# Train an XGBoost model

The focus of this notebook is to train a model on one dataset and use it to predict on another from a different cell line. My interest here is to check whether I can improve our ability to classify by using the k-mer scores as follows:
 - Create a positive and a negative set from one ChIP-seq cell line
     - Score the sequence using:
         - E-scores
         - frequency difference
         - a kmer-based model generated by training the sequences / using kmer counts of all the 8-mers in each set
         
     - Additional features:
         - DNA shape
         - Proximity to TSS
         - Evolutionary conservation
     - Train a model using the above features
     - Use the model generated to classify a given set of sequences
     - Perform feature importance studies
 
 To achieve the above, the following functions are needed:
     1. A scoring function for all the sequences
     2. A quick way to get the counts of the k-mers
     3. A quick way to get the DNA-shape features



## Import the useful modules

In [1]:
from multiprocessing import Pool, cpu_count
import subprocess
import pandas as pd
import numpy as np
from math import  exp
import seaborn as sns
import glob
import os

import pybedtools
import pyBigWig
import pysam
pd.set_option('display.max_colwidth', -1)



In [2]:
import matplotlib.pyplot as plt

# Main SVM module and grid search function
from sklearn import svm, grid_search

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
#For partitioning the data
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score, KFold

#Libsvm format data loading
from sklearn.datasets import load_svmlight_file

#Accuracy metrics
from sklearn.metrics import accuracy_score, classification_report, auc

# Creating an learning pipeline
from sklearn.pipeline import Pipeline

from sklearn import feature_selection

from sklearn.externals import joblib

#from xgboost import XGBClassifier

import xgboost as xgb

%matplotlib inline

### Import the core functions for training

In [3]:
from XGB_TFBSContext import *

# First set the path to essential, but large files

### 1. The DNA Shape files
Downloaded from:

In [4]:
# Directory holding the DNA shape files (machine-specific absolute path).
shape_path = "/home/kipkurui/Dream_challenge/DNAShape"

### 2. The human genome

In [5]:
# Location of the human genome data (machine-specific absolute path).
human_genome = "/home/kipkurui/Dream_challenge/annotations"

### 3. Uniformly processed ChIP-seq peaks

In [6]:
# Directory with the uniformly processed ChIP-seq peak files
# (machine-specific absolute path).
chipseq_path = "/home/kipkurui/Project/MARS/Data/ChIP-seq/Downloaded"

In [7]:
#BASE_DIR = "/home/kipkurui/Dream_challenge/DreamChallenge"

### 2. Create kmer dictionaries for the features of interest
We have two options here:
1. Background noise scaled in a similar manner to sticky k-mers
1. Preferred k-mers, max normalized

In [8]:
# Load the two k-mer score dictionaries that the feature builders below use
# as extra scoring tables. kmer_name is overwritten by the second call.
dn_hg_dict, kmer_name = get_kmer_dict_rev("Data/dn_hg_max_normalized.txt", "test")

# NOTE(review): several feature builders reference a dictionary named
# hg_dn_dict, which is not defined in the visible cells; it may have been
# meant to be this one -- confirm.
dn_hg_dict2, kmer_name = get_kmer_dict_rev("Data/hg_dn_backround_noise_minmax.txt", "test")

### 3. Score the sequences of interest

#### a) K_mer score

### Run all data preparation steps

In [9]:
def get_feature_df(tf, pos):
    """
    Build the full feature table for one peak file of a TF.

    tf  -- TF name, passed to get_peak_files and get_contigmers.
    pos -- index into the list returned by get_peak_files(tf)
           (callers use 0 for training and -1 for testing).

    Returns (feature_frame, trim_to). Callers label the first trim_to rows
    as positives and the following trim_to rows as negatives.

    The column insertion order below is the feature order seen by the models.
    """
    peak_files = get_peak_files(tf)

    combined_bed, trim_to = get_combined_bed(peak_files[pos])

    # Only the first entry returned by get_contigmers is used for scoring.
    E_score_dict, kmer_name = get_contigmers_dict(get_contigmers(tf)[0],"test")

    ## Calculate all the necessary features
    #E_score_combined = get_kmer_score(combined_bed, sum_kmer_score, E_score_dict)

    feature_frame = pd.DataFrame()
    feature_frame["sum_kmer_score"] = get_kmer_score(combined_bed, sum_kmer_score, E_score_dict)
    feature_frame ["max_kmer_score"] = get_kmer_score(combined_bed, max_score_kmer, E_score_dict)
    # Each element of test_score is expanded into columns; column 0 is stored
    # as the hit position and the whole table is handed to get_hits_df.
    test_score = get_kmer_score(combined_bed, max_score_kmer_pos, E_score_dict)
    double_deal = test_score.apply(pd.Series)
    feature_frame ["max_kmer_score_pos"] = double_deal[0]
    hits_df = get_hits_df(double_deal, combined_bed)
    feature_frame["dnase"] = apply_get_max_dnase(hits_df)
    feature_frame["phatsCons"] = apply_get_phatscon(hits_df)
    feature_frame["phyloP100way"] = apply_get_phatscon(hits_df, "phyloP100way")
    
    feature_frame["dn_hg_score"] = get_kmer_score(combined_bed, max_score_kmer, dn_hg_dict)
    # NOTE(review): hg_dn_dict is not defined in the visible cells (only
    # dn_hg_dict and dn_hg_dict2 are) -- confirm it is provided elsewhere.
    feature_frame["hg_dn_score"] = get_kmer_score(combined_bed, max_score_kmer, hg_dn_dict)
#     feature_frame["pwm_score"] = get_kmer_score(combined_bed, energyscore, get_motif_details(tf))
    feature_frame.reset_index(drop=True, inplace=True)
    # TSS distance is computed separately for the first and last trim_to rows
    # and then re-joined with a fresh 0..n-1 index so it aligns with
    # feature_frame.
    pos_tss = get_distance_to_tss(hits_df.head(trim_to))
    neg_tss = get_distance_to_tss(hits_df.tail(trim_to))
    pos_neg_tss = pos_tss.append(neg_tss)
    pos_neg_tss.reset_index(drop=True, inplace=True) 
    feature_frame["tss_dist"] = pos_neg_tss
    for shape in "ProT MGW HelT Roll".split():
        #feature_frame["%s_shape" % shape] = apply_get_shape(hits_df, shape)
        # NOTE(review): `shape` is only used to name the columns; it is not
        # passed to apply_get_full_shape, so all four groups may hold the
        # same values -- confirm against the helper's signature.
        feature_fr = apply_get_full_shape(hits_df).apply(pd.Series)
        feature_fr.columns = get_shape_names(shape)
        # Transpose/append/transpose adds the shape columns to the right of
        # the existing feature columns.
        feature_frame = feature_frame.T.append(feature_fr.T).T
    return feature_frame, trim_to

Complete Feature list

In [10]:
# Feature (group) names that the ablation cells pass one at a time to
# pop_this. A shape name such as 'Roll' stands for the columns
# 'Roll_0' .. 'Roll_7', which pop_this removes together.
# NOTE(review): 'kmer_score' and 'dn_hg_score2' are not columns created by
# get_feature_df (it creates 'sum_kmer_score' and 'hg_dn_score'); pop_this
# silently ignores names it cannot find -- confirm these entries are intended.
feat_list = ['max_kmer_score','dnase','kmer_score',"phatsCons",
 'Roll', 'ProT', 'MGW', 'HelT',
 'max_kmer_score_pos','dn_hg_score',
 'dn_hg_score2',"tss_dist", "phyloP100way"]

In [12]:
def pop_this(feat):
    """
    Remove a feature from the module-level list ``all_feats`` in place.

    If ``feat`` itself is in the list, only that entry is removed.
    Otherwise the numbered columns ``feat_0`` .. ``feat_7`` are removed in
    order, stopping at the first one that is missing. Unknown names are
    ignored silently.
    """
    if feat in all_feats:
        all_feats.remove(feat)
        return

    for suffix in range(8):
        column = feat + "_%i" % suffix
        if column not in all_feats:
            # Same as the original: the first missing column ends the removal.
            break
        all_feats.remove(column)

### 4. Test the different machine learning models

A stand-alone implementation of this is *test_xgb_svm_gbc_sgd.py*

Using this note, we can test a variety of machine learning models. 

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Accuracy metrics
from sklearn.metrics import accuracy_score, classification_report, auc

from sklearn import feature_selection

from sklearn.externals import joblib

from xgboost import XGBClassifier

import xgboost as xgb

def train_sgd(feature_frame, feature_frame_p, y_train, y_test):
    """
    Train a linear SGD classifier and return its ROC AUC on the held-out set.

    feature_frame   -- training feature matrix (one row per sequence).
    feature_frame_p -- feature matrix of the set to predict; same columns.
    y_train, y_test -- binary labels for the two matrices.

    Returns the ROC AUC (float) of the classifier's decision scores on
    feature_frame_p.
    """
    # Imported locally so the function does not rely on the name leaking in
    # through a star import.
    from sklearn.metrics import roc_auc_score

    # Learn the min/max on the training data only and apply that same
    # transformation to the test data. Refitting the scaler on the test set
    # would put the two sets on different scales.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(feature_frame)
    X_test = scaler.transform(feature_frame_p)

    clf = SGDClassifier()
    clf.fit(X_train, y_train)

    # ROC AUC needs a continuous ranking score. Hard 0/1 predictions collapse
    # the curve to a single operating point.
    scores = clf.decision_function(X_test)

    return roc_auc_score(y_test, scores)

def train_svm(feature_frame, feature_frame_p, y_train, y_test):
    """
    Train a default SVC and return its ROC AUC on the held-out set.

    feature_frame   -- training feature matrix (one row per sequence).
    feature_frame_p -- feature matrix of the set to predict; same columns.
    y_train, y_test -- binary labels for the two matrices.

    Returns the ROC AUC (float) of the SVM decision values on
    feature_frame_p.
    """
    # Imported locally so the function does not rely on the name leaking in
    # through a star import.
    from sklearn.metrics import roc_auc_score

    # Fit the scaler on the training data only and reuse it for the test
    # data, so both sets share one scale.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(feature_frame)
    X_test = scaler.transform(feature_frame_p)

    clf = svm.SVC()
    clf.fit(X_train, y_train)

    # Use the signed distance to the separating surface as the ranking score.
    # Hard class labels would reduce the ROC curve to one point.
    scores = clf.decision_function(X_test)

    return roc_auc_score(y_test, scores)

def train_xgb(feature_frame, feature_frame_p, y_train, y_test):
    """
    Train an XGBoost booster with fixed parameters and return its ROC AUC.

    The booster is fitted on (feature_frame, y_train) with xgb.train's
    default number of rounds and scored on (feature_frame_p, y_test) using
    the values returned by predict().
    """
    booster_params = {
        'eta': 0.3,
        'seed': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'objective': 'binary:logistic',
        'max_depth': 6,
        'min_child_weight': 1,
    }

    train_matrix = xgb.DMatrix(feature_frame, y_train)
    booster = xgb.train(booster_params, train_matrix)

    test_matrix = xgb.DMatrix(feature_frame_p, y_test)
    return roc_auc_score(y_test, booster.predict(test_matrix))

def train_gradient(feature_frame,feature_frame_p,y_train, y_test):
    """
    Train a default GradientBoostingClassifier and return its ROC AUC.

    feature_frame   -- training feature matrix (one row per sequence).
    feature_frame_p -- feature matrix of the set to predict; same columns.
    y_train, y_test -- binary labels for the two matrices.

    Returns the ROC AUC (float) of the predicted positive-class probability
    on feature_frame_p.
    """
    # Imported locally so the function does not rely on the name leaking in
    # through a star import.
    from sklearn.metrics import roc_auc_score

    clf = GradientBoostingClassifier()
    clf.fit(feature_frame, y_train)

    # Rank by the probability of the positive class (second column). Hard
    # 0/1 predictions would collapse the ROC curve to a single point and make
    # the score incomparable with the XGBoost result.
    positive_proba = clf.predict_proba(feature_frame_p)[:, 1]

    return roc_auc_score(y_test, positive_proba)

In [None]:
# Compare the four classifiers on the feature frames currently in memory and
# write one row of AUC values per TF to ../Results/test.txt.
# NOTE(review): feature_frame, feature_frame_p, y_train and y_test are not
# built in this cell; they must already exist from an earlier cell.
with open("../Results/test.txt", "w") as tf_scores:

    # Header row. An explicit list is used because splitting
    # "sgd, svms, xgboost, gradient" on whitespace left the commas attached
    # to the column names.
    tf_scores.write("Tf_name\t")
    for j in ["sgd", "svms", "xgboost", "gradient"]:
        tf_scores.write("%s\t" % j)
    for tf in ["Max"]:
        tf_scores.write("\n%s\t" % tf)
        print(tf)

        # Ensure the disk is not filled up by temp files
        pybedtools.cleanup()
        # Missing feature values are replaced by 0 before training.
        feature_frame = feature_frame.fillna(0)
        feature_frame_p = feature_frame_p.fillna(0)

        sgd = train_sgd(feature_frame, feature_frame_p, y_train, y_test)
        svms = train_svm(feature_frame, feature_frame_p, y_train, y_test)
        xgboost = train_xgb(feature_frame, feature_frame_p, y_train, y_test)
        gradient = train_gradient(feature_frame, feature_frame_p, y_train, y_test)

        # Same order as the header columns above.
        for mod in [sgd, svms, xgboost, gradient]:
            tf_scores.write("%.4f\t" % mod)

### 4. Train a model using the data

In [None]:
# Leave-one-feature-out evaluation: for every TF, train on the first peak
# file and test on the last one, first with all columns and then with each
# entry of feat_list removed in turn. One AUC per model is written per row.
# NOTE(review): repeat_tfs, get_feature_best and train_xgboost are not defined
# in the visible cells -- confirm they are provided elsewhere.
with open("../Results/TF_scores_feature_importance_recursive_all.txt", "a") as tf_scores:
    
    # The file is opened in append mode, so this header is written again on
    # every run of the cell.
    tf_scores.write("Tf_name\tAll\t")
    for j in feat_list:
        tf_scores.write("%s\t" % j)
    for tf in repeat_tfs:
        tf_scores.write("\n%s\t" % tf)
        #tf_feats.write("\n%s\t" % tf)
        print tf

        # Index 0 = training peak file, index -1 = test peak file.
        feature_frame, trim_to = get_feature_best(tf, 0)
        feature_frame_p,trim_to_p =  get_feature_best(tf, -1)
        # Labels: trim_to ones followed by trim_to zeros.
        y_train = np.concatenate((np.ones(trim_to), np.zeros(trim_to)), axis=0)
        y_test = np.concatenate((np.ones(trim_to_p), np.zeros(trim_to_p)), axis=0)
        
        all_feats = list(feature_frame.columns)
        
        #All
        my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
        testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)
        y_pred = my_model.predict(testdmat)
        tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))
        
        for feats in feat_list:
            # Start from the full column list each time; pop_this edits the
            # global all_feats in place.
            all_feats = list(feature_frame.columns)
            pop_this(feats)
            my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
            
            testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)

            y_pred = my_model.predict(testdmat)

            tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

In [17]:
# Read the PBM/ChIP details table: collect the first column of every data row
# in pbm_chip and map the second column to the first in pbmchip2name.
pbm_chip, pbmchip2name = [], {}
with open("../Data/Pbm_Chip_details.txt") as details:
    for record in details:
        # The header row starts with 'Tf_id' and is skipped.
        if not record.startswith('Tf_id'):
            columns = record.split()
            pbm_chip.append(columns[0])
            pbmchip2name[columns[1]] = columns[0]

#### List of TFs affected by Sticky k-mers

In [8]:
# Keep only the TFs from names.txt that have at least one file under their
# Posneg directory.
sticky_tfs = pd.read_table("../Data/names.txt", header=None)
# Needs ChIP-seq files converted to posneg format. Provide path here
posneg_pattern = "../Data/ChIP-seq/Derived/Posneg/%s/*"
tf_list = []
for tf in sticky_tfs[0]:
    chip_list = glob.glob(posneg_pattern % tf.capitalize())
    if chip_list:
        tf_list.append(tf)

In [10]:
# Hard-coded TF list; when this cell is run it replaces any tf_list computed
# from names.txt.
tf_list = ['Foxa2', 'Gata3', 'Max', 'Tcf3', 'Tcf7l2', 'Irf3', 'Irf4',
 'Hnf4a', 'Nr2f2', 'Rxra', 'Egr1', 'Sp4']

#### Get a list of TFs available in both PBM and ChIP-seq with more than one peak file

In [2]:
# Select the TFs that have a ChIP-seq name, at least one contigmer file and
# more than one peak file.
tf_pbm = pd.read_table("../Data/Pbm_Chip_details.txt")

# Explicit missing-value tests replace the `column > 0` trick, which only
# works through Python 2's str/int ordering and raises on Python 3.
has_chip = tf_pbm["Chip_name"].notnull()
has_pbm = tf_pbm["Pbm_name"].notnull()

# Rows with BOTH names present. The previous `==` also kept rows where both
# were missing.
in_both = tf_pbm[has_chip & has_pbm]
chip_name = tf_pbm[has_chip]["Chip_name"]
chip_name = chip_name.sort_values()

in_both_new = []
for tf in chip_name:
    if len(get_contigmers(tf)) > 0 and len(get_peak_files(tf)) > 1:
        #print get_contigmers(tf)
        in_both_new.append(tf)
#Remove Taf1 -- Wrongly picked above
# Guarded so the cell does not fail when Taf1 was not selected at all.
if "Taf1" in in_both_new:
    in_both_new.remove("Taf1")

In [23]:
# Hard-coded list of TF names.
# NOTE(review): presumably a saved copy of a previous in_both_new selection;
# it is not referenced by the visible cells -- confirm.
in_both_new2 = ['Ap2', 'Arid3a', 'Egr1', 'Elk1', 'Elk4', 'Ets1', 'Gabp', 'Gata3',
 'Gr', 'Hnf4a', 'Irf3', 'Jund', 'Mafk', 'Max', 'Pou2f2', 'Rxra', 'Sp1', 'Srf',
 'Tbp', 'Tcf7l2']

In [31]:
# Shortlist of feature names; not referenced by the visible cells.
best = ['max_kmer_score',"phatsCons",'dn_hg_score','dnase', "tss_dist"]

In [111]:
# Reduced feature list; when this cell is run it replaces the earlier
# feat_list and is the input for the overal_list subsets built below.
feat_list = [
 'max_kmer_score',"phatsCons",
 'dn_hg_score2',
 'dnase', "tss_dist", "phyloP100way"]

In [55]:
# Build the list of feature subsets to evaluate (overal_list).
overal_list = []

# 1) Leave-one-out subsets: feat_list with a single feature removed.
for dropped in feat_list:
    remaining = list(feat_list)
    remaining.remove(dropped)
    overal_list.append(remaining)

# 2) Growing subsets: start from one feature and add the others one at a
#    time, keeping each intermediate (sorted) subset that is not listed yet.
for seed in feat_list:
    growing = [seed]
    for candidate in feat_list:
        if candidate in growing:
            continue
        growing.append(candidate)
        growing.sort()
        snapshot = list(growing)
        if snapshot not in overal_list:
            overal_list.append(snapshot)

In [6]:
# Write a key file mapping each AUC column index to its '|'-joined feature
# subset from overal_list.
with open("../Results/feature_details_importance", "w") as feats:
    feats.writelines(
        "AUC_%i\t %s\n" % (index, '|'.join(subset))
        for index, subset in enumerate(overal_list)
    )

In [None]:
def get_feature_best6(tf, pos):
    """
    Build the reduced feature table for one peak file of a TF.

    tf  -- TF name, passed to get_peak_files and get_contigmers.
    pos -- index into the list returned by get_peak_files(tf).

    Columns, in order: max_kmer_score, dnase, phatsCons, phyloP100way,
    hg_dn_score, tss_dist.

    Returns (feature_frame, trim_to).
    """
    peak_files = get_peak_files(tf)

    combined_bed, trim_to = get_combined_bed(peak_files[pos])

    # Only the first entry returned by get_contigmers is used for scoring.
    E_score_dict, kmer_name = get_contigmers_dict(get_contigmers(tf)[0],"test")

    feature_frame = pd.DataFrame()
    feature_frame ["max_kmer_score"] = get_kmer_score(combined_bed, max_score_kmer, E_score_dict)
    # The expanded best-hit table is only needed to derive hits_df here.
    test_score = get_kmer_score(combined_bed, max_score_kmer_pos, E_score_dict)
    double_deal = test_score.apply(pd.Series)
    hits_df = get_hits_df(double_deal, combined_bed)
    feature_frame["dnase"] = apply_get_max_dnase(hits_df)
    feature_frame["phatsCons"] = apply_get_phatscon(hits_df)
    feature_frame["phyloP100way"] = apply_get_phatscon(hits_df, "phyloP100way")
    
    # NOTE(review): hg_dn_dict is not defined in the visible cells (only
    # dn_hg_dict and dn_hg_dict2 are) -- confirm it is provided elsewhere.
    feature_frame["hg_dn_score"] = get_kmer_score(combined_bed, max_score_kmer, hg_dn_dict)
    feature_frame.reset_index(drop=True, inplace=True)
    # TSS distance is computed separately for the first and last trim_to rows
    # and re-joined with a fresh index so it aligns with feature_frame.
    pos_tss = get_distance_to_tss(hits_df.head(trim_to))
    neg_tss = get_distance_to_tss(hits_df.tail(trim_to))
    pos_neg_tss = pos_tss.append(neg_tss)
    pos_neg_tss.reset_index(drop=True, inplace=True) 
    feature_frame["tss_dist"] = pos_neg_tss
    return feature_frame, trim_to

### 5. Feature importance by eliminating one, sequentially

The shape features will have to be eliminated together as a group. This is an attempt to be clear on the contribution of each of the features to the accuracy.

In [7]:
# Leave-one-feature-out evaluation over tf_list: train on the first peak file
# and test on the last one, first with all columns and then with each entry
# of feat_list removed in turn. Rows are appended to the results file without
# a header (the header lines below are commented out).
# NOTE(review): get_feature_best and train_xgboost are not defined in the
# visible cells -- confirm they are provided elsewhere.
with open("../Results/TF_scores_feature_importance_recursive_all.txt", "a") as tf_scores:
    
    #tf_scores.write("Tf_name\tAll\t")
    #for j in feat_list:
        #tf_scores.write("%s\t" % j)
    for tf in tf_list:
        tf_scores.write("\n%s\t" % tf)
        #tf_feats.write("\n%s\t" % tf)
        print tf

        # Index 0 = training peak file, index -1 = test peak file.
        feature_frame, trim_to = get_feature_best(tf, 0)
        feature_frame_p,trim_to_p =  get_feature_best(tf, -1)
        # Labels: trim_to ones followed by trim_to zeros.
        y_train = np.concatenate((np.ones(trim_to), np.zeros(trim_to)), axis=0)
        y_test = np.concatenate((np.ones(trim_to_p), np.zeros(trim_to_p)), axis=0)
        
        all_feats = list(feature_frame.columns)
        
        #All
        my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
        testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)
        y_pred = my_model.predict(testdmat)
        tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))
        
        for feats in feat_list:
            # Start from the full column list each time; pop_this edits the
            # global all_feats in place.
            all_feats = list(feature_frame.columns)
            pop_this(feats)
            my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
            
            testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)

            y_pred = my_model.predict(testdmat)

            tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

### 6. Feature importance by recursive addition

In [8]:
# Per-feature contribution: for every TF, train one model on all columns and
# then one model per entry of feat_list in which every OTHER listed feature
# has been removed. Columns that are not named in feat_list stay in every
# model.
# NOTE(review): repeat_tfs and train_xgboost are not defined in the visible
# cells -- confirm they are provided elsewhere.
# NOTE: feat_list is a global and is reassigned and shortened inside the
# loops, so after this cell it no longer holds the full list.
with open("../Results/TF_scores_feature_importance_recursive_all.txt", "a") as tf_scores:
    # The file is opened in append mode, so this header is written again on
    # every run of the cell.
    tf_scores.write("Tf_name\tAll\t")
    feat_list = ['sum_kmer_score',"phatsCons",
 'Roll', 'ProT', 'MGW', 'HelT',
 'max_kmer_score_pos','dn_hg_score',
 'hg_dn_score',"tss_dist", "phyloP100way"]
    for j in feat_list:
        tf_scores.write("%s\t" % j)
    for tf in repeat_tfs: #in_both_new:
        tf_scores.write("\n%s\t" % tf)
#         #tf_feats.write("\n%s\t" % tf)
        print tf

        # Index 0 = training peak file, index -1 = test peak file.
        feature_frame, trim_to = get_feature_df(tf, 0)
        feature_frame_p,trim_to_p =  get_feature_df(tf, -1)
        # Labels: trim_to ones followed by trim_to zeros.
        y_train = np.concatenate((np.ones(trim_to), np.zeros(trim_to)), axis=0)
        y_test = np.concatenate((np.ones(trim_to_p), np.zeros(trim_to_p)), axis=0)
        
        all_feats = list(feature_frame.columns)
        
#         #All
        my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
        testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)
        y_pred = my_model.predict(testdmat)
        tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))
        
        all_feats = list(feature_frame.columns)
        
        feat_list = ['sum_kmer_score',"phatsCons",
 'Roll', 'ProT', 'MGW', 'HelT',
 'max_kmer_score_pos','dn_hg_score',
 'hg_dn_score',"tss_dist", "phyloP100way"]
        # Iterate over a copy because feat_list is rebuilt and popped from
        # inside the loop body.
        loop_this = feat_list[:]
        for i,j in enumerate(loop_this):
            all_feats = list(feature_frame.columns)
            feat_list = ['sum_kmer_score',"phatsCons",
             'Roll', 'ProT', 'MGW', 'HelT',
             'max_kmer_score_pos','dn_hg_score',
             'hg_dn_score',"tss_dist", "phyloP100way"]
            #print i,j
            # Keep feature number i: drop it from the list of features that
            # are about to be removed from all_feats.
            feat_list.pop(i)

            # The inner loop reuses the name i; the outer enumerate assigns
            # it again on its next iteration.
            for i in feat_list:
                pop_this(i)
            my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
            
            testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)

            y_pred = my_model.predict(testdmat)

            tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

From the above, we can confidently deduce that the most informative features are DNase and distance to TSS; however, for the other features we lose information on the quality of the model, since the *k*-mer scores will not be confidently measured. Therefore, we eliminate the poorly performing features and then introduce the others in turn.

### 7. Contribution of the DNA shape to the baseline model

Here, we will have a complete feature with:
* The max *k*-mer score
* The DNase score
* The Each of the Shape features

So the Idea is to start with a complete model, then one with a  variation of each of the shape features. 

The difficulty with these is that the model does not consider the order of the features, rather, it starts with the first ones and moves along with the rest. So, the feature presented first seems to have a high contribution to the tree decisions. 

Here we want to observe the contribution of each feature, rather than how much of a dip adding the feature causes. We then decide on the input of each of the features.

In [25]:
def get_feature_df_shape(tf, pos):   
    """
    Build the baseline-plus-shape feature table for one peak file of a TF.

    tf  -- TF name, passed to get_peak_files and get_contigmers.
    pos -- index into the list returned by get_peak_files(tf).

    Columns: max_kmer_score, dnase, followed by one group of columns per DNA
    shape feature (named by get_shape_names).

    Returns (feature_frame, trim_to).
    """
    peak_files = get_peak_files(tf)

    combined_bed, trim_to = get_combined_bed(peak_files[pos])

    # Only the first entry returned by get_contigmers is used for scoring.
    E_score_dict, kmer_name = get_contigmers_dict(get_contigmers(tf)[0],"test")

    feature_frame = pd.DataFrame()
    feature_frame ["max_kmer_score"] = get_kmer_score(combined_bed, max_score_kmer, E_score_dict)
    # The expanded best-hit table is only needed to derive hits_df here.
    test_score = get_kmer_score(combined_bed, max_score_kmer_pos, E_score_dict)
    double_deal = test_score.apply(pd.Series)
    hits_df = get_hits_df(double_deal, combined_bed)
    feature_frame["dnase"] = apply_get_max_dnase(hits_df)
    feature_frame.reset_index(drop=True, inplace=True)
    for shape in "ProT MGW HelT Roll".split():
        # NOTE(review): `shape` is only used to name the columns; it is not
        # passed to apply_get_full_shape, so all four groups may hold the
        # same values -- confirm against the helper's signature.
        feature_fr = apply_get_full_shape(hits_df).apply(pd.Series)
        feature_fr.columns = get_shape_names(shape)
        # Transpose/append/transpose adds the shape columns to the right of
        # the existing feature columns.
        feature_frame = feature_frame.T.append(feature_fr.T).T
    return feature_frame, trim_to

In [123]:
# Names of the DNA shape feature groups.
shapes = [ 'Roll', 'ProT', 'MGW', 'HelT']

In [9]:
# Shape contribution: for every TF write the AUC of (1) the model with all
# columns, (2) the model with every shape group removed, and (3) one model
# per shape group in which only that group is kept next to the non-shape
# columns.
# NOTE(review): repeat_tfs and train_xgboost are not defined in the visible
# cells -- confirm they are provided elsewhere.
with open("../Results/TF_scores_feature_importance_recursive_shape3.txt", "a") as tf_scores:
    # The file is opened in append mode, so this header is written again on
    # every run of the cell.
    tf_scores.write("Tf_name\tAll\tNone\t")
    for j in [ 'Roll', 'ProT', 'MGW', 'HelT']:
        tf_scores.write("%s\t" % j)
    for tf in repeat_tfs:#in_both_new[17:]:
        shapes = [ 'Roll', 'ProT', 'MGW', 'HelT']
        tf_scores.write("\n%s\t" % tf)
        #tf_feats.write("\n%s\t" % tf)
        print tf

        # Index 0 = training peak file, index -1 = test peak file.
        feature_frame, trim_to = get_feature_df_shape(tf, 0)
        feature_frame_p,trim_to_p =  get_feature_df_shape(tf, -1)
        # Labels: trim_to ones followed by trim_to zeros.
        y_train = np.concatenate((np.ones(trim_to), np.zeros(trim_to)), axis=0)
        y_test = np.concatenate((np.ones(trim_to_p), np.zeros(trim_to_p)), axis=0)
        
        all_feats = list(feature_frame.columns)
        
        #All
        my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
        testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)
        y_pred = my_model.predict(testdmat)
        tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))
        
        # "None" column: remove every shape group from all_feats.
        for i in shapes:
            pop_this(i)
        my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
        testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)
        y_pred = my_model.predict(testdmat)
        tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))
        
        all_feats = list(feature_frame.columns)
        for i,j in enumerate([ 'Roll', 'ProT', 'MGW', 'HelT']):
            all_feats = list(feature_frame.columns)
            shapes = [ 'Roll', 'ProT', 'MGW', 'HelT']
            #print i,j
            # Keep shape number i: drop it from the list of groups that are
            # about to be removed from all_feats.
            shapes.pop(i)

            # The inner loop reuses the name i; the outer enumerate assigns
            # it again on its next iteration.
            for i in shapes:
                pop_this(i)
    
            my_model = train_xgboost(feature_frame[all_feats], y_train, tf)

            testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)

            y_pred = my_model.predict(testdmat)

            tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

### 8. Test the effect of the preferred and noise *k*-mer scores as an additional feature

The question here is to determine if it does contribute to the predictive ability of the model. Here we can extract the information from the feature contribution based on how much loss it causes. 

Here, we also want to use the baseline model comprising of the DNase and kmer scores; that is after determining the best scoring function we just settle on that for any subsequent computation. The level of correlation of the various features is also informative regarding how much more value they can add to the quality of the model. 

With this, the best option is to test the performance of the baseline with *k*-mer model and with DNase data

In [129]:
# Feature subsets evaluated in the noise / preferred k-mer experiment.
# The names must match the columns created by get_feature_noise:
# max_kmer_score, dnase, dn_hg_score and dn_hg_score2. The list previously
# used "hg_dn_score", a column get_feature_noise never creates, so selecting
# it raised a KeyError.
noise_features = [
    ["dnase", "max_kmer_score", "dn_hg_score", "dn_hg_score2"],  # both extra scores
    ["dnase", "max_kmer_score"],                                 # baseline only
    ["dnase", "max_kmer_score", "dn_hg_score2"],                 # baseline + dn_hg_dict2 score
    ["dnase", "max_kmer_score", "dn_hg_score"],                  # baseline + dn_hg_dict score
]

In [25]:
def get_feature_noise(tf, pos):
    """
    Build the feature table for the noise / preferred k-mer experiment.

    tf  -- TF name, passed to get_peak_files and get_contigmers.
    pos -- index into the list returned by get_peak_files(tf).

    Columns, in order: max_kmer_score, dnase, dn_hg_score, dn_hg_score2.

    Returns (features, trim_to).
    """
    peak_file = get_peak_files(tf)[pos]
    sites, trim_to = get_combined_bed(peak_file)

    first_contigmers = get_contigmers(tf)[0]
    e_scores, _ = get_contigmers_dict(first_contigmers, "test")

    features = pd.DataFrame()
    features["max_kmer_score"] = get_kmer_score(sites, max_score_kmer, e_scores)

    # The expanded best-hit table is only needed to derive the hit regions.
    best_hits = get_kmer_score(sites, max_score_kmer_pos, e_scores).apply(pd.Series)
    hits = get_hits_df(best_hits, sites)
    features["dnase"] = apply_get_max_dnase(hits)

    # Max k-mer score of every sequence under each of the two extra
    # dictionaries.
    for column, score_dict in (("dn_hg_score", dn_hg_dict),
                               ("dn_hg_score2", dn_hg_dict2)):
        features[column] = get_kmer_score(sites, max_score_kmer, score_dict)

    features.reset_index(drop=True, inplace=True)
    return features, trim_to

In [10]:
# Noise / preferred k-mer experiment: for every TF, train on the first peak
# file and test on the last one, once with all columns and once per feature
# subset in noise_features. One AUC per model is written per row.
# NOTE(review): train_xgboost is not defined in the visible cells -- confirm
# it is provided elsewhere.
with open("../Results/TF_scores_feature_importance_recursive_noise.txt", "a") as tf_scores:
    # Header: one column for the all-features model, then one column per
    # subset, labelled with the subset's feature names. The previous header
    # ("All None Noise Preferred" plus every name in feat_list) had neither
    # the same number of columns nor the same order as the values written
    # per row below.
    tf_scores.write("Tf_name\tAll\t")
    for feats in noise_features:
        tf_scores.write("%s\t" % "|".join(feats))
    for tf in in_both_new:
        tf_scores.write("\n%s\t" % tf)
        print(tf)

        # Index 0 = training peak file, index -1 = test peak file.
        feature_frame, trim_to = get_feature_noise(tf, 0)
        #for pos in range(1,len(get_peak_files(tf)))
        feature_frame_p, trim_to_p = get_feature_noise(tf, -1)
        # Labels: trim_to ones followed by trim_to zeros.
        y_train = np.concatenate((np.ones(trim_to), np.zeros(trim_to)), axis=0)
        y_test = np.concatenate((np.ones(trim_to_p), np.zeros(trim_to_p)), axis=0)

        all_feats = list(feature_frame.columns)

        #All
        my_model = train_xgboost(feature_frame[all_feats], y_train, tf)
        testdmat = xgb.DMatrix(feature_frame_p[all_feats], y_test)
        y_pred = my_model.predict(testdmat)
        tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

        # Same order as the header columns written above.
        for feats in noise_features:
            my_model = train_xgboost(feature_frame[feats], y_train, tf)

            testdmat = xgb.DMatrix(feature_frame_p[feats], y_test)

            y_pred = my_model.predict(testdmat)

            tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

### 9. Test how well the models can be generalizable to other cell lines

To do this, I need to identify those TFs that have data in more than three cell lines, then use that information to test how a model trained in a rotating manner can be utilised to predict binding in the other cell lines, and with what accuracy. Although our current implementation, where we train in one cell line and test in another, is okay, we need to check whether there are fluctuations in performance depending on the training cell line.

Given a TF with more than one cell line, we get the name and then test prediction ability of a model from one cell line in predicting performance in another cell line.
* Start with single cell line, and if we do observe some irregularities, then --
* Create a  model from each of the cell lines testing the performance in the other cell lines. 
* Determine how well we can generalise our predictions

In [14]:
# TFs from in_both_new that have more than 3 and fewer than 10 peak files.
over_3 = [
    tf for tf in in_both_new
    if 3 < len(get_peak_files(tf)) < 10
]
    #print get_celltype(peak_files)

In [27]:
def get_celltype(peak_files):
    """
    Derive a label from each peak file path.

    For every path the file name (text after the last "/") is taken, and the
    label is whatever lies between the last "Tfbs" and the first following
    "UniPk". Returns the labels as a list in input order.
    """
    def _label(path):
        file_name = path.split("/")[-1]
        after_tfbs = file_name.split("Tfbs")[-1]
        return after_tfbs.split("UniPk")[0]

    return [_label(path) for path in peak_files]

In [28]:
def get_feature_cell_type(tf, pos):
    """
    Build the feature table used in the cell-type generalisation tests.

    tf  -- TF name, passed to get_peak_files and get_contigmers.
    pos -- index into the list returned by get_peak_files(tf).

    Columns, in order: max_kmer_score, dnase, phatsCons, phyloP100way,
    dn_hg_score, hg_dn_score, tss_dist, followed by one group of columns per
    DNA shape feature (named by get_shape_names).

    Returns (feature_frame, trim_to).
    """
    peak_files = get_peak_files(tf)

    combined_bed, trim_to = get_combined_bed(peak_files[pos])

    # Only the first entry returned by get_contigmers is used for scoring.
    E_score_dict, kmer_name = get_contigmers_dict(get_contigmers(tf)[0],"test")

    feature_frame = pd.DataFrame()
    feature_frame ["max_kmer_score"] = get_kmer_score(combined_bed, max_score_kmer, E_score_dict)
    # The expanded best-hit table is only needed to derive hits_df here.
    test_score = get_kmer_score(combined_bed, max_score_kmer_pos, E_score_dict)
    double_deal = test_score.apply(pd.Series)
    hits_df = get_hits_df(double_deal, combined_bed)
    feature_frame["dnase"] = apply_get_max_dnase(hits_df)
    feature_frame["phatsCons"] = apply_get_phatscon(hits_df)
    feature_frame["phyloP100way"] = apply_get_phatscon(hits_df, "phyloP100way")
    
    feature_frame["dn_hg_score"] = get_kmer_score(combined_bed, max_score_kmer, dn_hg_dict)
    # NOTE(review): hg_dn_dict is not defined in the visible cells (only
    # dn_hg_dict and dn_hg_dict2 are) -- confirm it is provided elsewhere.
    feature_frame["hg_dn_score"] = get_kmer_score(combined_bed, max_score_kmer, hg_dn_dict)
    feature_frame.reset_index(drop=True, inplace=True)
    # TSS distance is computed separately for the first and last trim_to rows
    # and re-joined with a fresh index so it aligns with feature_frame.
    pos_tss = get_distance_to_tss(hits_df.head(trim_to))
    neg_tss = get_distance_to_tss(hits_df.tail(trim_to))
    pos_neg_tss = pos_tss.append(neg_tss)
    pos_neg_tss.reset_index(drop=True, inplace=True) 
    feature_frame["tss_dist"] = pos_neg_tss
    for shape in "ProT MGW HelT Roll".split():
        # NOTE(review): `shape` is only used to name the columns; it is not
        # passed to apply_get_full_shape, so all four groups may hold the
        # same values -- confirm against the helper's signature.
        feature_fr = apply_get_full_shape(hits_df).apply(pd.Series)
        feature_fr.columns = get_shape_names(shape)
        # Transpose/append/transpose adds the shape columns to the right of
        # the existing feature columns.
        feature_frame = feature_frame.T.append(feature_fr.T).T
    return feature_frame, trim_to

In [11]:
# Cell-type specificity: for every TF in over_3, train one model on the last
# peak file (index -1) and write its AUC on each of the other peak files
# (indices 0 .. n-2), one row per TF.
# NOTE(review): train_xgboost is not defined in the visible cells -- confirm
# it is provided elsewhere.
with open("../Results/TF_scores_cell_type_specificity_1.txt", "a") as tf_scores:
    for tf in over_3:
        tf_scores.write("\n%s\t" % tf)
        print tf

        feature_frame, trim_to = get_feature_cell_type(tf, -1)
        # Labels: trim_to ones followed by trim_to zeros.
        y_train = np.concatenate((np.ones(trim_to), np.zeros(trim_to)), axis=0)
        my_model = train_xgboost(feature_frame, y_train, tf)
        for pos in range(0,len(get_peak_files(tf))-1):
            feature_frame_p,trim_to_p =  get_feature_cell_type(tf, pos)

            y_test = np.concatenate((np.ones(trim_to_p), np.zeros(trim_to_p)), axis=0)

            # Not used for the prediction below; it only refreshes the global
            # all_feats.
            all_feats = list(feature_frame.columns)

            testdmat = xgb.DMatrix(feature_frame_p, y_test)

            y_pred = my_model.predict(testdmat)

            tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

#### Test the different training cell types

In [32]:
# Hard-coded TF list; when this cell is run it replaces the computed over_3.
over_3 = ['Gabp', 'Gata3', 'Jund', 'Mafk', 'Max', 'Sp1', 'Srf', 'Tbp', 'Tcf7l2']

In [12]:
# All-against-all cell-type test: for every TF in over_3, train one model per
# peak file and write its AUC on every peak file of that TF, including the
# one it was trained on. Each TF starts a new line with its name, and the
# label of every training file is written before that model's AUC values.
# NOTE(review): train_xgboost is not defined in the visible cells -- confirm
# it is provided elsewhere.
# NOTE(review): get_feature_cell_type is recomputed for every (train, pos)
# pair; caching per peak file would only be safe if the helper is
# deterministic -- confirm before changing.
with open("../Results/TF_scores_cell_type_specificity_recursive.txt", "a") as tf_scores:
    for tf in over_3:
        tf_scores.write("\n%s\n" % tf)
        print tf
        for train in range(len(get_peak_files(tf))):
            # Clear pybedtools temp files before each training round.
            pybedtools.cleanup()
            tf_scores.write("%s\t" % get_celltype(get_peak_files(tf))[train])
            feature_frame, trim_to = get_feature_cell_type(tf, train)
            # Labels: trim_to ones followed by trim_to zeros.
            y_train = np.concatenate((np.ones(trim_to), np.zeros(trim_to)), axis=0)
            my_model = train_xgboost(feature_frame, y_train, tf)
            for pos in range(len(get_peak_files(tf))):
                feature_frame_p,trim_to_p =  get_feature_cell_type(tf, pos)

                y_test = np.concatenate((np.ones(trim_to_p), np.zeros(trim_to_p)), axis=0)

                # Not used for the prediction below; it only refreshes the
                # global all_feats.
                all_feats = list(feature_frame.columns)

                testdmat = xgb.DMatrix(feature_frame_p, y_test)

                y_pred = my_model.predict(testdmat)

                tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

### 10. Test for conservation data contribution

Using the same idea, we need to test the various forms of acquiring the conservation scores and their effect on the performance of the model. Here we test the following:
* Phastcons hit
* Phastcons whole site
* Phylo hit
* phylo whole site

With each of these, we use the baseline model already defined and tested. 

In [132]:
def get_feature_conservation(tf, pos):
    """Assemble the feature table used to compare conservation-score variants.

    Parameters
    ----------
    tf
        Transcription factor identifier, forwarded to ``get_peak_files`` and
        ``get_contigmers``.
    pos
        Index into the list returned by ``get_peak_files(tf)`` selecting the
        peak file to build features from.

    Returns
    -------
    tuple
        ``(feature_frame, trim_to)``. ``feature_frame`` is a DataFrame with a
        fresh 0..n-1 index and the columns ``max_kmer_score``, ``dnase``,
        ``phatsCons``, ``phyloP100way``, ``phatsCons_whole`` and
        ``phyloP100way_whole``. ``trim_to`` is passed through unchanged from
        ``get_combined_bed``; callers use it as the size of each label class.
    """
    selected_peaks = get_peak_files(tf)[pos]
    combined_bed, trim_to = get_combined_bed(selected_peaks)

    # Only the score dictionary is needed here; the second value is ignored.
    E_score_dict, _ = get_contigmers_dict(get_contigmers(tf)[0], "test")

    best_scores = get_kmer_score(combined_bed, max_score_kmer, E_score_dict)
    scores_with_pos = get_kmer_score(combined_bed, max_score_kmer_pos, E_score_dict)
    # Expand each per-sequence result into its own columns before locating hits.
    hits_df = get_hits_df(scores_with_pos.apply(pd.Series), combined_bed)

    # Column names (including the "phatsCons" spelling) are looked up by name
    # elsewhere in the notebook, so they must stay exactly as written.
    # "*_whole" columns are scored over `combined_bed`; the others over `hits_df`.
    columns = [
        ("max_kmer_score", best_scores),
        ("dnase", apply_get_max_dnase(hits_df)),
        ("phatsCons", apply_get_phatscon(hits_df)),
        ("phyloP100way", apply_get_phatscon(hits_df, "phyloP100way")),
        ("phatsCons_whole", apply_get_phatscon(combined_bed)),
        ("phyloP100way_whole", apply_get_phatscon(combined_bed, "phyloP100way")),
    ]

    feature_frame = pd.DataFrame()
    for column_name, values in columns:
        feature_frame[column_name] = values

    feature_frame = feature_frame.reset_index(drop=True)
    return feature_frame, trim_to

In [133]:
# Feature subsets for the conservation ablation, in this order:
#   1. every feature,
#   2. the baseline only (no conservation score),
#   3-6. every feature except one conservation score, dropping each in turn.
_baseline_feats = ["dnase", "max_kmer_score"]
_conservation_feats = ["phatsCons", "phyloP100way", "phatsCons_whole", "phyloP100way_whole"]

conservation_features = [
    _baseline_feats + _conservation_feats,
    list(_baseline_feats),
]
conservation_features += [
    _baseline_feats + [feat for feat in _conservation_feats if feat != dropped]
    for dropped in _conservation_feats
]

In [15]:
# Conservation ablation.
#
# For each TF in `tf_list`, features are built from its first peak file
# (training) and its last peak file (testing). One model is trained per feature
# subset in `conservation_features`, and the resulting ROC AUCs are appended as
# a single tab-separated row per TF.
with open("../Results/TF_scores_feature_importance_recursive_conservation.txt", "a") as tf_scores:
    # Header columns follow the order of `conservation_features`.
    tf_scores.write("Tf_name\tAll\tNone\tPhats_hit\tPhylo_hit\tPhats_wh\tPhylo_wh\t")

    for tf in tf_list:
        tf_scores.write("\n%s\t" % tf)
        print(tf)

        # Index 0 selects the training peak file, -1 the last one for testing
        # (the same file when a TF has only one).
        feature_frame, trim_to = get_feature_conservation(tf, 0)
        feature_frame_p, trim_to_p = get_feature_conservation(tf, -1)

        # Labels: the first half of the rows are 1, the second half are 0.
        y_train = np.concatenate([np.ones(trim_to), np.zeros(trim_to)])
        y_test = np.concatenate([np.ones(trim_to_p), np.zeros(trim_to_p)])

        all_feats = list(feature_frame.columns)

        for feats in conservation_features:
            # Train and evaluate on the same column subset.
            my_model = train_xgboost(feature_frame[feats], y_train, tf)
            testdmat = xgb.DMatrix(feature_frame_p[feats], y_test)
            y_pred = my_model.predict(testdmat)
            tf_scores.write("%s\t" % (roc_auc_score(y_test, y_pred)))

## Parameter optimization

In [None]:
# Grid search, stage 1: vary tree depth and minimum child weight while every
# other XGBoost setting is held fixed.
cv_params = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
}

# Settings shared by every candidate in the grid.
ind_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
}

# 5-fold cross-validation scored on accuracy, using all available cores.
optimized_GBM = GridSearchCV(
    xgb.XGBClassifier(**ind_params),
    cv_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

In [None]:
# Run the grid search. `feature_frame` and `y_train` are not defined in this
# cell; they are whatever an earlier cell left in the notebook namespace.
optimized_GBM.fit(feature_frame, y_train)

In [None]:
# Display the cross-validation mean/std score for each parameter combination.
optimized_GBM.grid_scores_

In [None]:
# Grid search, stage 2: vary learning rate and row subsampling, with
# max_depth and min_child_weight now fixed at 3 and 1.
cv_params = {
    'learning_rate': [0.1, 0.01],
    'subsample': [0.7, 0.8, 0.9],
}

# Settings shared by every candidate in the grid.
ind_params = {
    'n_estimators': 1000,
    'seed': 0,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'max_depth': 3,
    'min_child_weight': 1,
}

# 5-fold cross-validation scored on accuracy, using all available cores.
optimized_GBM = GridSearchCV(
    xgb.XGBClassifier(**ind_params),
    cv_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)
# `feature_frame` and `y_train` come from the notebook namespace.
optimized_GBM.fit(feature_frame, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'subsample': [0.7, 0.8, 0.9], 'learning_rate': [0.1, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

[mean: 0.82907, std: 0.02203, params: {'max_depth': 3, 'min_child_weight': 1},
 mean: 0.82895, std: 0.02155, params: {'max_depth': 3, 'min_child_weight': 3},
 mean: 0.82891, std: 0.02206, params: {'max_depth': 3, 'min_child_weight': 5},
 mean: 0.82663, std: 0.02266, params: {'max_depth': 5, 'min_child_weight': 1},
 mean: 0.82706, std: 0.02249, params: {'max_depth': 5, 'min_child_weight': 3},
 mean: 0.82644, std: 0.02345, params: {'max_depth': 5, 'min_child_weight': 5},
 mean: 0.82056, std: 0.02175, params: {'max_depth': 7, 'min_child_weight': 1},
 mean: 0.81939, std: 0.02062, params: {'max_depth': 7, 'min_child_weight': 3},
 mean: 0.82089, std: 0.02104, params: {'max_depth': 7, 'min_child_weight': 5}]


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'subsample': [0.7, 0.8, 0.9], 'learning_rate': [0.1, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

#learning rate

[mean: 0.82960, std: 0.02174, params: {'subsample': 0.7, 'learning_rate': 0.1},
 mean: 0.82907, std: 0.02203, params: {'subsample': 0.8, 'learning_rate': 0.1},
 mean: 0.82858, std: 0.02144, params: {'subsample': 0.9, 'learning_rate': 0.1},
 mean: 0.82084, std: 0.01890, params: {'subsample': 0.7, 'learning_rate': 0.01},
 mean: 0.82065, std: 0.01878, params: {'subsample': 0.8, 'learning_rate': 0.01},
 mean: 0.82001, std: 0.01869, params: {'subsample': 0.9, 'learning_rate': 0.01}]

In [None]:
# Display the cross-validation mean/std score for each parameter combination
# of the most recently fitted grid search.
optimized_GBM.grid_scores_

In [None]:
# Pairwise correlation between the feature columns, to spot redundant features.
feature_frame.corr()

In [None]:
# Wrap the training features and labels in a DMatrix, XGBoost's own data
# container; `xgb.cv` takes it as `dtrain`.
xgdmat = xgb.DMatrix(feature_frame, y_train)

In [None]:
# Booster parameters carried over from the grid searches
# ("Grid Search CV optimized settings").
our_params = {
    'eta': 0.1,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'max_depth': 3,
    'min_child_weight': 1,
}

# 5-fold CV over up to 3000 boosting rounds on the classification error,
# with early stopping after 100 rounds.
cv_xgb = xgb.cv(
    params=our_params,
    dtrain=xgdmat,
    num_boost_round=3000,
    nfold=5,
    metrics=['error'],  # pass metrics as a list, even for a single metric
    early_stopping_rounds=100
)