# XGBoost
## Target Variable: penetrance_lqt2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, accuracy_score
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from scipy import stats
from datetime import datetime
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.datasets import make_classification
import xgboost as xgb



In [2]:
# Read the source table from a CSV path relative to the notebook's working directory.
data = pd.read_csv(
    'csv_data/all_lqt2_data_trunc(herg_combined_data).csv',
)

In [3]:
# Show every column label of the loaded table as a plain Python list.
data.columns.tolist()

['Unnamed: 0',
 'var',
 'isoform',
 'mut_type',
 'resnum',
 'lqt2',
 'unaff',
 'total_carriers',
 'gnomAD',
 'gnomAD_seq',
 'weight',
 'penetrance_lqt2',
 'lqt2_penetranceBayesian_initial',
 'lqt2_penetranceBayesian',
 'pph2_prob',
 'blast_pssm',
 'provean_score',
 'pamscore',
 'aasimilaritymat',
 'revel_score',
 'hm_ssPeak',
 'hm_tailPeak',
 'hm_vhalfact',
 'hm_vhalfinact',
 'hm_recovfrominact',
 'hm_taudeact_fast',
 'ht_ssPeak',
 'ht_tailPeak',
 'ht_vhalfact',
 'ht_vhalfinact',
 'ht_recovfrominact',
 'ht_taudeact_fast',
 'lqt2_dist',
 'lqt2_dist_weight',
 'Structure',
 'Function',
 'p_mean_w',
 'prior_mean_w',
 'prior_mean',
 'alpha',
 'beta',
 'lqt2_patho',
 'p_mean_prior',
 'RMSF',
 'lqt2_dist_sequence',
 'lqt2_dist_weight_sequence',
 'lqt2_dist_observed',
 'lqt2_dist_weight_observed',
 'lqt2_dist_max',
 'lqt2_dist_weight_max',
 'lqt2_dist_mean',
 'lqt2_dist_weight_mean',
 'lqt2_dist_obs_max',
 'lqt2_dist_weight_obs_max',
 'lqt2_dist_obs_mean',
 'lqt2_dist_weight_obs_mean',
 'lqt2_

In [4]:
# Columns carried forward into the modelling frame. The order here is the
# column order of `df`, so it is kept exactly as originally written.
filter_columns = [
    # target column, then RMSF
    'penetrance_lqt2', 'RMSF',
    # lqt2_dist_* columns, each followed by its lqt2_dist_weight_* counterpart
    'lqt2_dist_sequence', 'lqt2_dist_weight_sequence',
    'lqt2_dist_observed', 'lqt2_dist_weight_observed',
    'lqt2_dist_max', 'lqt2_dist_weight_max',
    'lqt2_dist_mean', 'lqt2_dist_weight_mean',
    'lqt2_dist_obs_max', 'lqt2_dist_weight_obs_max',
    'lqt2_dist_obs_mean', 'lqt2_dist_weight_obs_mean',
    'lqt2_dist_obs_seq', 'lqt2_dist_weight_obs_seq',
    'revel_score', 'ht_tailPeak', 'hm_tailPeak',
    'lqt2_dist', 'lqt2_dist_weight',
    # carrier counts, gnomAD columns and weight
    'lqt2', 'unaff', 'total_carriers',
    'gnomAD', 'gnomAD_seq', 'weight',
    # per-variant score columns
    'pph2_prob', 'blast_pssm', 'provean_score',
    'pamscore', 'aasimilaritymat',
    # hm_* and ht_* measurement columns
    'hm_ssPeak', 'hm_vhalfact', 'hm_vhalfinact',
    'hm_recovfrominact', 'hm_taudeact_fast',
    'ht_ssPeak', 'ht_vhalfact', 'ht_vhalfinact',
    'ht_recovfrominact', 'ht_taudeact_fast',
    # prior / posterior summary columns
    'prior_mean_w', 'prior_mean', 'alpha', 'beta',
    'lqt2_patho', 'p_mean_prior',
]

# NOTE: DataFrame.filter(items=...) keeps only the listed labels that exist in
# `data`; names that are absent are skipped without raising an error.
df = data.filter(items=filter_columns, axis=1)

In [5]:
# Predictor columns for this run; the order is kept because it fixes the
# column order of X.
features = [
    'RMSF',
    'lqt2_dist_max',
    'revel_score',
]

# Design matrix and a single-column target frame, both selected from `df`.
X = df[features]
y = df[['penetrance_lqt2']]

In [6]:
# split the data into training and testing sets
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an XGBoost regressor with squared-error objective and predict the held-out rows.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out regression metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

# Importances reported by the fitted model, labelled with the names in
# `features` and sorted from largest to smallest.
feature_importances = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=features,
).sort_values(by='importance', ascending=False)

# Report: importance table first, then one metric per line.
print(feature_importances)
print(
    "MSE: {:.2f}".format(mse),
    "RMSE: {:.2f}".format(rmse),
    "MAE: {:.2f}".format(mae),
    "R-squared: {:.2f}".format(r2),
    "Explained variance score: {:.2f}".format(evs),
    sep="\n",
)

               importance
lqt2_dist_max    0.525270
revel_score      0.316396
RMSF             0.158333
MSE: 0.13
RMSE: 0.36
MAE: 0.24
R-squared: 0.34
Explained variance score: 0.34


---

### Feature set 2: RMSF, REVEL score, and four `lqt2_dist_*` columns (6 features)


In [7]:
# Predictor columns for this run; the order is kept because it fixes the
# column order of X.
features = [
    'RMSF',
    'lqt2_dist_max',
    'revel_score',
    'lqt2_dist_sequence',
    'lqt2_dist_mean',
    'lqt2_dist_observed',
]

# Design matrix and a single-column target frame, both selected from `df`.
X = df[features]
y = df[['penetrance_lqt2']]

In [8]:
# split the data into training and testing sets
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an XGBoost regressor with squared-error objective and predict the held-out rows.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out regression metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

# Importances reported by the fitted model, labelled with the names in
# `features` and sorted from largest to smallest.
feature_importances = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=features,
).sort_values(by='importance', ascending=False)

# Report: importance table first, then one metric per line.
print(feature_importances)
print(
    "MSE: {:.2f}".format(mse),
    "RMSE: {:.2f}".format(rmse),
    "MAE: {:.2f}".format(mae),
    "R-squared: {:.2f}".format(r2),
    "Explained variance score: {:.2f}".format(evs),
    sep="\n",
)

                    importance
lqt2_dist_mean        0.582142
lqt2_dist_sequence    0.116597
revel_score           0.113651
lqt2_dist_observed    0.087114
lqt2_dist_max         0.051240
RMSF                  0.049256
MSE: 0.12
RMSE: 0.35
MAE: 0.23
R-squared: 0.39
Explained variance score: 0.39


---

### Feature set 3: REVEL score, RMSF, `ht_tailPeak`, `hm_tailPeak`, and three `lqt2_dist*` columns (7 features)


In [9]:
# Predictor columns for this run; the order is kept because it fixes the
# column order of X.
features = [
    'revel_score',
    'ht_tailPeak',
    'hm_tailPeak',
    'lqt2_dist',
    'RMSF',
    'lqt2_dist_max',
    'lqt2_dist_weight',
]

# Design matrix and a single-column target frame, both selected from `df`.
X = df[features]
y = df[['penetrance_lqt2']]

In [10]:
# split the data into training and testing sets
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an XGBoost regressor with squared-error objective and predict the held-out rows.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out regression metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

# Importances reported by the fitted model, labelled with the names in
# `features` and sorted from largest to smallest.
feature_importances = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=features,
).sort_values(by='importance', ascending=False)

# Report: importance table first, then one metric per line.
print(feature_importances)
print(
    "MSE: {:.2f}".format(mse),
    "RMSE: {:.2f}".format(rmse),
    "MAE: {:.2f}".format(mae),
    "R-squared: {:.2f}".format(r2),
    "Explained variance score: {:.2f}".format(evs),
    sep="\n",
)

                  importance
lqt2_dist_max       0.550902
lqt2_dist_weight    0.119437
revel_score         0.091415
lqt2_dist           0.081995
RMSF                0.071858
hm_tailPeak         0.053290
ht_tailPeak         0.031103
MSE: 0.11
RMSE: 0.33
MAE: 0.21
R-squared: 0.44
Explained variance score: 0.44


---

### Feature set 4: RMSF, REVEL score, and eight `lqt2_dist*` columns (10 features)


In [11]:
# Predictor columns for this run; the order is kept because it fixes the
# column order of X.
features = [
    'RMSF',
    'lqt2_dist_max',
    'lqt2_dist_sequence',
    'lqt2_dist_observed',
    'lqt2_dist_mean',
    'revel_score',
    'lqt2_dist_obs_max',
    'lqt2_dist_obs_mean',
    'lqt2_dist_obs_seq',
    'lqt2_dist',
]

# Design matrix and a single-column target frame, both selected from `df`.
X = df[features]
y = df[['penetrance_lqt2']]

In [12]:
# split the data into training and testing sets
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an XGBoost regressor with squared-error objective and predict the held-out rows.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out regression metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

# Importances reported by the fitted model, labelled with the names in
# `features` and sorted from largest to smallest.
feature_importances = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=features,
).sort_values(by='importance', ascending=False)

# Report: importance table first, then one metric per line.
print(feature_importances)
print(
    "MSE: {:.2f}".format(mse),
    "RMSE: {:.2f}".format(rmse),
    "MAE: {:.2f}".format(mae),
    "R-squared: {:.2f}".format(r2),
    "Explained variance score: {:.2f}".format(evs),
    sep="\n",
)

                    importance
lqt2_dist_obs_seq     0.349186
lqt2_dist_obs_mean    0.271025
lqt2_dist_obs_max     0.184654
revel_score           0.038994
lqt2_dist_mean        0.038000
lqt2_dist             0.035457
lqt2_dist_observed    0.026419
lqt2_dist_sequence    0.021066
RMSF                  0.018398
lqt2_dist_max         0.016801
MSE: 0.12
RMSE: 0.34
MAE: 0.23
R-squared: 0.41
Explained variance score: 0.41
