### All Data: Consensus Genotype

* The following notebook is trained on data generated from revised R script [Oct 12 2017]
    * Exact Match [1] and Homozygous [0] Reference data points
    * Removed all data points with Gtcons and GTconswithoutXX -1
* 5k randomly selected deletions test data was also processed through same R script
* Balanced Training Set for GTcons labels:
    * 200 Hom Var
    * 200 Hom Ref
    * 200 Het Var
* **Train/Prediction Label:** consensus genotype


In [1]:
"""
Imports
"""
import pandas as pd
import numpy as np
import graphviz
import io
from fancyimpute import KNN
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneOut
from scipy.stats import ks_2samp
from scipy import stats
from matplotlib import pyplot
from sklearn import preprocessing
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA as sklearnPCA
import plotly.plotly as py
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score, precision_score
from sklearn import preprocessing
from ggplot import *
from bokeh.charts import TimeSeries
from bokeh.models import HoverTool
from bokeh.plotting import show
from bokeh.charts import Scatter, Histogram, output_file, show
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.io import output_notebook
from bokeh.charts import Bar, output_file, show
import bokeh.palettes as palettes
from bokeh.models import HoverTool, BoxSelectTool, Legend
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)



In [2]:
# Import Training Data
# SVanalyzer generated training data
df_train = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_min1.csv')
df_train_2 = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_min1.csv')
df_train.rename(columns={'size': 'Size'}, inplace=True)
df_train.head(1)

Unnamed: 0,chrom,id,sample,start,end,type,SVtype,Size,Ill250.GT,Ill250.alt_alnScore_mean,...,tandemrep_pct,Label,GTconflict,GTcons,GTconswithoutIll250.GT,GTconswithoutIll300x.GT,GTconswithoutIllMP.GT,GTconswithoutTenX.GT,GTconswithoutpacbio.GT,GTsupp
0,1,23,HG002,72766323,72811839,Deletion,Deletion,-45516,1.0,977.7,...,0.059979,1,-1,1,1,1,1,1,1,3


In [3]:
train_set = pd.DataFrame()
train_set = df_train_2

In [4]:
train_set['GTcons'].replace(0, 'Homozygous_Reference', inplace=True)
train_set['GTcons'].replace(1, 'Heterozygous_Variant', inplace=True)
train_set['GTcons'].replace(2, 'Homozygous_Variant', inplace=True)

<a id='imbalance'></a>

In [5]:
pd.value_counts(train_set['GTcons'].values, sort=False)

Homozygous_Variant      200
Heterozygous_Variant    623
Homozygous_Reference    971
dtype: int64

** NOTE: Imbalanced classes in original training dataset. The following loads dataset with equal examples of each class **

In [6]:
# Import Training Data
# SVanalyzer generated training data
df_train = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_balanced.csv')
df_train_2 = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_balanced.csv')
df_train.rename(columns={'size': 'Size'}, inplace=True)
df_train.head(1)

Unnamed: 0,chrom,id,sample,start,end,type,SVtype,Size,Ill250.GT,Ill250.alt_alnScore_mean,...,tandemrep_pct,Label,GTconflict,GTcons,GTconswithoutIll250.GT,GTconswithoutIll300x.GT,GTconswithoutIllMP.GT,GTconswithoutTenX.GT,GTconswithoutpacbio.GT,GTsupp
0,1,21,HG002,65326531,65326651,Deletion,Deletion,-120,0.0,954.0,...,1.0,1,-1,0,0,0,0,0,0,4


In [7]:
train_set = pd.DataFrame()
train_set = df_train_2

In [8]:
train_set['GTcons'].replace(0, 'Homozygous_Reference', inplace=True)
train_set['GTcons'].replace(1, 'Heterozygous_Variant', inplace=True)
train_set['GTcons'].replace(2, 'Homozygous_Variant', inplace=True)

<a id='imbalance'></a>

In [9]:
pd.value_counts(train_set['GTcons'].values, sort=False)

Homozygous_Variant      200
Heterozygous_Variant    200
Homozygous_Reference    200
dtype: int64

In [10]:
# Train the model only on the rows that have an Exact Match or Homozygous Reference Label
# This step removes any row that has in 'Inaccurate Call' label
df_train = df_train[(df_train['Label'] == 1) | (df_train['Label'] == 0)]
df_train_2 = df_train_2[(df_train_2['Label'] == 1) | (df_train_2['Label'] == 0)]

In [11]:
# There are only Exact Match [1] and Homozygous Reference Labels [0]
pd.value_counts(df_train['Label'].values, sort=False)

0    164
1    436
dtype: int64

<a id='hom_ref'></a>

In [12]:
# Import Test Data
# SVanalyzer generated training data
df_test = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/test_data_min1.csv')
df_test_2 = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/test_data_min1.csv')
df_test.rename(columns={'size': 'Size'}, inplace=True)
df_test.head(1)

Unnamed: 0,chrom,id,Size,sample,start,end,type,SVtype,Ill250.GT,Ill250.alt_alnScore_mean,...,tandemrep_cnt,tandemrep_pct,GTconflict,GTcons,GTconswithoutIll250.GT,GTconswithoutIll300x.GT,GTconswithoutIllMP.GT,GTconswithoutTenX.GT,GTconswithoutpacbio.GT,GTsupp
0,1,859,-115,HG002,37568322,37568587,Insertion,Deletion,0.0,0.0,...,3,0.818868,-1,0,0,0,0,0,0,4


In [13]:
# Store header names in lists and find names that are NOT contained in BOTH lists
c = list(df_train.columns.values)
d = list(df_test.columns.values)
set(c) - set(d)

{'Label'}

In [14]:
### Drop columns that are not shared by both dataframes
df_train.drop(['Label'], axis=1, inplace = True)
df_train.drop(['GTconswithoutIll300x.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutIll250.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutIllMP.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutTenX.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutpacbio.GT'], axis=1, inplace = True)
df_train.drop(['Ill300x.GT'], axis=1, inplace = True)
df_train.drop(['Ill250.GT'], axis=1, inplace = True)
df_train.drop(['IllMP.GT'], axis=1, inplace = True)
df_train.drop(['TenX.GT'], axis=1, inplace = True)
df_train.drop(['pacbio.GT'], axis=1, inplace = True)
df_train.drop(['GTconflict'], axis=1, inplace = True)
df_train.drop(['GTsupp'], axis=1, inplace = True)
df_train.drop(['sample'], axis=1, inplace = True)
df_train.drop(['SVtype'], axis=1, inplace = True)
df_train.drop(['type'], axis=1, inplace = True)
df_train.drop(['id'], axis=1, inplace = True)

In [15]:
df_train.head(1)

Unnamed: 0,chrom,start,end,Size,Ill250.alt_alnScore_mean,Ill250.alt_alnScore_std,Ill250.alt_count,Ill250.alt_insertSize_mean,Ill250.alt_insertSize_std,Ill250.alt_reason_alignmentScore,...,pacbio.ref_insertSize_mean,pacbio.ref_insertSize_std,pacbio.ref_reason_alignmentScore,refN_cnt,refN_pct,segdup_cnt,segdup_pct,tandemrep_cnt,tandemrep_pct,GTcons
0,1,65326531,65326651,-120,954.0,0.0,1.0,445.0,0.0,1.0,...,11277.83333,4197.626206,54.0,0,0,0,0.0,1,1.0,0


In [16]:
df_train['chrom'].replace('X', 23, inplace=True)
df_train['chrom'].replace('Y', 24, inplace=True)
df_test['chrom'].replace('X', 23, inplace=True)
df_test['chrom'].replace('Y', 24, inplace=True)

In [17]:
# Store header names in lists and find names that are NOT contained in BOTH lists
c = list(df_train.columns.values)
d = list(df_test.columns.values)
set(d) - set(c)

{'GTconflict',
 'GTconswithoutIll250.GT',
 'GTconswithoutIll300x.GT',
 'GTconswithoutIllMP.GT',
 'GTconswithoutTenX.GT',
 'GTconswithoutpacbio.GT',
 'GTsupp',
 'Ill250.GT',
 'Ill250.amb_reason_insertSizeScore_insertSizeScore',
 'Ill250.amb_reason_insertSizeScore_orientation',
 'Ill300x.GT',
 'Ill300x.amb_reason_alignmentScore_insertSizeScore',
 'Ill300x.amb_reason_insertSizeScore_orientation',
 'Ill300x.amb_reason_orientation_insertSizeScore',
 'IllMP.GT',
 'IllMP.amb_reason_orientation_insertSizeScore',
 'SVtype',
 'TenX.GT',
 'TenX.HP1_amb_reason_insertSizeScore_insertSizeScore',
 'TenX.HP1_amb_reason_insertSizeScore_orientation',
 'TenX.HP1_amb_reason_orientation_insertSizeScore',
 'TenX.HP1_ref_reason_insertSizeScore',
 'TenX.HP2_amb_reason_insertSizeScore_insertSizeScore',
 'TenX.HP2_amb_reason_insertSizeScore_orientation',
 'TenX.HP2_amb_reason_orientation_insertSizeScore',
 'TenX.HP2_ref_reason_insertSizeScore',
 'id',
 'pacbio.GT',
 'sample',
 'type'}

In [18]:
### Drop columns that are not shared by both dataframes
df_test.drop(['Ill300x.amb_reason_alignmentScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['Ill300x.amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['Ill300x.amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['Ill250.amb_reason_insertSizeScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['Ill250.amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['IllMP.amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_amb_reason_insertSizeScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_ref_reason_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_amb_reason_insertSizeScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_ref_reason_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['GTconswithoutIll300x.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutIll250.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutIllMP.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutTenX.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutpacbio.GT'], axis=1, inplace = True)
df_test.drop(['Ill300x.GT'], axis=1, inplace = True)
df_test.drop(['Ill250.GT'], axis=1, inplace = True)
df_test.drop(['IllMP.GT'], axis=1, inplace = True)
df_test.drop(['TenX.GT'], axis=1, inplace = True)
df_test.drop(['pacbio.GT'], axis=1, inplace = True)
df_test.drop(['GTcons'], axis=1, inplace = True)
df_test.drop(['GTconflict'], axis=1, inplace = True)
df_test.drop(['GTsupp'], axis=1, inplace = True)
df_test.drop(['sample'], axis=1, inplace = True)
df_test.drop(['SVtype'], axis=1, inplace = True)
df_test.drop(['type'], axis=1, inplace = True)
df_test.drop(['id'], axis=1, inplace = True)

***
Impute missing values using KNN
***

In [19]:
# Store training data in a new variable which will be converted to a matrix
X = df_train
X.head(3)

Unnamed: 0,chrom,start,end,Size,Ill250.alt_alnScore_mean,Ill250.alt_alnScore_std,Ill250.alt_count,Ill250.alt_insertSize_mean,Ill250.alt_insertSize_std,Ill250.alt_reason_alignmentScore,...,pacbio.ref_insertSize_mean,pacbio.ref_insertSize_std,pacbio.ref_reason_alignmentScore,refN_cnt,refN_pct,segdup_cnt,segdup_pct,tandemrep_cnt,tandemrep_pct,GTcons
0,1,65326531,65326651,-120,954.0,0.0,1.0,445.0,0.0,1.0,...,11277.83333,4197.626206,54.0,0,0,0,0.0,1,1.0,0
1,1,83753489,83753698,-209,908.0,0.0,1.0,547.0,0.0,0.0,...,11648.90244,2866.042175,41.0,0,0,1,1.0,1,1.0,0
2,1,152326749,152326980,-231,0.0,0.0,0.0,0.0,0.0,0.0,...,10544.55556,4602.637067,36.0,0,0,1,1.0,0,0.0,0


In [20]:
# Convert dataframe to matrix
X=X.as_matrix()

#Imput missing values from three closest observations
X_imputed=KNN(k=3).complete(X)
X=pd.DataFrame(X_imputed)

Imputing row 1/600 with 1 missing, elapsed time: 0.277
Imputing row 101/600 with 0 missing, elapsed time: 0.350
Imputing row 201/600 with 22 missing, elapsed time: 0.352
Imputing row 301/600 with 1 missing, elapsed time: 0.356
Imputing row 401/600 with 1 missing, elapsed time: 0.359
Imputing row 501/600 with 1 missing, elapsed time: 0.365


In [21]:
# Store header values in a list, will be used later to re-label the matrix post KNN imputation
dftrain_header = list(df_train.columns.values)
X.columns = dftrain_header
X.head(3)

Unnamed: 0,chrom,start,end,Size,Ill250.alt_alnScore_mean,Ill250.alt_alnScore_std,Ill250.alt_count,Ill250.alt_insertSize_mean,Ill250.alt_insertSize_std,Ill250.alt_reason_alignmentScore,...,pacbio.ref_insertSize_mean,pacbio.ref_insertSize_std,pacbio.ref_reason_alignmentScore,refN_cnt,refN_pct,segdup_cnt,segdup_pct,tandemrep_cnt,tandemrep_pct,GTcons
0,1.0,65326531.0,65326651.0,-120.0,954.0,0.0,1.0,445.0,0.0,1.0,...,11277.83333,4197.626206,54.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,83753489.0,83753698.0,-209.0,908.0,0.0,1.0,547.0,0.0,0.0,...,11648.90244,2866.042175,41.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,1.0,152326749.0,152326980.0,-231.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10544.55556,4602.637067,36.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [23]:
from scipy import stats

In [26]:
pre_imp = df_train['Ill300x.ref_reason_alignmentScore']
post_imp = X['Ill300x.ref_reason_alignmentScore']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0, pvalue=1.0)

In [37]:
pre_imp = df_train['pacbio.alt_alnScore_std']
post_imp = X['pacbio.alt_alnScore_std']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0016666666666667052, pvalue=0.999999999999999)

In [38]:
pre_imp = df_train['Ill250.alt_count']
post_imp = X['Ill250.alt_count']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0, pvalue=1.0)

In [39]:
pre_imp = df_train['Ill300x.ref_count']
post_imp = X['Ill300x.ref_count']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0, pvalue=1.0)

In [40]:
pre_imp = df_train['pacbio.ref_count']
post_imp = X['pacbio.ref_count']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0016666666666667052, pvalue=0.999999999999999)

In [47]:
pre_imp = df_train['pacbio.alt_insertSize_mean']
post_imp = X['pacbio.alt_insertSize_mean']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0016666666666667052, pvalue=0.999999999999999)

In [48]:
pre_imp = df_train['pacbio.alt_insertSize_std']
post_imp = X['pacbio.alt_insertSize_std']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0016666666666667052, pvalue=0.999999999999999)

In [49]:
pre_imp = df_train['pacbio.ref_alnScore_std']
post_imp = X['pacbio.ref_alnScore_std']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0016666666666667052, pvalue=0.999999999999999)

In [50]:
pre_imp = df_train['Ill250.ref_reason_alignmentScore']
post_imp = X['Ill250.ref_reason_alignmentScore']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0, pvalue=1.0)

In [51]:
pre_imp = df_train['Ill250.alt_reason_alignmentScore']
post_imp = X['Ill250.alt_reason_alignmentScore']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0, pvalue=1.0)

In [52]:
df_train['Ill300x.ref_reason_alignmentScore'].isnull().sum()

0

In [53]:
df_train['pacbio.alt_alnScore_std'].isnull().sum()

1

In [54]:
df_train['Ill250.alt_count'].isnull().sum()

0

In [55]:
df_train['Ill300x.ref_count'].isnull().sum()

0

In [56]:
df_train['pacbio.ref_count'].isnull().sum()

1

In [57]:
df_train['pacbio.alt_insertSize_mean'].isnull().sum()

1

In [58]:
df_train['pacbio.alt_insertSize_std'].isnull().sum()

1

In [59]:
df_train['pacbio.ref_alnScore_std'].isnull().sum()

1

In [60]:
df_train['Ill250.ref_reason_alignmentScore'].isnull().sum()

0

In [61]:
df_train['Ill250.alt_reason_alignmentScore'].isnull().sum()

0

In [35]:
df_train['TenX.HP2_ref_count'].isnull().sum()

19

In [36]:
pre_imp = df_train['TenX.HP2_ref_count']
post_imp = X['TenX.HP2_ref_count']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.031666666666666732, pvalue=0.92030040394917134)

In [41]:
df_train['TenX.HP1_alt_reason_alignmentScore'].isnull().sum()

19

In [42]:
pre_imp = df_train['TenX.HP1_alt_reason_alignmentScore']
post_imp = X['TenX.HP1_alt_reason_alignmentScore']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.031666666666666732, pvalue=0.92030040394917134)

In [43]:
df_train['IllMP.alt_reason_alignmentScore'].isnull().sum()

1

In [44]:
pre_imp = df_train['IllMP.alt_reason_alignmentScore']
post_imp = X['IllMP.alt_reason_alignmentScore']
stats.ks_2samp(pre_imp, post_imp)

Ks_2sampResult(statistic=0.0016666666666667052, pvalue=0.999999999999999)