### All Data: Consensus Genotype

* The model in this notebook is trained on data generated by the revised R script [Oct 12 2017]
    * Exact Match [1] and Homozygous Reference [0] data points
    * Removed all data points with GTcons and GTconswithoutXX equal to -1
* The test data (5k randomly selected deletions) was also processed through the same R script
* Balanced Training Set for GTcons labels:
    * 200 Hom Var
    * 200 Hom Ref
    * 200 Het Var
* **Train/Prediction Label:** consensus genotype


In [1]:
"""
Imports
"""
import pandas as pd
import numpy as np
import graphviz
import io
from fancyimpute import KNN
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneOut
from scipy.stats import ks_2samp
from scipy import stats
from matplotlib import pyplot
from sklearn import preprocessing
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA as sklearnPCA
import plotly.plotly as py
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score, precision_score
from sklearn import preprocessing
from ggplot import *
from bokeh.charts import TimeSeries
from bokeh.models import HoverTool
from bokeh.plotting import show
from bokeh.charts import Scatter, Histogram, output_file, show
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.io import output_notebook
from bokeh.charts import Bar, output_file, show
import bokeh.palettes as palettes
from bokeh.models import HoverTool, BoxSelectTool, Legend
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)



In [2]:
# Load the SVanalyzer-generated training table.
# The same CSV is read twice so that two independent frames exist:
#   df_train   - has its 'size' column renamed to 'Size'
#   df_train_2 - keeps the headers exactly as they are in the file
df_train = pd.read_csv(
    '/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_min1.csv'
).rename(columns={'size': 'Size'})
df_train_2 = pd.read_csv(
    '/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_min1.csv'
)
df_train.head(1)

Unnamed: 0,chrom,id,sample,start,end,type,SVtype,Size,Ill250.GT,Ill250.alt_alnScore_mean,...,tandemrep_pct,Label,GTconflict,GTcons,GTconswithoutIll250.GT,GTconswithoutIll300x.GT,GTconswithoutIllMP.GT,GTconswithoutTenX.GT,GTconswithoutpacbio.GT,GTsupp
0,1,23,HG002,72766323,72811839,Deletion,Deletion,-45516,1.0,977.7,...,0.059979,1,-1,1,1,1,1,1,1,3


In [3]:
# train_set is a second name for df_train_2 (no copy is made), so edits made
# through train_set are visible through df_train_2 as well.
train_set = df_train_2

In [4]:
# Translate the numeric consensus-genotype codes into readable class names.
# The mapped column is assigned back to the frame rather than calling
# `.replace(..., inplace=True)` on the `train_set['GTcons']` selection: an
# in-place edit of a selected column is not guaranteed to propagate to the
# parent frame (and never does under pandas Copy-on-Write).
train_set['GTcons'] = train_set['GTcons'].replace({
    0: 'Homozygous_Reference',
    1: 'Heterozygous_Variant',
    2: 'Homozygous_Variant',
})

<a id='imbalance'></a>

In [5]:
# Number of training rows per consensus-genotype class (unsorted)
pd.Series(train_set['GTcons'].values).value_counts(sort=False)

Heterozygous_Variant    623
Homozygous_Reference    971
Homozygous_Variant      200
dtype: int64

**NOTE:** The original training dataset has imbalanced classes. The following cell loads a dataset with an equal number of examples of each class.

In [6]:
# NOTE(review): every line of this cell is commented out, so the balanced CSV
# is never loaded; the cells below keep working on the frames that were read
# from train_data_min1.csv.
# # Import Training Data
# # SVanalyzer generated training data
# df_train = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_balanced.csv')
# df_train_2 = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/train_data_balanced.csv')
# df_train.rename(columns={'size': 'Size'}, inplace=True)
# df_train.head(1)

In [7]:
# Point train_set at df_train_2 again (an alias, not a copy).
train_set = df_train_2

In [8]:
# Map any remaining numeric genotype codes to their class names. Values that
# are already strings are left untouched, so re-running this is harmless.
# Assigning the result back avoids the chained
# `train_set['GTcons'].replace(..., inplace=True)` pattern, whose in-place
# edit is not guaranteed to reach the parent frame.
train_set['GTcons'] = train_set['GTcons'].replace({
    0: 'Homozygous_Reference',
    1: 'Heterozygous_Variant',
    2: 'Homozygous_Variant',
})

<a id='imbalance'></a>

In [9]:
# Class sizes of the consensus genotype in train_set (unsorted)
pd.Series(train_set['GTcons'].values).value_counts(sort=False)

Heterozygous_Variant    623
Homozygous_Reference    971
Homozygous_Variant      200
dtype: int64

In [10]:
# Train the model only on the rows that have an Exact Match or Homozygous Reference Label
# This step removes any row that has in 'Inaccurate Call' label
df_train = df_train[(df_train['Label'] == 1) | (df_train['Label'] == 0)]
df_train_2 = df_train_2[(df_train_2['Label'] == 1) | (df_train_2['Label'] == 0)]

In [11]:
# There are only Exact Match [1] and Homozygous Reference Labels [0]
pd.value_counts(df_train['Label'].values, sort=False)

0    969
1    825
dtype: int64

<a id='hom_ref'></a>

In [12]:
# Import Test Data
# SVanalyzer generated training data
df_test = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/test_data_min1.csv')
df_test_2 = pd.read_csv('/Volumes/lesleydata/SVanalyzer_ML/Oct272017_ML_w_AllTech/data/train_test_data/test_data_min1.csv')
df_test.rename(columns={'size': 'Size'}, inplace=True)
df_test.head(1)

Unnamed: 0,chrom,id,Size,sample,start,end,type,SVtype,Ill250.GT,Ill250.alt_alnScore_mean,...,tandemrep_cnt,tandemrep_pct,GTconflict,GTcons,GTconswithoutIll250.GT,GTconswithoutIll300x.GT,GTconswithoutIllMP.GT,GTconswithoutTenX.GT,GTconswithoutpacbio.GT,GTsupp
0,1,859,-115,HG002,37568322,37568587,Insertion,Deletion,0.0,0.0,...,3,0.818868,-1,0,0,0,0,0,0,4


In [13]:
# Store header names in lists and find names that are NOT contained in BOTH lists
c = list(df_train.columns.values)
d = list(df_test.columns.values)
set(c) - set(d)

{'Label'}

In [14]:
### Drop columns that are not shared by both dataframes
df_train.drop(['Label'], axis=1, inplace = True)
df_train.drop(['GTconswithoutIll300x.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutIll250.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutIllMP.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutTenX.GT'], axis=1, inplace = True)
df_train.drop(['GTconswithoutpacbio.GT'], axis=1, inplace = True)
df_train.drop(['Ill300x.GT'], axis=1, inplace = True)
df_train.drop(['Ill250.GT'], axis=1, inplace = True)
df_train.drop(['IllMP.GT'], axis=1, inplace = True)
df_train.drop(['TenX.GT'], axis=1, inplace = True)
df_train.drop(['pacbio.GT'], axis=1, inplace = True)
df_train.drop(['GTconflict'], axis=1, inplace = True)
df_train.drop(['GTsupp'], axis=1, inplace = True)
df_train.drop(['sample'], axis=1, inplace = True)
df_train.drop(['SVtype'], axis=1, inplace = True)
df_train.drop(['type'], axis=1, inplace = True)
df_train.drop(['id'], axis=1, inplace = True)

In [15]:
# Preview the first training row after the column drops
df_train.head(n=1)

Unnamed: 0,chrom,start,end,Size,Ill250.alt_alnScore_mean,Ill250.alt_alnScore_std,Ill250.alt_count,Ill250.alt_insertSize_mean,Ill250.alt_insertSize_std,Ill250.alt_reason_alignmentScore,...,pacbio.ref_insertSize_mean,pacbio.ref_insertSize_std,pacbio.ref_reason_alignmentScore,refN_cnt,refN_pct,segdup_cnt,segdup_pct,tandemrep_cnt,tandemrep_pct,GTcons
0,1,72766323,72811839,-45516,977.7,17.343875,20.0,451.85,87.001882,17.0,...,,,,0,0,4,0.076523,110,0.059979,1


In [16]:
# Encode the sex chromosomes numerically (X -> 23, Y -> 24) in both frames.
# The mapped column is assigned back instead of using the chained
# `df['chrom'].replace(..., inplace=True)` form, whose in-place edit of a
# selected column is not guaranteed to reach the parent frame (and never
# does under pandas Copy-on-Write).
chrom_codes = {'X': 23, 'Y': 24}
df_train['chrom'] = df_train['chrom'].replace(chrom_codes)
df_test['chrom'] = df_test['chrom'].replace(chrom_codes)

In [17]:
# Store header names in lists and find names that are NOT contained in BOTH lists
c = list(df_train.columns.values)
d = list(df_test.columns.values)
set(d) - set(c)

{'GTconflict',
 'GTconswithoutIll250.GT',
 'GTconswithoutIll300x.GT',
 'GTconswithoutIllMP.GT',
 'GTconswithoutTenX.GT',
 'GTconswithoutpacbio.GT',
 'GTsupp',
 'Ill250.GT',
 'Ill250.amb_reason_insertSizeScore_insertSizeScore',
 'Ill250.amb_reason_insertSizeScore_orientation',
 'Ill300x.GT',
 'Ill300x.amb_reason_alignmentScore_insertSizeScore',
 'Ill300x.amb_reason_insertSizeScore_orientation',
 'Ill300x.amb_reason_orientation_insertSizeScore',
 'IllMP.GT',
 'IllMP.amb_reason_orientation_insertSizeScore',
 'SVtype',
 'TenX.GT',
 'TenX.HP1_amb_reason_insertSizeScore_insertSizeScore',
 'TenX.HP1_amb_reason_insertSizeScore_orientation',
 'TenX.HP1_amb_reason_orientation_insertSizeScore',
 'TenX.HP1_ref_reason_insertSizeScore',
 'TenX.HP2_amb_reason_insertSizeScore_insertSizeScore',
 'TenX.HP2_amb_reason_insertSizeScore_orientation',
 'TenX.HP2_amb_reason_orientation_insertSizeScore',
 'TenX.HP2_ref_reason_insertSizeScore',
 'id',
 'pacbio.GT',
 'sample',
 'type'}

In [18]:
### Drop columns that are not shared by both dataframes
df_test.drop(['Ill300x.amb_reason_alignmentScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['Ill300x.amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['Ill300x.amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['Ill250.amb_reason_insertSizeScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['Ill250.amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['IllMP.amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_amb_reason_insertSizeScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP1_ref_reason_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_amb_reason_insertSizeScore_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_amb_reason_insertSizeScore_orientation'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_amb_reason_orientation_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['TenX.HP2_ref_reason_insertSizeScore'], axis=1, inplace = True)
df_test.drop(['GTconswithoutIll300x.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutIll250.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutIllMP.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutTenX.GT'], axis=1, inplace = True)
df_test.drop(['GTconswithoutpacbio.GT'], axis=1, inplace = True)
df_test.drop(['Ill300x.GT'], axis=1, inplace = True)
df_test.drop(['Ill250.GT'], axis=1, inplace = True)
df_test.drop(['IllMP.GT'], axis=1, inplace = True)
df_test.drop(['TenX.GT'], axis=1, inplace = True)
df_test.drop(['pacbio.GT'], axis=1, inplace = True)
df_test.drop(['GTcons'], axis=1, inplace = True)
df_test.drop(['GTconflict'], axis=1, inplace = True)
df_test.drop(['GTsupp'], axis=1, inplace = True)
df_test.drop(['sample'], axis=1, inplace = True)
df_test.drop(['SVtype'], axis=1, inplace = True)
df_test.drop(['type'], axis=1, inplace = True)
df_test.drop(['id'], axis=1, inplace = True)

***
Impute missing values using KNN
***

In [19]:
# Store training data in a new variable which will be converted to a matrix
X = df_train
X.shape

(1794, 176)

In [20]:
# Convert dataframe to matrix
X=X.as_matrix()

#Imput missing values from three closest observations
X_imputed=KNN(k=3).complete(X)
X=pd.DataFrame(X_imputed)

Imputing row 1/1794 with 22 missing, elapsed time: 2.774
Imputing row 101/1794 with 1 missing, elapsed time: 2.822
Imputing row 201/1794 with 1 missing, elapsed time: 2.826
Imputing row 301/1794 with 1 missing, elapsed time: 2.832
Imputing row 401/1794 with 1 missing, elapsed time: 2.836
Imputing row 501/1794 with 1 missing, elapsed time: 2.845
Imputing row 601/1794 with 1 missing, elapsed time: 2.849
Imputing row 701/1794 with 1 missing, elapsed time: 2.856
Imputing row 801/1794 with 1 missing, elapsed time: 2.863
Imputing row 901/1794 with 0 missing, elapsed time: 2.891
Imputing row 1001/1794 with 0 missing, elapsed time: 2.895
Imputing row 1101/1794 with 0 missing, elapsed time: 2.900
Imputing row 1201/1794 with 0 missing, elapsed time: 2.907
Imputing row 1301/1794 with 0 missing, elapsed time: 2.910
Imputing row 1401/1794 with 0 missing, elapsed time: 2.923
Imputing row 1501/1794 with 0 missing, elapsed time: 2.928
Imputing row 1601/1794 with 0 missing, elapsed time: 2.933
Imputing

In [21]:
# Store header values in a list, will be used later to re-label the matrix post KNN imputation
dftrain_header = list(df_train.columns.values)
X.columns = dftrain_header
X.head(3)

Unnamed: 0,chrom,start,end,Size,Ill250.alt_alnScore_mean,Ill250.alt_alnScore_std,Ill250.alt_count,Ill250.alt_insertSize_mean,Ill250.alt_insertSize_std,Ill250.alt_reason_alignmentScore,...,pacbio.ref_insertSize_mean,pacbio.ref_insertSize_std,pacbio.ref_reason_alignmentScore,refN_cnt,refN_pct,segdup_cnt,segdup_pct,tandemrep_cnt,tandemrep_pct,GTcons
0,1.0,72766323.0,72811839.0,-45516.0,977.7,17.343875,20.0,451.85,87.001882,17.0,...,11040.571047,4079.336822,58.165391,0.0,0.0,4.0,0.076523,110.0,0.059979,1.0
1,1.0,96139975.0,96142391.0,-2416.0,988.0,4.992302,26.0,449.0,98.968526,23.0,...,10075.07692,4504.705061,26.0,0.0,0.0,0.0,0.0,2.0,0.014487,1.0
2,1.0,109690878.0,109690918.0,-39.0,956.666667,34.439964,18.0,408.277778,67.678,18.0,...,12582.47059,3520.607336,17.0,0.0,0.0,0.0,0.0,1.0,0.4,1.0


In [22]:
# Store Labels in a new 'Y' DataFrame
Y = pd.DataFrame()
Y = X['GTcons']

In [23]:
# Order features
X4 = X.reindex_axis(sorted(X.columns), axis=1)

In [24]:
# Write the imputed, column-sorted frame to disk without the row index
X4.to_csv(path_or_buf='post_imp_compare.csv', index=False)

Random Forest Model

In [25]:
# Train Test Split
# Train on 70% of the data and test on 30%
X_train, X_test, y_train, y_test = train_test_split(X4, Y, test_size=0.3)

In [28]:
# Random forest of 100 trees with a fixed seed; class_weight="balanced" asks
# scikit-learn to weight classes inversely to their frequency in y_train.
model2 = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=4,
)
model2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=4, verbose=0, warm_start=False)

In [31]:
# Predicted class for every held-out row
pred = model2.predict(X=X_test)

In [33]:
# Per-class probabilities for every held-out row (one column per class)
pred_prob = model2.predict_proba(X=X_test)

In [34]:
# Attach the three class-probability columns to the held-out rows.
# Two things are needed for a row-by-row join:
#   * axis=1 - pd.concat stacks frames vertically by default, which would
#     append the probabilities as extra rows instead of extra columns;
#   * index=X_test.index - the probability frame would otherwise get a fresh
#     0..n-1 index that does not line up with the row labels of X_test.
pre_post = pd.concat(
    [X_test, pd.DataFrame(pred_prob, columns=['1', '2', '3'], index=X_test.index)],
    axis=1,
)

In [35]:
# Add the predictions as a new column. `assign` returns a new frame, so the
# column is not written into the object handed back by train_test_split -
# that write is what triggered pandas' SettingWithCopyWarning.
X_test = X_test.assign(predicted_label=pred)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [37]:
# Give the probability columns readable class names.
# NOTE(review): columns '1'/'2'/'3' are created on the probability frame that
# goes into pre_post; X_test does not appear to have them, in which case this
# rename changes nothing - confirm which frame was meant.
X_test.rename(
    columns={
        '1': 'Homozygous_Reference',
        '2': 'Heterozygous_Variant',
        '3': 'Homozygous_Variant',
    },
    inplace=True,
)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [39]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the held-out rows: rows are the true genotype, columns
# the predicted one. Both vectors are read back from X_test.
# NOTE(review): the true labels come from X_test['GTcons'], i.e. the label
# column is part of the feature frame the model was given - the scores below
# should be re-checked with that column excluded from the features.
ytest = X_test['GTcons']
predict = X_test['predicted_label']
print(confusion_matrix(y_true=ytest, y_pred=predict))

[[296   1   0]
 [  0 185   0]
 [  0   1  56]]
