### Crowdvariant Analysis
<br>

**Summary**

1. Data collection and data cleaning
2. Data preprocessing
3. Machine Learning analysis

** Notes **

- Gathered crowdsourced labels from the crowdvariant study
- high confidence labels only available for HG002
    - Are there other labels?
- All deletions
- 1514 data points total
    - 552 Heterozygous Variant (CrowdVar Label = 1)  [Confidence: >=84%]
    - 959 Homozygous Variant (CrowdVar Label = 0)    [Confidence: >=84%]
    - 3   Homozygous Reference (CrowdVar Label = 2)  [Confidence: >=91%]
    - 1   Unknown

***
Data Collection and Cleaning
***

** Train/Test Dataset **

**Labels**

- Labels were collected from the following article

![Figure 1](https://raw.githubusercontent.com/lesleymaraina/NIST/master/Notebooks/CrowdVariant/Screen%20Shot%202017-06-14%20at%2010.41.43%20AM.png)

http://biorxiv.org/content/early/2016/12/13/093526

**Features**

![Figure 1](https://raw.githubusercontent.com/lesleymaraina/NIST/master/Notebooks/CrowdVariant/Screen%20Shot%202017-06-14%20at%2012.09.23%20PM.png)

** Prediction Dataset **

Newly selected datapoints
June 12 2017

HG002 Deletions

***
Data Preprocessing
***

- Drop columns with labels
    
    'GTcons', 'GTconflict',	'GTsupp', 'CN0_prob', 'CN1_prob', 'CN2_prob', 'Label', 'TenX.GT', 'pacbio.GT', 'IllMP.GT', 'Ill250.GT', 'Ill300x.GT'



- Drop irrelevant columns
    
    'chrom', 'start', 'end', 'sample'


***
Machine Learning
***

Train machine learning classifier with labeled CrowdVariant Data

In [175]:
"""
Imports
"""
import pandas as pd
import numpy as np
from fancyimpute import KNN
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneOut
from scipy.stats import ks_2samp
from scipy import stats
from matplotlib import pyplot
from sklearn import preprocessing
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
import sqlite3
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA as sklearnPCA
import plotly.plotly as py
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn import preprocessing
from ggplot import *
from bokeh.charts import TimeSeries
from bokeh.models import HoverTool
from bokeh.plotting import show
from bokeh.charts import Scatter, Histogram, output_file, show
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.io import output_notebook
import bokeh.palettes as palettes
from bokeh.models import HoverTool, BoxSelectTool, Legend
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)

In [176]:
### Import Data
df_crowd = pd.read_csv('CrowdVar.Train_HG002.csv')

In [177]:
df_crowd.head(3)

Unnamed: 0,chrom,start,end,sample,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,...,TenX.GT,size,GTcons,GTconflict,GTsupp,CN0_prob,CN1_prob,CN2_prob,Svsize,Label
0,21,10842334,10842437,HG002,,,,,,,...,-1.0,103,1,-1,2,0.02,0.04,0.94,103,2
1,10,42356979,42357254,HG002,,,,,,,...,-1.0,275,1,-1,1,0.0,0.02,0.98,275,2
2,17,75845891,75846109,HG002,580.681818,15.496001,22.0,1126.909091,202.803736,0.0,...,-1.0,218,0,-1,2,0.0,0.09,0.91,218,2


In [178]:
### Drop irrelevant columns
df_crowd.drop(['GTcons'], axis=1, inplace = True)
df_crowd.drop(['GTconflict'], axis=1, inplace = True)
df_crowd.drop(['GTsupp'], axis=1, inplace = True)
# df_crowd.drop('SVtype', axis=1)
# df_crowd.drop('type',axis=1)
df_crowd.drop(['start'],axis=1, inplace = True)
df_crowd.drop(['end'],axis=1, inplace = True)
df_crowd.drop(['chrom'],axis=1, inplace = True)
# df_crowd.drop('Size',axis=1)
df_crowd.drop(['CN0_prob'],axis=1, inplace = True)
df_crowd.drop(['CN1_prob'],axis=1, inplace = True)
df_crowd.drop(['CN2_prob'],axis=1, inplace = True)
df_crowd.drop(['TenX.GT'],axis=1, inplace = True)
df_crowd.drop(['pacbio.GT'],axis=1, inplace = True)
df_crowd.drop(['IllMP.GT'],axis=1, inplace = True)
df_crowd.drop(['Ill250.GT'],axis=1, inplace = True)
df_crowd.drop(['Ill300x.GT'],axis=1, inplace = True)

# df_crowd.drop('GTconflict', axis=1)
# df_crowd.drop('GTsupp', axis=1)
# # df_crowd.drop('SVtype', axis=1)
# # df_crowd.drop('type',axis=1)
# df_crowd.drop('start',axis=1)
# df_crowd.drop('end',axis=1)
# df_crowd.drop('chrom',axis=1)
# # df_crowd.drop('Size',axis=1)
# df_crowd.drop('CN0_prob',axis=1)
# df_crowd.drop('CN1_prob',axis=1)
# df_crowd.drop('CN2_prob',axis=1)
# df_crowd.drop('TenX.GT',axis=1)
# df_crowd.drop('pacbio.GT',axis=1)
# df_crowd.drop('IllMP.GT',axis=1)
# df_crowd.drop('Ill250.GT',axis=1)
# df_crowd.drop('Ill300x.GT',axis=1)

In [179]:
df_crowd.drop(['sample'],axis=1, inplace = True)

In [180]:
df_crowd.to_csv('df_crowd_headers.csv', index=False)

In [181]:
df_crowd.head(3)

Unnamed: 0,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,Ill300x.alt_reason_insertSizeScore,Ill300x.alt_reason_orientation,Ill300x.amb_alnScore_mean,Ill300x.amb_alnScore_std,...,TenX.HP2_ref_alnScore_mean,TenX.HP2_ref_alnScore_std,TenX.HP2_ref_count,TenX.HP2_ref_insertSize_mean,TenX.HP2_ref_insertSize_std,TenX.HP2_ref_reason_alignmentScore,TenX.HP2_ref_reason_orientation,size,Svsize,Label
0,,,,,,,,,,,...,503.5,25.5,2.0,281.5,26.5,2.0,0.0,103,103,2
1,,,,,,,,,,,...,528.138889,20.997556,36.0,339.777778,122.282485,27.0,9.0,275,275,2
2,580.681818,15.496001,22.0,1126.909091,202.803736,0.0,22.0,0.0,547.209939,84.570535,...,517.333333,40.540789,3.0,819.666667,361.818862,3.0,0.0,218,218,2


In [182]:
# df = pd.read_csv('/Users/lmc2/Desktop/NIHFAES/FinalProject/Train/Data/CrowdVar.Train_250bp_HG002.csv') 
# X = pd.read_csv('/Users/lmc2/Desktop/NIHFAES/FinalProject/Train/Data/CrowdVar.Train_250bp_HG002.csv')
# X.drop(["sample", "chrom", "CN0_prob", "CN1_prob", "CN2_prob", "GTcons", "GTconflict", "GTsupp"], axis=1, inplace=True)
X = df_crowd

In [183]:
X.head(3)

Unnamed: 0,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,Ill300x.alt_reason_insertSizeScore,Ill300x.alt_reason_orientation,Ill300x.amb_alnScore_mean,Ill300x.amb_alnScore_std,...,TenX.HP2_ref_alnScore_mean,TenX.HP2_ref_alnScore_std,TenX.HP2_ref_count,TenX.HP2_ref_insertSize_mean,TenX.HP2_ref_insertSize_std,TenX.HP2_ref_reason_alignmentScore,TenX.HP2_ref_reason_orientation,size,Svsize,Label
0,,,,,,,,,,,...,503.5,25.5,2.0,281.5,26.5,2.0,0.0,103,103,2
1,,,,,,,,,,,...,528.138889,20.997556,36.0,339.777778,122.282485,27.0,9.0,275,275,2
2,580.681818,15.496001,22.0,1126.909091,202.803736,0.0,22.0,0.0,547.209939,84.570535,...,517.333333,40.540789,3.0,819.666667,361.818862,3.0,0.0,218,218,2


In [184]:
# Convert dataframe to matrix
X=X.as_matrix()

#Imput missing values from three closest observations
X_imputed=KNN(k=3).complete(X)
X=pd.DataFrame(X_imputed)

Imputing row 1/1515 with 59 missing, elapsed time: 1.894
Imputing row 101/1515 with 0 missing, elapsed time: 1.907
Imputing row 201/1515 with 0 missing, elapsed time: 1.909
Imputing row 301/1515 with 0 missing, elapsed time: 1.911
Imputing row 401/1515 with 0 missing, elapsed time: 1.912
Imputing row 501/1515 with 0 missing, elapsed time: 1.913
Imputing row 601/1515 with 0 missing, elapsed time: 1.915
Imputing row 701/1515 with 0 missing, elapsed time: 1.917
Imputing row 801/1515 with 0 missing, elapsed time: 1.918
Imputing row 901/1515 with 0 missing, elapsed time: 1.920
Imputing row 1001/1515 with 0 missing, elapsed time: 1.922
Imputing row 1101/1515 with 0 missing, elapsed time: 1.923
Imputing row 1201/1515 with 0 missing, elapsed time: 1.927
Imputing row 1301/1515 with 0 missing, elapsed time: 1.928
Imputing row 1401/1515 with 0 missing, elapsed time: 1.930
Imputing row 1501/1515 with 0 missing, elapsed time: 1.932


In [185]:
# Add header to the data frame
X.columns=['Ill300x.alt_alnScore_mean','Ill300x.alt_alnScore_std','Ill300x.alt_count','Ill300x.alt_insertSize_mean','Ill300x.alt_insertSize_std','Ill300x.alt_reason_alignmentScore','Ill300x.alt_reason_insertSizeScore','Ill300x.alt_reason_orientation','Ill300x.amb_alnScore_mean','Ill300x.amb_alnScore_std','Ill300x.amb_count','Ill300x.amb_insertSize_mean','Ill300x.amb_insertSize_std','Ill300x.amb_reason_alignmentScore_alignmentScore','Ill300x.amb_reason_alignmentScore_orientation','Ill300x.amb_reason_flanking','Ill300x.amb_reason_insertSizeScore_alignmentScore','Ill300x.amb_reason_insertSizeScore_insertSizeScore','Ill300x.amb_reason_multimapping','Ill300x.amb_reason_orientation_alignmentScore','Ill300x.amb_reason_orientation_orientation','Ill300x.amb_reason_same_scores','Ill300x.ref_alnScore_mean','Ill300x.ref_alnScore_std','Ill300x.ref_count','Ill300x.ref_insertSize_mean','Ill300x.ref_insertSize_std','Ill300x.ref_reason_alignmentScore','Ill300x.ref_reason_insertSizeScore','Ill300x.ref_reason_orientation','Ill250.alt_alnScore_mean','Ill250.alt_alnScore_std','Ill250.alt_count','Ill250.alt_insertSize_mean','Ill250.alt_insertSize_std','Ill250.alt_reason_alignmentScore','Ill250.alt_reason_insertSizeScore','Ill250.alt_reason_orientation','Ill250.amb_alnScore_mean','Ill250.amb_alnScore_std','Ill250.amb_count','Ill250.amb_insertSize_mean','Ill250.amb_insertSize_std','Ill250.amb_reason_alignmentScore_alignmentScore','Ill250.amb_reason_alignmentScore_orientation','Ill250.amb_reason_flanking','Ill250.amb_reason_insertSizeScore_alignmentScore','Ill250.amb_reason_insertSizeScore_insertSizeScore','Ill250.amb_reason_multimapping','Ill250.amb_reason_orientation_alignmentScore','Ill250.amb_reason_orientation_orientation','Ill250.amb_reason_same_scores','Ill250.ref_alnScore_mean','Ill250.ref_alnScore_std','Ill250.ref_count','Ill250.ref_insertSize_mean','Ill250.ref_insertSize_std','Ill250.ref_reason_alignmentScore','Ill250.ref_reason_orientation','IllMP.alt_alnScore_mean','IllMP.alt_alnScore_std','IllMP.alt_count','IllMP.alt_insertSize_mean','IllMP.alt_insertSize_std','IllMP.alt_reason_alignmentScore','IllMP.alt_reason_insertSizeScore','IllMP.alt_reason_orientation','IllMP.amb_alnScore_mean','IllMP.amb_alnScore_std','IllMP.amb_count','IllMP.amb_insertSize_mean','IllMP.amb_insertSize_std','IllMP.amb_reason_alignmentScore_alignmentScore','IllMP.amb_reason_alignmentScore_orientation','IllMP.amb_reason_flanking','IllMP.amb_reason_insertSizeScore_insertSizeScore','IllMP.amb_reason_multimapping','IllMP.amb_reason_orientation_alignmentScore','IllMP.amb_reason_orientation_orientation','IllMP.amb_reason_same_scores','IllMP.ref_alnScore_mean','IllMP.ref_alnScore_std','IllMP.ref_count','IllMP.ref_insertSize_mean','IllMP.ref_insertSize_std','IllMP.ref_reason_alignmentScore','IllMP.ref_reason_insertSizeScore','IllMP.ref_reason_orientation','pacbio.alt_alnScore_mean','pacbio.alt_alnScore_std','pacbio.alt_count','pacbio.alt_insertSize_mean','pacbio.alt_insertSize_std','pacbio.alt_reason_alignmentScore','pacbio.amb_alnScore_mean','pacbio.amb_alnScore_std','pacbio.amb_count','pacbio.amb_insertSize_mean','pacbio.amb_insertSize_std','pacbio.amb_reason_alignmentScore_alignmentScore','pacbio.amb_reason_flanking','pacbio.amb_reason_multimapping','pacbio.amb_reason_same_scores','pacbio.ref_alnScore_mean','pacbio.ref_alnScore_std','pacbio.ref_count','pacbio.ref_insertSize_mean','pacbio.ref_insertSize_std','pacbio.ref_reason_alignmentScore','TenX.HP1_alt_alnScore_mean','TenX.HP1_alt_alnScore_std','TenX.HP1_alt_count','TenX.HP1_alt_insertSize_mean','TenX.HP1_alt_insertSize_std','TenX.HP1_alt_reason_alignmentScore','TenX.HP1_alt_reason_insertSizeScore','TenX.HP1_alt_reason_orientation','TenX.HP1_amb_alnScore_mean','TenX.HP1_amb_alnScore_std','TenX.HP1_amb_count','TenX.HP1_amb_insertSize_mean','TenX.HP1_amb_insertSize_std','TenX.HP1_amb_reason_alignmentScore_alignmentScore','TenX.HP1_amb_reason_alignmentScore_orientation','TenX.HP1_amb_reason_flanking','TenX.HP1_amb_reason_insertSizeScore_alignmentScore','TenX.HP1_amb_reason_multimapping','TenX.HP1_amb_reason_orientation_alignmentScore','TenX.HP1_amb_reason_orientation_orientation','TenX.HP1_amb_reason_same_scores','TenX.HP1_ref_alnScore_mean','TenX.HP1_ref_alnScore_std','TenX.HP1_ref_count','TenX.HP1_ref_insertSize_mean','TenX.HP1_ref_insertSize_std','TenX.HP1_ref_reason_alignmentScore','TenX.HP1_ref_reason_orientation','TenX.HP2_alt_alnScore_mean','TenX.HP2_alt_alnScore_std','TenX.HP2_alt_count','TenX.HP2_alt_insertSize_mean','TenX.HP2_alt_insertSize_std','TenX.HP2_alt_reason_alignmentScore','TenX.HP2_alt_reason_insertSizeScore','TenX.HP2_alt_reason_orientation','TenX.HP2_amb_alnScore_mean','TenX.HP2_amb_alnScore_std','TenX.HP2_amb_count','TenX.HP2_amb_insertSize_mean','TenX.HP2_amb_insertSize_std','TenX.HP2_amb_reason_alignmentScore_alignmentScore','TenX.HP2_amb_reason_alignmentScore_orientation','TenX.HP2_amb_reason_flanking','TenX.HP2_amb_reason_insertSizeScore_alignmentScore','TenX.HP2_amb_reason_multimapping','TenX.HP2_amb_reason_orientation_alignmentScore','TenX.HP2_amb_reason_orientation_orientation','TenX.HP2_amb_reason_same_scores','TenX.HP2_ref_alnScore_mean','TenX.HP2_ref_alnScore_std','TenX.HP2_ref_count','TenX.HP2_ref_insertSize_mean','TenX.HP2_ref_insertSize_std','TenX.HP2_ref_reason_alignmentScore','TenX.HP2_ref_reason_orientation','size','Svsize','Label']

In [186]:
X.head(3)

Unnamed: 0,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,Ill300x.alt_reason_insertSizeScore,Ill300x.alt_reason_orientation,Ill300x.amb_alnScore_mean,Ill300x.amb_alnScore_std,...,TenX.HP2_ref_alnScore_mean,TenX.HP2_ref_alnScore_std,TenX.HP2_ref_count,TenX.HP2_ref_insertSize_mean,TenX.HP2_ref_insertSize_std,TenX.HP2_ref_reason_alignmentScore,TenX.HP2_ref_reason_orientation,size,Svsize,Label
0,586.375534,9.673593,229.624201,622.346332,167.207706,136.239193,93.385008,0.0,536.0116,89.944503,...,503.5,25.5,2.0,281.5,26.5,2.0,0.0,103.0,103.0,2.0
1,585.158132,8.614902,226.527365,638.034493,156.638135,118.938271,107.589093,0.0,524.344866,93.783706,...,528.138889,20.997556,36.0,339.777778,122.282485,27.0,9.0,275.0,275.0,2.0
2,580.681818,15.496001,22.0,1126.909091,202.803736,0.0,22.0,0.0,547.209939,84.570535,...,517.333333,40.540789,3.0,819.666667,361.818862,3.0,0.0,218.0,218.0,2.0


In [187]:
# Store Labels in a new 'Y' DataFrame
Y = pd.DataFrame()
Y['Label'] = X['Label']
#Y = X.pop('Label')

In [247]:
# Remove labels from the X dataframe: Select all columns except for the label column
X=X[['Ill300x.alt_alnScore_mean','Ill300x.alt_alnScore_std','Ill300x.alt_count','Ill300x.alt_insertSize_mean','Ill300x.alt_insertSize_std','Ill300x.alt_reason_alignmentScore','Ill300x.alt_reason_insertSizeScore','Ill300x.alt_reason_orientation','Ill300x.amb_alnScore_mean','Ill300x.amb_alnScore_std','Ill300x.amb_count','Ill300x.amb_insertSize_mean','Ill300x.amb_insertSize_std','Ill300x.amb_reason_alignmentScore_alignmentScore','Ill300x.amb_reason_alignmentScore_orientation','Ill300x.amb_reason_flanking','Ill300x.amb_reason_insertSizeScore_alignmentScore','Ill300x.amb_reason_insertSizeScore_insertSizeScore','Ill300x.amb_reason_multimapping','Ill300x.amb_reason_orientation_alignmentScore','Ill300x.amb_reason_orientation_orientation','Ill300x.amb_reason_same_scores','Ill300x.ref_alnScore_mean','Ill300x.ref_alnScore_std','Ill300x.ref_count','Ill300x.ref_insertSize_mean','Ill300x.ref_insertSize_std','Ill300x.ref_reason_alignmentScore','Ill300x.ref_reason_insertSizeScore','Ill300x.ref_reason_orientation','Ill250.alt_alnScore_mean','Ill250.alt_alnScore_std','Ill250.alt_count','Ill250.alt_insertSize_mean','Ill250.alt_insertSize_std','Ill250.alt_reason_alignmentScore','Ill250.alt_reason_insertSizeScore','Ill250.alt_reason_orientation','Ill250.amb_alnScore_mean','Ill250.amb_alnScore_std','Ill250.amb_count','Ill250.amb_insertSize_mean','Ill250.amb_insertSize_std','Ill250.amb_reason_alignmentScore_alignmentScore','Ill250.amb_reason_alignmentScore_orientation','Ill250.amb_reason_flanking','Ill250.amb_reason_insertSizeScore_alignmentScore','Ill250.amb_reason_multimapping','Ill250.amb_reason_orientation_alignmentScore','Ill250.amb_reason_orientation_orientation','Ill250.amb_reason_same_scores','Ill250.ref_alnScore_mean','Ill250.ref_alnScore_std','Ill250.ref_count','Ill250.ref_insertSize_mean','Ill250.ref_insertSize_std','Ill250.ref_reason_alignmentScore','Ill250.ref_reason_orientation','IllMP.alt_alnScore_mean','IllMP.alt_alnScore_std','IllMP.alt_count','IllMP.alt_insertSize_mean','IllMP.alt_insertSize_std','IllMP.alt_reason_alignmentScore','IllMP.alt_reason_insertSizeScore','IllMP.alt_reason_orientation','IllMP.amb_alnScore_mean','IllMP.amb_alnScore_std','IllMP.amb_count','IllMP.amb_insertSize_mean','IllMP.amb_insertSize_std','IllMP.amb_reason_alignmentScore_alignmentScore','IllMP.amb_reason_alignmentScore_orientation','IllMP.amb_reason_flanking','IllMP.amb_reason_insertSizeScore_insertSizeScore','IllMP.amb_reason_multimapping','IllMP.amb_reason_orientation_alignmentScore','IllMP.amb_reason_orientation_orientation','IllMP.amb_reason_same_scores','IllMP.ref_alnScore_mean','IllMP.ref_alnScore_std','IllMP.ref_count','IllMP.ref_insertSize_mean','IllMP.ref_insertSize_std','IllMP.ref_reason_alignmentScore','IllMP.ref_reason_insertSizeScore','IllMP.ref_reason_orientation','pacbio.alt_alnScore_mean','pacbio.alt_alnScore_std','pacbio.alt_count','pacbio.alt_insertSize_mean','pacbio.alt_insertSize_std','pacbio.alt_reason_alignmentScore','pacbio.amb_alnScore_mean','pacbio.amb_alnScore_std','pacbio.amb_count','pacbio.amb_insertSize_mean','pacbio.amb_insertSize_std','pacbio.amb_reason_alignmentScore_alignmentScore','pacbio.amb_reason_flanking','pacbio.amb_reason_multimapping','pacbio.amb_reason_same_scores','pacbio.ref_alnScore_mean','pacbio.ref_alnScore_std','pacbio.ref_count','pacbio.ref_insertSize_mean','pacbio.ref_insertSize_std','pacbio.ref_reason_alignmentScore','TenX.HP1_alt_alnScore_mean','TenX.HP1_alt_alnScore_std','TenX.HP1_alt_count','TenX.HP1_alt_insertSize_mean','TenX.HP1_alt_insertSize_std','TenX.HP1_alt_reason_alignmentScore','TenX.HP1_alt_reason_insertSizeScore','TenX.HP1_alt_reason_orientation','TenX.HP1_amb_alnScore_mean','TenX.HP1_amb_alnScore_std','TenX.HP1_amb_count','TenX.HP1_amb_insertSize_mean','TenX.HP1_amb_insertSize_std','TenX.HP1_amb_reason_alignmentScore_alignmentScore','TenX.HP1_amb_reason_alignmentScore_orientation','TenX.HP1_amb_reason_flanking','TenX.HP1_amb_reason_insertSizeScore_alignmentScore','TenX.HP1_amb_reason_multimapping','TenX.HP1_amb_reason_orientation_alignmentScore','TenX.HP1_amb_reason_orientation_orientation','TenX.HP1_amb_reason_same_scores','TenX.HP1_ref_alnScore_mean','TenX.HP1_ref_alnScore_std','TenX.HP1_ref_count','TenX.HP1_ref_insertSize_mean','TenX.HP1_ref_insertSize_std','TenX.HP1_ref_reason_alignmentScore','TenX.HP1_ref_reason_orientation','TenX.HP2_alt_alnScore_mean','TenX.HP2_alt_alnScore_std','TenX.HP2_alt_count','TenX.HP2_alt_insertSize_mean','TenX.HP2_alt_insertSize_std','TenX.HP2_alt_reason_alignmentScore','TenX.HP2_alt_reason_insertSizeScore','TenX.HP2_alt_reason_orientation','TenX.HP2_amb_alnScore_mean','TenX.HP2_amb_alnScore_std','TenX.HP2_amb_count','TenX.HP2_amb_insertSize_mean','TenX.HP2_amb_insertSize_std','TenX.HP2_amb_reason_alignmentScore_alignmentScore','TenX.HP2_amb_reason_alignmentScore_orientation','TenX.HP2_amb_reason_flanking','TenX.HP2_amb_reason_insertSizeScore_alignmentScore','TenX.HP2_amb_reason_multimapping','TenX.HP2_amb_reason_orientation_alignmentScore','TenX.HP2_amb_reason_orientation_orientation','TenX.HP2_amb_reason_same_scores','TenX.HP2_ref_alnScore_mean','TenX.HP2_ref_alnScore_std','TenX.HP2_ref_count','TenX.HP2_ref_insertSize_mean','TenX.HP2_ref_insertSize_std','TenX.HP2_ref_reason_alignmentScore','TenX.HP2_ref_reason_orientation']]

In [248]:
X.head()

Unnamed: 0,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,Ill300x.alt_reason_insertSizeScore,Ill300x.alt_reason_orientation,Ill300x.amb_alnScore_mean,Ill300x.amb_alnScore_std,...,TenX.HP2_amb_reason_orientation_alignmentScore,TenX.HP2_amb_reason_orientation_orientation,TenX.HP2_amb_reason_same_scores,TenX.HP2_ref_alnScore_mean,TenX.HP2_ref_alnScore_std,TenX.HP2_ref_count,TenX.HP2_ref_insertSize_mean,TenX.HP2_ref_insertSize_std,TenX.HP2_ref_reason_alignmentScore,TenX.HP2_ref_reason_orientation
0,586.375534,9.673593,229.624201,622.346332,167.207706,136.239193,93.385008,0.0,536.0116,89.944503,...,0.0,107.0,35.0,503.5,25.5,2.0,281.5,26.5,2.0,0.0
1,585.158132,8.614902,226.527365,638.034493,156.638135,118.938271,107.589093,0.0,524.344866,93.783706,...,11.0,383.0,0.0,528.138889,20.997556,36.0,339.777778,122.282485,27.0,9.0
2,580.681818,15.496001,22.0,1126.909091,202.803736,0.0,22.0,0.0,547.209939,84.570535,...,0.0,13.0,0.0,517.333333,40.540789,3.0,819.666667,361.818862,3.0,0.0
3,588.541516,7.395182,277.0,625.779783,158.419809,169.0,108.0,0.0,523.789116,95.441592,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,577.288462,7.133354,156.0,606.76282,168.474152,126.0,30.0,0.0,540.320059,78.973119,...,0.0,4.0,3.0,520.0,0.0,1.0,136.0,0.0,1.0,0.0


In [249]:
# Train Test Split
# Train on 70% of the data and test on 30%
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=0)

In [250]:
# Train Random Forest Classifier
model = RandomForestClassifier() 
#out of bag samples to estimate general accuracy
model.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [264]:
HG002_pred_2 = pd.read_csv('/Users/lmc2/NIST/Notebooks/CrowdVariant/svviz.Annotate.DEL.HG002_2.csv')

In [265]:
HG002_pred = pd.read_csv('/Users/lmc2/NIST/Notebooks/CrowdVariant/svviz.Annotate.DEL.HG002_2.csv')

**Data Label Update**
GIAB GT Labels and CrowdVar GT Labels do not match

Created a new set of labels for the GIAB HG002 Deletions Dataframe
CrowdVar GT Label Key

- 0: Hom. Var.
- 1: Het Var
- 2: Hom Ref

GIAB GT Labels

- 0: Hom Ref
- 1: Het Var
- 2: Hom Var

Changed GIAB DF (svviz.Annotate.DEL.HG002_2.csv) to have the following data labels (which match CrowdVar)
New Column (GIAB_Crowd)

- 0: Hom. Var.
- 1: Het Var
- 2: Hom Ref

In [266]:
### Drop irrelevant columns
HG002_pred.drop(['GTcons'], axis=1, inplace = True)
HG002_pred.drop(['GTconflict'], axis=1, inplace = True)
HG002_pred.drop(['GTsupp'], axis=1, inplace = True)
HG002_pred.drop('SVtype', axis=1)
HG002_pred.drop('type',axis=1)
HG002_pred.drop(['type'],axis=1, inplace = True)
HG002_pred.drop(['SVtype'],axis=1, inplace = True)
HG002_pred.drop(['start'],axis=1, inplace = True)
HG002_pred.drop(['end'],axis=1, inplace = True)
HG002_pred.drop(['chrom'],axis=1, inplace = True)
HG002_pred.drop('Size',axis=1)
# HG002_pred.drop(['CN0_prob'],axis=1, inplace = True)
# HG002_pred.drop(['CN1_prob'],axis=1, inplace = True)
# HG002_pred.drop(['CN2_prob'],axis=1, inplace = True)
HG002_pred.drop(['TenX.GT'],axis=1, inplace = True)
HG002_pred.drop(['pacbio.GT'],axis=1, inplace = True)
HG002_pred.drop(['IllMP.GT'],axis=1, inplace = True)
HG002_pred.drop(['Ill250.GT'],axis=1, inplace = True)
HG002_pred.drop(['Ill300x.GT'],axis=1, inplace = True)
HG002_pred.drop(['sample'],axis=1, inplace = True)
HG002_pred.drop(['id'],axis=1, inplace = True)

# df_crowd.drop('GTconflict', axis=1)
# df_crowd.drop('GTsupp', axis=1)
# # df_crowd.drop('SVtype', axis=1)
# # df_crowd.drop('type',axis=1)
# df_crowd.drop('start',axis=1)
# df_crowd.drop('end',axis=1)
# df_crowd.drop('chrom',axis=1)
# # df_crowd.drop('Size',axis=1)
# df_crowd.drop('CN0_prob',axis=1)
# df_crowd.drop('CN1_prob',axis=1)
# df_crowd.drop('CN2_prob',axis=1)
# df_crowd.drop('TenX.GT',axis=1)
# df_crowd.drop('pacbio.GT',axis=1)
# df_crowd.drop('IllMP.GT',axis=1)
# df_crowd.drop('Ill250.GT',axis=1)
# df_crowd.drop('Ill300x.GT',axis=1)

In [267]:
HG002_pred.head()

Unnamed: 0,Size,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,Ill300x.alt_reason_insertSizeScore,Ill300x.alt_reason_orientation,Ill300x.amb_alnScore_mean,...,pacbio.ref_insertSize_mean,pacbio.ref_insertSize_std,pacbio.ref_reason_alignmentScore,tandemrep_cnt,tandemrep_pct,segdup_cnt,segdup_pct,refN_cnt,refN_pct,GIAB_Crowd
0,-178,562.0,17.962925,6.0,1044.666667,71.448505,0.0,6.0,0.0,533.576802,...,8139.555556,4575.304996,18.0,0,0.0,1,0.679775,0,0,2
1,-90,580.0,18.547237,4.0,995.25,257.372663,1.0,3.0,0.0,545.406928,...,9962.93617,4301.89526,47.0,0,0.0,1,1.0,0,0,2
2,-33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,539.46868,...,11189.14634,4525.45141,41.0,0,0.0,0,0.0,0,0,2
3,-145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,541.033679,...,9694.425532,4306.492796,47.0,0,0.0,0,0.0,0,0,2
4,-98,568.0,0.0,1.0,1093.0,0.0,0.0,1.0,0.0,536.675808,...,9724.0,4161.441384,51.0,0,0.0,0,0.0,0,0,2


In [268]:
HG002_pred.to_csv('HG002_pred.csv', index=False)

In [269]:
X2 = HG002_pred

In [270]:
# Convert dataframe to matrix
X2=X2.as_matrix()

#Imput missing values from three closest observations
X2_imputed=KNN(k=3).complete(X2)
X2=pd.DataFrame(X2_imputed)

Imputing row 1/1071 with 1 missing, elapsed time: 1.031
Imputing row 101/1071 with 1 missing, elapsed time: 1.053
Imputing row 201/1071 with 1 missing, elapsed time: 1.068
Imputing row 301/1071 with 1 missing, elapsed time: 1.080
Imputing row 401/1071 with 1 missing, elapsed time: 1.088
Imputing row 501/1071 with 1 missing, elapsed time: 1.095
Imputing row 601/1071 with 1 missing, elapsed time: 1.116
Imputing row 701/1071 with 1 missing, elapsed time: 1.133
Imputing row 801/1071 with 2 missing, elapsed time: 1.147
Imputing row 901/1071 with 2 missing, elapsed time: 1.162
Imputing row 1001/1071 with 60 missing, elapsed time: 1.173


In [272]:
X2.columns = ['Size','Ill300x.alt_alnScore_mean','Ill300x.alt_alnScore_std','Ill300x.alt_count','Ill300x.alt_insertSize_mean','Ill300x.alt_insertSize_std','Ill300x.alt_reason_alignmentScore','Ill300x.alt_reason_insertSizeScore','Ill300x.alt_reason_orientation','Ill300x.amb_alnScore_mean','Ill300x.amb_alnScore_std','Ill300x.amb_count','Ill300x.amb_insertSize_mean','Ill300x.amb_insertSize_std','Ill300x.amb_reason_alignmentScore_alignmentScore','Ill300x.amb_reason_alignmentScore_orientation','Ill300x.amb_reason_flanking','Ill300x.amb_reason_insertSizeScore_alignmentScore','Ill300x.amb_reason_insertSizeScore_insertSizeScore','Ill300x.amb_reason_insertSizeScore_orientation','Ill300x.amb_reason_multimapping','Ill300x.amb_reason_orientation_alignmentScore','Ill300x.amb_reason_orientation_orientation','Ill300x.amb_reason_same_scores','Ill300x.ref_alnScore_mean','Ill300x.ref_alnScore_std','Ill300x.ref_count','Ill300x.ref_insertSize_mean','Ill300x.ref_insertSize_std','Ill300x.ref_reason_alignmentScore','Ill300x.ref_reason_insertSizeScore','Ill300x.ref_reason_orientation','Ill250.alt_alnScore_mean','Ill250.alt_alnScore_std','Ill250.alt_count','Ill250.alt_insertSize_mean','Ill250.alt_insertSize_std','Ill250.alt_reason_alignmentScore','Ill250.alt_reason_insertSizeScore','Ill250.alt_reason_orientation','Ill250.amb_alnScore_mean','Ill250.amb_alnScore_std','Ill250.amb_count','Ill250.amb_insertSize_mean','Ill250.amb_insertSize_std','Ill250.amb_reason_alignmentScore_alignmentScore','Ill250.amb_reason_alignmentScore_orientation','Ill250.amb_reason_flanking','Ill250.amb_reason_insertSizeScore_alignmentScore','Ill250.amb_reason_multimapping','Ill250.amb_reason_orientation_alignmentScore','Ill250.amb_reason_orientation_orientation','Ill250.amb_reason_same_scores','Ill250.ref_alnScore_mean','Ill250.ref_alnScore_std','Ill250.ref_count','Ill250.ref_insertSize_mean','Ill250.ref_insertSize_std','Ill250.ref_reason_alignmentScore','Ill250.ref_reason_orientation','IllMP.alt_alnScore_mean','IllMP.alt_alnScore_std','IllMP.alt_count','IllMP.alt_insertSize_mean','IllMP.alt_insertSize_std','IllMP.alt_reason_alignmentScore','IllMP.alt_reason_insertSizeScore','IllMP.alt_reason_orientation','IllMP.amb_alnScore_mean','IllMP.amb_alnScore_std','IllMP.amb_count','IllMP.amb_insertSize_mean','IllMP.amb_insertSize_std','IllMP.amb_reason_alignmentScore_alignmentScore','IllMP.amb_reason_alignmentScore_orientation','IllMP.amb_reason_flanking','IllMP.amb_reason_insertSizeScore_alignmentScore','IllMP.amb_reason_insertSizeScore_insertSizeScore','IllMP.amb_reason_multimapping','IllMP.amb_reason_orientation_alignmentScore','IllMP.amb_reason_orientation_orientation','IllMP.amb_reason_same_scores','IllMP.ref_alnScore_mean','IllMP.ref_alnScore_std','IllMP.ref_count','IllMP.ref_insertSize_mean','IllMP.ref_insertSize_std','IllMP.ref_reason_alignmentScore','IllMP.ref_reason_insertSizeScore','IllMP.ref_reason_orientation','TenX.HP1_alt_alnScore_mean','TenX.HP1_alt_alnScore_std','TenX.HP1_alt_count','TenX.HP1_alt_insertSize_mean','TenX.HP1_alt_insertSize_std','TenX.HP1_alt_reason_alignmentScore','TenX.HP1_alt_reason_insertSizeScore','TenX.HP1_alt_reason_orientation','TenX.HP1_amb_alnScore_mean','TenX.HP1_amb_alnScore_std','TenX.HP1_amb_count','TenX.HP1_amb_insertSize_mean','TenX.HP1_amb_insertSize_std','TenX.HP1_amb_reason_alignmentScore_alignmentScore','TenX.HP1_amb_reason_alignmentScore_orientation','TenX.HP1_amb_reason_flanking','TenX.HP1_amb_reason_insertSizeScore_alignmentScore','TenX.HP1_amb_reason_insertSizeScore_insertSizeScore','TenX.HP1_amb_reason_multimapping','TenX.HP1_amb_reason_orientation_alignmentScore','TenX.HP1_amb_reason_orientation_orientation','TenX.HP1_amb_reason_same_scores','TenX.HP1_ref_alnScore_mean','TenX.HP1_ref_alnScore_std','TenX.HP1_ref_count','TenX.HP1_ref_insertSize_mean','TenX.HP1_ref_insertSize_std','TenX.HP1_ref_reason_alignmentScore','TenX.HP1_ref_reason_insertSizeScore','TenX.HP1_ref_reason_orientation','TenX.HP2_alt_alnScore_mean','TenX.HP2_alt_alnScore_std','TenX.HP2_alt_count','TenX.HP2_alt_insertSize_mean','TenX.HP2_alt_insertSize_std','TenX.HP2_alt_reason_alignmentScore','TenX.HP2_alt_reason_insertSizeScore','TenX.HP2_alt_reason_orientation','TenX.HP2_amb_alnScore_mean','TenX.HP2_amb_alnScore_std','TenX.HP2_amb_count','TenX.HP2_amb_insertSize_mean','TenX.HP2_amb_insertSize_std','TenX.HP2_amb_reason_alignmentScore_alignmentScore','TenX.HP2_amb_reason_alignmentScore_orientation','TenX.HP2_amb_reason_flanking','TenX.HP2_amb_reason_insertSizeScore_alignmentScore','TenX.HP2_amb_reason_insertSizeScore_insertSizeScore','TenX.HP2_amb_reason_multimapping','TenX.HP2_amb_reason_orientation_alignmentScore','TenX.HP2_amb_reason_orientation_insertSizeScore','TenX.HP2_amb_reason_orientation_orientation','TenX.HP2_amb_reason_same_scores','TenX.HP2_ref_alnScore_mean','TenX.HP2_ref_alnScore_std','TenX.HP2_ref_count','TenX.HP2_ref_insertSize_mean','TenX.HP2_ref_insertSize_std','TenX.HP2_ref_reason_alignmentScore','TenX.HP2_ref_reason_orientation','pacbio.alt_alnScore_mean','pacbio.alt_alnScore_std','pacbio.alt_count','pacbio.alt_insertSize_mean','pacbio.alt_insertSize_std','pacbio.alt_reason_alignmentScore','pacbio.amb_alnScore_mean','pacbio.amb_alnScore_std','pacbio.amb_count','pacbio.amb_insertSize_mean','pacbio.amb_insertSize_std','pacbio.amb_reason_alignmentScore_alignmentScore','pacbio.amb_reason_flanking','pacbio.amb_reason_multimapping','pacbio.amb_reason_same_scores','pacbio.ref_alnScore_mean','pacbio.ref_alnScore_std','pacbio.ref_count','pacbio.ref_insertSize_mean','pacbio.ref_insertSize_std','pacbio.ref_reason_alignmentScore','tandemrep_cnt','tandemrep_pct','segdup_cnt','segdup_pct','refN_cnt','refN_pct', 'GIAB_Crowd']

In [273]:
X2=X2[['Ill300x.alt_alnScore_mean','Ill300x.alt_alnScore_std','Ill300x.alt_count','Ill300x.alt_insertSize_mean','Ill300x.alt_insertSize_std','Ill300x.alt_reason_alignmentScore','Ill300x.alt_reason_insertSizeScore','Ill300x.alt_reason_orientation','Ill300x.amb_alnScore_mean','Ill300x.amb_alnScore_std','Ill300x.amb_count','Ill300x.amb_insertSize_mean','Ill300x.amb_insertSize_std','Ill300x.amb_reason_alignmentScore_alignmentScore','Ill300x.amb_reason_alignmentScore_orientation','Ill300x.amb_reason_flanking','Ill300x.amb_reason_insertSizeScore_alignmentScore','Ill300x.amb_reason_insertSizeScore_insertSizeScore','Ill300x.amb_reason_multimapping','Ill300x.amb_reason_orientation_alignmentScore','Ill300x.amb_reason_orientation_orientation','Ill300x.amb_reason_same_scores','Ill300x.ref_alnScore_mean','Ill300x.ref_alnScore_std','Ill300x.ref_count','Ill300x.ref_insertSize_mean','Ill300x.ref_insertSize_std','Ill300x.ref_reason_alignmentScore','Ill300x.ref_reason_insertSizeScore','Ill300x.ref_reason_orientation','Ill250.alt_alnScore_mean','Ill250.alt_alnScore_std','Ill250.alt_count','Ill250.alt_insertSize_mean','Ill250.alt_insertSize_std','Ill250.alt_reason_alignmentScore','Ill250.alt_reason_insertSizeScore','Ill250.alt_reason_orientation','Ill250.amb_alnScore_mean','Ill250.amb_alnScore_std','Ill250.amb_count','Ill250.amb_insertSize_mean','Ill250.amb_insertSize_std','Ill250.amb_reason_alignmentScore_alignmentScore','Ill250.amb_reason_alignmentScore_orientation','Ill250.amb_reason_flanking','Ill250.amb_reason_insertSizeScore_alignmentScore','Ill250.amb_reason_multimapping','Ill250.amb_reason_orientation_alignmentScore','Ill250.amb_reason_orientation_orientation','Ill250.amb_reason_same_scores','Ill250.ref_alnScore_mean','Ill250.ref_alnScore_std','Ill250.ref_count','Ill250.ref_insertSize_mean','Ill250.ref_insertSize_std','Ill250.ref_reason_alignmentScore','Ill250.ref_reason_orientation','IllMP.alt_alnScore_mean','IllMP.alt_alnScore_std','IllMP.alt_count','IllMP.alt_insertSize_mean','IllMP.alt_insertSize_std','IllMP.alt_reason_alignmentScore','IllMP.alt_reason_insertSizeScore','IllMP.alt_reason_orientation','IllMP.amb_alnScore_mean','IllMP.amb_alnScore_std','IllMP.amb_count','IllMP.amb_insertSize_mean','IllMP.amb_insertSize_std','IllMP.amb_reason_alignmentScore_alignmentScore','IllMP.amb_reason_alignmentScore_orientation','IllMP.amb_reason_flanking','IllMP.amb_reason_insertSizeScore_insertSizeScore','IllMP.amb_reason_multimapping','IllMP.amb_reason_orientation_alignmentScore','IllMP.amb_reason_orientation_orientation','IllMP.amb_reason_same_scores','IllMP.ref_alnScore_mean','IllMP.ref_alnScore_std','IllMP.ref_count','IllMP.ref_insertSize_mean','IllMP.ref_insertSize_std','IllMP.ref_reason_alignmentScore','IllMP.ref_reason_insertSizeScore','IllMP.ref_reason_orientation','pacbio.alt_alnScore_mean','pacbio.alt_alnScore_std','pacbio.alt_count','pacbio.alt_insertSize_mean','pacbio.alt_insertSize_std','pacbio.alt_reason_alignmentScore','pacbio.amb_alnScore_mean','pacbio.amb_alnScore_std','pacbio.amb_count','pacbio.amb_insertSize_mean','pacbio.amb_insertSize_std','pacbio.amb_reason_alignmentScore_alignmentScore','pacbio.amb_reason_flanking','pacbio.amb_reason_multimapping','pacbio.amb_reason_same_scores','pacbio.ref_alnScore_mean','pacbio.ref_alnScore_std','pacbio.ref_count','pacbio.ref_insertSize_mean','pacbio.ref_insertSize_std','pacbio.ref_reason_alignmentScore','TenX.HP1_alt_alnScore_mean','TenX.HP1_alt_alnScore_std','TenX.HP1_alt_count','TenX.HP1_alt_insertSize_mean','TenX.HP1_alt_insertSize_std','TenX.HP1_alt_reason_alignmentScore','TenX.HP1_alt_reason_insertSizeScore','TenX.HP1_alt_reason_orientation','TenX.HP1_amb_alnScore_mean','TenX.HP1_amb_alnScore_std','TenX.HP1_amb_count','TenX.HP1_amb_insertSize_mean','TenX.HP1_amb_insertSize_std','TenX.HP1_amb_reason_alignmentScore_alignmentScore','TenX.HP1_amb_reason_alignmentScore_orientation','TenX.HP1_amb_reason_flanking','TenX.HP1_amb_reason_insertSizeScore_alignmentScore','TenX.HP1_amb_reason_multimapping','TenX.HP1_amb_reason_orientation_alignmentScore','TenX.HP1_amb_reason_orientation_orientation','TenX.HP1_amb_reason_same_scores','TenX.HP1_ref_alnScore_mean','TenX.HP1_ref_alnScore_std','TenX.HP1_ref_count','TenX.HP1_ref_insertSize_mean','TenX.HP1_ref_insertSize_std','TenX.HP1_ref_reason_alignmentScore','TenX.HP1_ref_reason_orientation','TenX.HP2_alt_alnScore_mean','TenX.HP2_alt_alnScore_std','TenX.HP2_alt_count','TenX.HP2_alt_insertSize_mean','TenX.HP2_alt_insertSize_std','TenX.HP2_alt_reason_alignmentScore','TenX.HP2_alt_reason_insertSizeScore','TenX.HP2_alt_reason_orientation','TenX.HP2_amb_alnScore_mean','TenX.HP2_amb_alnScore_std','TenX.HP2_amb_count','TenX.HP2_amb_insertSize_mean','TenX.HP2_amb_insertSize_std','TenX.HP2_amb_reason_alignmentScore_alignmentScore','TenX.HP2_amb_reason_alignmentScore_orientation','TenX.HP2_amb_reason_flanking','TenX.HP2_amb_reason_insertSizeScore_alignmentScore','TenX.HP2_amb_reason_multimapping','TenX.HP2_amb_reason_orientation_alignmentScore','TenX.HP2_amb_reason_orientation_orientation','TenX.HP2_amb_reason_same_scores','TenX.HP2_ref_alnScore_mean','TenX.HP2_ref_alnScore_std','TenX.HP2_ref_count','TenX.HP2_ref_insertSize_mean','TenX.HP2_ref_insertSize_std','TenX.HP2_ref_reason_alignmentScore','TenX.HP2_ref_reason_orientation']]

In [274]:
model.predict(X2)

array([ 0.,  1.,  1., ...,  0.,  0.,  0.])

In [275]:
X2['model_pred_label'] = model.predict(X2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [276]:
X2['GIAB_Crowd'] = HG002_pred_2['GIAB_Crowd']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [278]:
test = X2['model_pred_label']
true = X2['GIAB_Crowd']

In [279]:
precision_score(true, test, average='micro') 

0.55368814192343607

In [None]:
# Display graph that shows number of unknown '-1'

In [263]:
X2.to_csv('X2_lab.csv', index=False)

In [262]:
X2.tail()

Unnamed: 0,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,Ill300x.alt_reason_insertSizeScore,Ill300x.alt_reason_orientation,Ill300x.amb_alnScore_mean,Ill300x.amb_alnScore_std,...,TenX.HP2_amb_reason_same_scores,TenX.HP2_ref_alnScore_mean,TenX.HP2_ref_alnScore_std,TenX.HP2_ref_count,TenX.HP2_ref_insertSize_mean,TenX.HP2_ref_insertSize_std,TenX.HP2_ref_reason_alignmentScore,TenX.HP2_ref_reason_orientation,model_pred_label,GTcons
3991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,542.783489,87.716016,...,3.962482,522.899125,18.379355,5.265224,422.481658,96.803787,5.265224,0.0,0.0,-1
3992,577.3875,11.565567,80.0,589.1,166.619747,79.0,1.0,0.0,534.01217,93.363463,...,1.670895,537.017135,14.195912,6.249317,333.941396,67.557745,6.249317,0.0,0.0,2
3993,560.058139,9.923375,86.0,590.302326,162.94628,85.0,1.0,0.0,540.574874,78.391526,...,3.921682,538.037569,16.740353,4.635323,381.612003,56.291199,4.635323,0.0,0.0,2
3994,571.828571,12.022564,35.0,620.914286,144.01178,35.0,0.0,0.0,533.425277,94.321792,...,0.3249,166.782188,6.051254,0.974701,127.685844,22.691912,0.974701,0.0,0.0,-1
3995,580.179104,7.443169,67.0,615.119403,146.736796,66.0,1.0,0.0,538.482993,90.914579,...,2.054873,518.780866,14.3632,2.0,354.866034,58.391841,2.0,0.0,0.0,-1


**Which are the most important features?**

NIHFAES Class

In [191]:
# Overall Scores
score = metrics.accuracy_score(y_test, model.predict(X_test))
score

0.97549481621112155

In [192]:
pred = model.predict_proba(X_test)

In [193]:
# This is the correct way to add predicted probability and labels
pred = model.predict_proba(X_test)
X9 = pd.concat([X_test, pd.DataFrame(pred, columns=['CrowdVar_0','CrowdVar_1','CrowdVar_2'])])
X9['Label'] = y_test

In [159]:
X9.head(3)

Unnamed: 0,CrowdVar_0,CrowdVar_1,CrowdVar_2,Ill250.alt_alnScore_mean,Ill250.alt_alnScore_std,Ill250.alt_count,Ill250.alt_insertSize_mean,Ill250.alt_insertSize_std,Ill250.alt_reason_alignmentScore,Ill250.alt_reason_insertSizeScore,...,pacbio.amb_reason_multimapping,pacbio.amb_reason_same_scores,pacbio.ref_alnScore_mean,pacbio.ref_alnScore_std,pacbio.ref_count,pacbio.ref_insertSize_mean,pacbio.ref_insertSize_std,pacbio.ref_reason_alignmentScore,size,Label
1222,,,,986.788462,15.092454,52.0,443.019231,86.135736,49.0,3.0,...,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,331.0,0.0
310,,,,991.5,7.996874,20.0,433.75,98.915052,19.0,1.0,...,2.0,10.0,17790.36364,6010.058989,22.0,11391.45455,3342.39583,22.0,213.0,1.0
9,,,,979.703704,10.164626,27.0,443.296296,64.292051,26.0,1.0,...,1.0,16.0,16981.6087,6413.617061,23.0,10570.13043,3814.924289,23.0,162.0,1.0


In [194]:
X9.to_csv('X9.csv', index=False)

In [195]:
X10 = pd.read_csv('X9.csv')

** Accuracy Metrics **

Here is a list of metrics: 

http://scikit-learn.org/stable/modules/classes.html#classification-metrics

http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics

Compare Model Labels (Based on Predict_Proba scores - not directly output from models)

In [207]:
model_labels = pd.read_csv('/Users/lmc2/NIST/Notebooks/CrowdVariant/CrowdVar_Model_Label_remove_0.5.csv')

- **True Labels**: labels assigned by the crowd
- **Predicted Labels**: labels from model (Based on Predict_Proba scores - not directly output from models)

In [208]:
model_labels['Label'].dtypes

dtype('int64')

In [209]:
model_labels['model_label'].dtypes

dtype('int64')

In [210]:
y_true = model_labels['Label']
y_pred = model_labels['model_label']

In [228]:
from sklearn.metrics import precision_score
#precision_score(y_true, y_pred, average='binary') 
#precision_score(y_true, y_pred, average='macro') 
#precision_score(y_true, y_pred, average=None)
precision_score(y_true, y_pred, average='micro') 
#precision_score(y_true, y_pred, average='weighted') 

0.98390151515151514

In [None]:
# The following metrics will not work, multiclass error?

In [214]:
y_true = model_labels['Label']
y_pred = model_labels['model_label']
f1_score(y_true, y_pred, average=None)


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



array([ 0.        ,  0.98084291,  0.98793363,  0.        ])

In [215]:
from sklearn.metrics import roc_auc_score
y_true = model_labels['Label']
y_pred = model_labels['model_label']
roc_auc_score(y_true, y_pred)


ValueError: multiclass format is not supported

In [56]:
X6 = pd.read_csv('X6.csv')

In [57]:
X6.shape

(1061, 170)

In [47]:
X_test.shape

(1061, 167)

In [48]:
y_test.shape

(1061,)

In [50]:
X6.shape

(2122, 170)

In [58]:
X6['Labels'] = y_test

In [62]:
X6.shape

(1061, 171)

In [60]:
X6.to_csv('X6_label.csv', index=False)

In [31]:
df_crowd.drop(['Label'],axis=1, inplace = True)

In [32]:
df_crowd.head(3)

Unnamed: 0,Ill300x.alt_alnScore_mean,Ill300x.alt_alnScore_std,Ill300x.alt_count,Ill300x.alt_insertSize_mean,Ill300x.alt_insertSize_std,Ill300x.alt_reason_alignmentScore,Ill300x.alt_reason_insertSizeScore,Ill300x.alt_reason_orientation,Ill300x.amb_alnScore_mean,Ill300x.amb_alnScore_std,...,TenX.HP2_amb_reason_same_scores,TenX.HP2_ref_alnScore_mean,TenX.HP2_ref_alnScore_std,TenX.HP2_ref_count,TenX.HP2_ref_insertSize_mean,TenX.HP2_ref_insertSize_std,TenX.HP2_ref_reason_alignmentScore,TenX.HP2_ref_reason_orientation,size,Svsize
0,,,,,,,,,,,...,35.0,503.5,25.5,2.0,281.5,26.5,2.0,0.0,103,103
1,,,,,,,,,,,...,0.0,528.138889,20.997556,36.0,339.777778,122.282485,27.0,9.0,275,275
2,580.681818,15.496001,22.0,1126.909091,202.803736,0.0,22.0,0.0,547.209939,84.570535,...,0.0,517.333333,40.540789,3.0,819.666667,361.818862,3.0,0.0,218,218


In [33]:
df_crowd.to_csv('crowded.csv', index=False)

** Impute Missing Values Using KNN** 
#Question? Deciding on K

In [49]:
#Count Number of NaN in each column before imputaiton
dfNaN=pd.DataFrame()
NaN_count_pre=df_crowd.isnull().sum()
NaN_count_pre

Ill300x.alt_alnScore_mean                             23
Ill300x.alt_alnScore_std                              23
Ill300x.alt_count                                     23
Ill300x.alt_insertSize_mean                           23
Ill300x.alt_insertSize_std                            23
Ill300x.alt_reason_alignmentScore                     23
Ill300x.alt_reason_insertSizeScore                    23
Ill300x.alt_reason_orientation                        23
Ill300x.amb_alnScore_mean                             23
Ill300x.amb_alnScore_std                              23
Ill300x.amb_count                                     23
Ill300x.amb_insertSize_mean                           23
Ill300x.amb_insertSize_std                            23
Ill300x.amb_reason_alignmentScore_alignmentScore      23
Ill300x.amb_reason_alignmentScore_orientation         23
Ill300x.amb_reason_flanking                           23
Ill300x.amb_reason_insertSizeScore_alignmentScore     23
Ill300x.amb_reason_insertSizeSc

In [54]:
# Add column names
df_crowd2.columns = ['Ill300x.alt_alnScore_mean','Ill300x.alt_alnScore_std','Ill300x.alt_count','Ill300x.alt_insertSize_mean','Ill300x.alt_insertSize_std','Ill300x.alt_reason_alignmentScore','Ill300x.alt_reason_insertSizeScore','Ill300x.alt_reason_orientation','Ill300x.amb_alnScore_mean','Ill300x.amb_alnScore_std','Ill300x.amb_count','Ill300x.amb_insertSize_mean','Ill300x.amb_insertSize_std','Ill300x.amb_reason_alignmentScore_alignmentScore','Ill300x.amb_reason_alignmentScore_orientation','Ill300x.amb_reason_flanking','Ill300x.amb_reason_insertSizeScore_alignmentScore','Ill300x.amb_reason_insertSizeScore_insertSizeScore','Ill300x.amb_reason_multimapping','Ill300x.amb_reason_orientation_alignmentScore','Ill300x.amb_reason_orientation_orientation','Ill300x.amb_reason_same_scores','Ill300x.ref_alnScore_mean','Ill300x.ref_alnScore_std','Ill300x.ref_count','Ill300x.ref_insertSize_mean','Ill300x.ref_insertSize_std','Ill300x.ref_reason_alignmentScore','Ill300x.ref_reason_insertSizeScore','Ill300x.ref_reason_orientation','Ill250.alt_alnScore_mean','Ill250.alt_alnScore_std','Ill250.alt_count','Ill250.alt_insertSize_mean','Ill250.alt_insertSize_std','Ill250.alt_reason_alignmentScore','Ill250.alt_reason_insertSizeScore','Ill250.alt_reason_orientation','Ill250.amb_alnScore_mean','Ill250.amb_alnScore_std','Ill250.amb_count','Ill250.amb_insertSize_mean','Ill250.amb_insertSize_std','Ill250.amb_reason_alignmentScore_alignmentScore','Ill250.amb_reason_alignmentScore_orientation','Ill250.amb_reason_flanking','Ill250.amb_reason_insertSizeScore_alignmentScore','Ill250.amb_reason_insertSizeScore_insertSizeScore','Ill250.amb_reason_multimapping','Ill250.amb_reason_orientation_alignmentScore','Ill250.amb_reason_orientation_orientation','Ill250.amb_reason_same_scores','Ill250.ref_alnScore_mean','Ill250.ref_alnScore_std','Ill250.ref_count','Ill250.ref_insertSize_mean','Ill250.ref_insertSize_std','Ill250.ref_reason_alignmentScore','Ill250.ref_reason_orientation','IllMP.alt_alnScore_mean','IllMP.alt_alnScore_std','IllMP.alt_count','IllMP.alt_insertSize_mean','IllMP.alt_insertSize_std','IllMP.alt_reason_alignmentScore','IllMP.alt_reason_insertSizeScore','IllMP.alt_reason_orientation','IllMP.amb_alnScore_mean','IllMP.amb_alnScore_std','IllMP.amb_count','IllMP.amb_insertSize_mean','IllMP.amb_insertSize_std','IllMP.amb_reason_alignmentScore_alignmentScore','IllMP.amb_reason_alignmentScore_orientation','IllMP.amb_reason_flanking','IllMP.amb_reason_insertSizeScore_insertSizeScore','IllMP.amb_reason_multimapping','IllMP.amb_reason_orientation_alignmentScore','IllMP.amb_reason_orientation_orientation','IllMP.amb_reason_same_scores','IllMP.ref_alnScore_mean','IllMP.ref_alnScore_std','IllMP.ref_count','IllMP.ref_insertSize_mean','IllMP.ref_insertSize_std','IllMP.ref_reason_alignmentScore','IllMP.ref_reason_insertSizeScore','IllMP.ref_reason_orientation','pacbio.alt_alnScore_mean','pacbio.alt_alnScore_std','pacbio.alt_count','pacbio.alt_insertSize_mean','pacbio.alt_insertSize_std','pacbio.alt_reason_alignmentScore','pacbio.amb_alnScore_mean','pacbio.amb_alnScore_std','pacbio.amb_count','pacbio.amb_insertSize_mean','pacbio.amb_insertSize_std','pacbio.amb_reason_alignmentScore_alignmentScore','pacbio.amb_reason_flanking','pacbio.amb_reason_multimapping','pacbio.amb_reason_same_scores','pacbio.ref_alnScore_mean','pacbio.ref_alnScore_std','pacbio.ref_count','pacbio.ref_insertSize_mean','pacbio.ref_insertSize_std','pacbio.ref_reason_alignmentScore','TenX.HP1_alt_alnScore_mean','TenX.HP1_alt_alnScore_std','TenX.HP1_alt_count','TenX.HP1_alt_insertSize_mean','TenX.HP1_alt_insertSize_std','TenX.HP1_alt_reason_alignmentScore','TenX.HP1_alt_reason_insertSizeScore','TenX.HP1_alt_reason_orientation','TenX.HP1_amb_alnScore_mean','TenX.HP1_amb_alnScore_std','TenX.HP1_amb_count','TenX.HP1_amb_insertSize_mean','TenX.HP1_amb_insertSize_std','TenX.HP1_amb_reason_alignmentScore_alignmentScore','TenX.HP1_amb_reason_alignmentScore_orientation','TenX.HP1_amb_reason_flanking','TenX.HP1_amb_reason_insertSizeScore_alignmentScore','TenX.HP1_amb_reason_multimapping','TenX.HP1_amb_reason_orientation_alignmentScore','TenX.HP1_amb_reason_orientation_orientation','TenX.HP1_amb_reason_same_scores','TenX.HP1_ref_alnScore_mean','TenX.HP1_ref_alnScore_std','TenX.HP1_ref_count','TenX.HP1_ref_insertSize_mean','TenX.HP1_ref_insertSize_std','TenX.HP1_ref_reason_alignmentScore','TenX.HP1_ref_reason_orientation','TenX.HP2_alt_alnScore_mean','TenX.HP2_alt_alnScore_std','TenX.HP2_alt_count','TenX.HP2_alt_insertSize_mean','TenX.HP2_alt_insertSize_std','TenX.HP2_alt_reason_alignmentScore','TenX.HP2_alt_reason_insertSizeScore','TenX.HP2_alt_reason_orientation','TenX.HP2_amb_alnScore_mean','TenX.HP2_amb_alnScore_std','TenX.HP2_amb_count','TenX.HP2_amb_insertSize_mean','TenX.HP2_amb_insertSize_std','TenX.HP2_amb_reason_alignmentScore_alignmentScore','TenX.HP2_amb_reason_alignmentScore_orientation','TenX.HP2_amb_reason_flanking','TenX.HP2_amb_reason_insertSizeScore_alignmentScore','TenX.HP2_amb_reason_multimapping','TenX.HP2_amb_reason_orientation_alignmentScore','TenX.HP2_amb_reason_orientation_orientation','TenX.HP2_amb_reason_same_scores','TenX.HP2_ref_alnScore_mean','TenX.HP2_ref_alnScore_std','TenX.HP2_ref_count','TenX.HP2_ref_insertSize_mean','TenX.HP2_ref_insertSize_std','TenX.HP2_ref_reason_alignmentScore','TenX.HP2_ref_reason_orientation','size','Svsize']

In [55]:
#Count Number of NaN in each column after imputaiton
NaN_count_post=df_crowd2.isnull().sum()
NaN_count_post

Ill300x.alt_alnScore_mean                             0
Ill300x.alt_alnScore_std                              0
Ill300x.alt_count                                     0
Ill300x.alt_insertSize_mean                           0
Ill300x.alt_insertSize_std                            0
Ill300x.alt_reason_alignmentScore                     0
Ill300x.alt_reason_insertSizeScore                    0
Ill300x.alt_reason_orientation                        0
Ill300x.amb_alnScore_mean                             0
Ill300x.amb_alnScore_std                              0
Ill300x.amb_count                                     0
Ill300x.amb_insertSize_mean                           0
Ill300x.amb_insertSize_std                            0
Ill300x.amb_reason_alignmentScore_alignmentScore      0
Ill300x.amb_reason_alignmentScore_orientation         0
Ill300x.amb_reason_flanking                           0
Ill300x.amb_reason_insertSizeScore_alignmentScore     0
Ill300x.amb_reason_insertSizeScore_insertSizeSco

** Scale Data **

In [56]:
#ScaledData
#Standardizefeaturesbyremovingthemeanandscalingtounitvariance
scaler=preprocessing.StandardScaler()
X=scaler.fit_transform(df_crowd2)

** tSNE Analysis **

In [58]:
# 1. SVD Decomposition - will make tSNE run faster

###########################################
#SVD
###########################################
ncomps=100
svd=TruncatedSVD(n_components=ncomps)
svd_fit=svd.fit(X)
Y=svd.fit_transform(X)
dfsvd = pd.DataFrame(Y, columns=['c{}'.format(c) for c in range(ncomps)], index=df_crowd2.index)

In [60]:
###########################################
# TSNE
########################################### 
tsne = TSNE(n_components=2, random_state=0)
Z = tsne.fit_transform(dfsvd)
dftsne = pd.DataFrame(Z, columns=['x','y'], index=dfsvd.index)
dftsne.shape

(1515, 2)

** DBSCAN **

Genenrate cluster labels from tSNE Clusters

In [73]:
dbscan = DBSCAN(min_samples=4)
labels = dbscan.fit_predict(Z)
print("Unique labels: {}".format(np.unique(labels)))
df_crowd2['clusterLabel'] = labels
df_crowd2.to_csv('CrowdVar.DEL.tSNE_minSample_4.csv', index=False)

Unique labels: [ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120]


In [87]:

'''
Data Cleaning
'''
# dftsne['tandemrep_pct'] = df_crowd['tandemrep_pct']
# dftsne['segdup_pct'] = df_crowd['segdup_pct']
# dftsne['segdup_pct'].replace(0,-1,inplace=True)
# dftsne['tandemrep_pct'].replace(0,-1,inplace=True)

# bins = [-1, 0.2, 0.5, 1]
# group_names = ['0-0.2', '0.2-0.5', '0.5-1']
# df_crowd['cat'] = pd.cut(df_crowd['segdup_pct'], bins, labels=group_names)
# df_crowd['cat2'] = pd.cut(df_crowd['tandemrep_pct'], bins, labels=group_names)

#Size Bins
bins = [20,50,100,1000,3000,9062]
df_crowd['Size'] = df_crowd['size'].abs()
group_names_size = ['20-50', '50-100', '100-1000', '1000-3000', '3000-9062']
df_crowd['size_bin'] = pd.cut(df_crowd['size'], bins, labels=group_names_size)
# dftsne['cat'] = df_crowd['cat']
# dftsne['cat2'] = df_crowd['cat2']
dftsne['size_bin'] = df_crowd['size_bin']


df_crowd['Size2'] = df_crowd['size'].apply(lambda x: x/1000)
dftsne['Size2'] = df_crowd['Size2']
dftsne['Size'] = df_crowd['size']
dftsne['GTcons'] = df_crowd['GTcons']
dftsne['sample'] = df_crowd['sample']
# dftsne['refN_pct'] = df_crowd['refN_pct']
dftsne['label'] = df_crowd['Label']
dftsne['clusterLabel'] = df_crowd2['clusterLabel']

In [85]:
output_notebook()

In [96]:
x = dftsne['x']
y = dftsne['y']
samp = dftsne['clusterLabel']
source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            samp=samp,
        )
    )

hover = HoverTool(
        tooltips=[
            ("index", "$index"),
            ("(x,y)", "($x, $y)"),
            ("Group ID", "@clusterLabel"),
        ]
    )

g = figure()
_tools_to_show = 'box_zoom,pan,save,hover,resize,reset,tap,wheel_zoom'  
g = Scatter(dftsne, x='x', y='y', color='clusterLabel', title='HG002 DEL: DBSCAN labels', palette=palettes.Category20[20], legend="top_left",tools= _tools_to_show)
output_file("DBSCAN_DEL_SVD.html")
show(g)

INFO:bokeh.core.state:Session output file 'DBSCAN_DEL_SVD.html' already exists, will be overwritten.


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=0)

In [None]:
p = Scatter(dftsne, x='x', y='y', color='GTcons', title='HG002 INS: Consensus Genotypes', legend="top_left")
output_file("/Volumes/lesleydata/SVVIZOutput/June122017/Step4/MachineLearning/Step3.tSNE/June262017/INS/1/tSNE6_INS_GTcons.html")
show(p)