<a href="https://colab.research.google.com/github/laffertybrian/W_William_Project_001/blob/main/Genetic_Variant_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer

from sklearn.decomposition import PCA

from tensorflow.keras import Sequential
from tensorflow.keras import metrics
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Read the ClinVar conflicting-classification CSV into a DataFrame

csv_path = '/content/drive/MyDrive/Data Science Projects/clinvar_conflicting.csv'
# Explicit dtypes: CHROM by name, plus two columns addressed by integer key
# (presumably the positions of mixed-type columns -- confirm against the CSV header)
column_dtypes = {'CHROM': str, 38: str, 40: object}
df = pd.read_csv(csv_path, dtype=column_dtypes)
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,...,SIFT,PolyPhen,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62
0,1,1168180,G,C,0.0771,0.1002,0.1066,MedGen:CN169374,,not_specified,...,tolerated,benign,,,,,,1.053,-0.208682,2.0
1,1,1470752,G,A,0.0,0.0,0.0,"MedGen:C1843891,OMIM:607454,Orphanet:ORPHA9877...",,Spinocerebellar_ataxia_21|not_provided,...,deleterious_low_confidence,benign,,,,,,31.0,6.517838,-3.0
2,1,1737942,A,G,0.0,1e-05,0.0,"Human_Phenotype_Ontology:HP:0000486,MedGen:C00...",,Strabismus|Nystagmus|Hypothyroidism|Intellectu...,...,deleterious,probably_damaging,,,,,,28.1,6.061752,-1.0
3,1,2160305,G,A,0.0,0.0,0.0,"MedGen:C1321551,OMIM:182212,SNOMED_CT:83092002...",,Shprintzen-Goldberg_syndrome|not_provided,...,,,,,,,,22.5,3.114491,
4,1,2160305,G,T,0.0,0.0,0.0,"MedGen:C1321551,OMIM:182212,SNOMED_CT:83092002",,Shprintzen-Goldberg_syndrome,...,,,,,,,,24.7,4.766224,-3.0


In [4]:
# Per-column non-null counts and dtypes for the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65188 entries, 0 to 65187
Data columns (total 46 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CHROM               65188 non-null  object 
 1   POS                 65188 non-null  int64  
 2   REF                 65188 non-null  object 
 3   ALT                 65188 non-null  object 
 4   AF_ESP              65188 non-null  float64
 5   AF_EXAC             65188 non-null  float64
 6   AF_TGP              65188 non-null  float64
 7   CLNDISDB            65188 non-null  object 
 8   CLNDISDBINCL        167 non-null    object 
 9   CLNDN               65188 non-null  object 
 10  CLNDNINCL           167 non-null    object 
 11  CLNHGVS             65188 non-null  object 
 12  CLNSIGINCL          167 non-null    object 
 13  CLNVC               65188 non-null  object 
 14  CLNVI               27659 non-null  object 
 15  MC                  64342 non-null  object 
 16  ORIG

In [5]:
def dataframe_statistics(DF):
  """Build a per-column summary of a dataframe.

  For every column of DF the summary records its dtype, the number of
  present and missing values, the fraction of present values (rounded to
  two decimals), the array of unique values and the count of distinct
  non-null values.

  Returns a dataframe indexed by column name ('Features'), ordered from
  the column with the most missing values to the one with the fewest.
  """

  # one list per output column, filled in lock-step while walking DF
  collected = {'Features': [], 'Dtype': [],
               '#_Values': [], '#_Missing_Values': [],
               '%_Data': [], 'Unique_Values': [],
               '#_Unique_Values': []}

  for name in DF.columns:
    series = DF[name]

    # row count, and how many of those rows hold a value
    total = len(series)
    n_missing = series.isna().sum()
    n_present = total - n_missing

    collected['Features'].append(name)
    collected['Dtype'].append(series.dtype)
    collected['#_Values'].append(n_present)
    collected['#_Missing_Values'].append(n_missing)
    collected['%_Data'].append(round(n_present / total, 2))
    # unique() keeps NaN as an entry; value_counts() leaves it out of the count
    collected['Unique_Values'].append(series.unique())
    collected['#_Unique_Values'].append(len(series.value_counts()))

  report = pd.DataFrame(collected).set_index('Features')

  return report.sort_values(by = '#_Missing_Values', ascending = False)

In [6]:
# Summarise the raw frame; rows come back sorted by missing-value count, highest first
df_summary = dataframe_statistics(df)
df_summary

Unnamed: 0_level_0,Dtype,#_Values,#_Missing_Values,%_Data,Unique_Values,#_Unique_Values
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MOTIF_SCORE_CHANGE,float64,2,65186,0.0,"[nan, -0.063, -0.097]",2
HIGH_INF_POS,object,2,65186,0.0,"[nan, N]",1
MOTIF_POS,float64,2,65186,0.0,"[nan, 1.0]",1
MOTIF_NAME,object,2,65186,0.0,"[nan, Egr1:MA0341.1, FOXA1:MA0546.1]",2
DISTANCE,float64,108,65080,0.0,"[nan, 1811.0, 1855.0, 2202.0, 1651.0, 1407.0, ...",96
SSR,float64,130,65058,0.0,"[nan, 1.0, 16.0]",2
CLNSIGINCL,object,167,65021,0.0,"[nan, 424754:Likely_pathogenic, 30118:risk_fac...",137
CLNDISDBINCL,object,167,65021,0.0,"[nan, MedGen:C1828210,OMIM:153870,Orphanet:ORP...",93
CLNDNINCL,object,167,65021,0.0,"[nan, Bull's_eye_maculopathy|Methylmalonic_aci...",101
INTRON,object,8803,56385,0.14,"[nan, 6/27, 8/17, 3/20, 24/24, 6/38, 16/38, 20...",1929


In [7]:
# Drop every row that is missing a value in any of these columns.
# One dropna call replaces five copy-pasted drop-by-index statements; because
# the frame carries a unique RangeIndex, the surviving rows are the same.
required_columns = ['Codons', 'CADD_RAW', 'MC', 'LoFtool', 'EXON']
df.dropna(subset=required_columns, inplace=True)

In [8]:
# Re-run the summary after the row drops to see which columns still have gaps
df_summary_1 = dataframe_statistics(df)
df_summary_1

Unnamed: 0_level_0,Dtype,#_Values,#_Missing_Values,%_Data,Unique_Values,#_Unique_Values
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
INTRON,object,0,50248,0.0,[nan],0
MOTIF_NAME,object,0,50248,0.0,[nan],0
DISTANCE,float64,0,50248,0.0,[nan],0
MOTIF_SCORE_CHANGE,float64,0,50248,0.0,[nan],0
HIGH_INF_POS,object,0,50248,0.0,[nan],0
MOTIF_POS,float64,0,50248,0.0,[nan],0
SSR,float64,95,50153,0.0,"[nan, 1.0, 16.0]",2
CLNDNINCL,object,143,50105,0.0,"[nan, Bull's_eye_maculopathy|Methylmalonic_aci...",86
CLNSIGINCL,object,143,50105,0.0,"[nan, 424754:Likely_pathogenic, 30118:risk_fac...",117
CLNDISDBINCL,object,143,50105,0.0,"[nan, MedGen:C1828210,OMIM:153870,Orphanet:ORP...",79


In [9]:
# Columns holding less than 100% of their data, i.e. at least one missing value.
# The threshold is "> 0": a "> 1" test would keep a column that has exactly one
# missing value.
columns_to_drop = list(df_summary_1[df_summary_1["#_Missing_Values"] > 0].index)

df.drop(columns = columns_to_drop, inplace = True)

# rebuild the summary so the remaining columns can be checked for gaps
summary = dataframe_statistics(df)
summary

Unnamed: 0_level_0,Dtype,#_Values,#_Missing_Values,%_Data,Unique_Values,#_Unique_Values
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CHROM,object,50248,0,1.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",23
IMPACT,object,50248,0,1.0,"[MODERATE, LOW, HIGH]",3
CADD_PHRED,float64,50248,0,1.0,"[0.172, 23.0, 11.36, 22.1, 26.1, 6.773, 22.2, ...",7699
LoFtool,float64,50248,0,1.0,"[0.101, 0.021, 0.0674, 0.183, 0.3, 0.372, 0.27...",1181
STRAND,float64,50248,0,1.0,"[1.0, -1.0]",2
Codons,object,50248,0,1.0,"[Tcg/Ccg, cCt/cTt, Gtg/Atg, aCg/aTg, Cgg/Tgg, ...",1285
Amino_acids,object,50248,0,1.0,"[S/P, P/L, V/M, T/M, R/W, A/V, R/H, R/Q, V/I, ...",581
Protein_position,object,50248,0,1.0,"[534, 634, 1102, 1225, 1192, 1110, 961, 959, 8...",6608
CDS_position,object,50248,0,1.0,"[1600, 1901, 3304, 3674, 3574, 3329, 2882, 287...",12261
cDNA_position,object,50248,0,1.0,"[1858, 2159, 3562, 3942, 3842, 3597, 3150, 314...",12453


### Prepare data for modeling

In [10]:
# Separate the target vector (CLASS) from the remaining feature columns
target_column = 'CLASS'

y = df[target_column]

X = df.drop(columns = target_column)

In [11]:
# Train/test split with a fixed random_state so the split is repeatable;
# no test_size is passed, so the library default split proportion applies
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [12]:
# Column selectors: choose columns by dtype
numerical_cols = make_column_selector(dtype_include='number')

category_cols = make_column_selector(dtype_include='object')

# instantiate scaler and one-hot encoder
scaler = StandardScaler()

# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on releases that do not know it yet.
try:
  one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
  one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# (transformer, columns) pairs handed to make_column_transformer
numeric_tuple = (scaler, numerical_cols)
category_tuple = (one_hot_encoder, category_cols)

# column transformer; columns matched by neither selector are passed through
preprocessor = make_column_transformer(numeric_tuple, category_tuple, remainder='passthrough')

In [None]:
# Fit the preprocessor on the training split only, then apply it to both splits
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)