# Supplementary File 4: Encoding and Imputing
(1) encoding the LoFtool dataset 
(2) dealing with missing values 

Input: clinvar_loftool.csv
Output: loftool_encoded.csv, then, df_loftool.csv

References to learn Target Encoding
[1] https://www.youtube.com/watch?v=Qp9L2dmFxVY
[2] https://www.youtube.com/watch?v=nd7vc4MZQz4  
[3] https://stackoverflow.com/questions/48817592/how-to-drop-dataframe-columns-based-on-dtype

In [2]:
import pandas as pd 
import numpy as np 
import category_encoders as ce
from category_encoders import TargetEncoder

#read in the file
df = pd.read_csv('clinvar_loftool.csv') 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39693 entries, 0 to 39692
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CHROM             39693 non-null  object 
 1   POS               39693 non-null  int64  
 2   REF               39693 non-null  object 
 3   ALT               39693 non-null  object 
 4   AF_ESP            39693 non-null  float64
 5   AF_EXAC           39693 non-null  float64
 6   AF_TGP            39693 non-null  float64
 7   MC                39681 non-null  object 
 8   IMPACT            39693 non-null  object 
 9   SYMBOL            39693 non-null  object 
 10  Feature           39693 non-null  object 
 11  EXON              39693 non-null  object 
 12  cDNA_position     39693 non-null  float64
 13  CDS_position      39693 non-null  float64
 14  Protein_position  39693 non-null  int64  
 15  Amino_acids       39693 non-null  object 
 16  Codons            39693 non-null  object

In [4]:
#number of categories
categories_symbol = df.SYMBOL.value_counts()
print(categories_symbol)

categories_exon = df.EXON.value_counts()
print(categories_exon)

SYMBOL
TTN        1645
ATM        1381
BRCA2      1136
APC         938
MSH6        800
           ... 
HOXD10        1
RPS6KA3       1
WIPF1         1
MBTPS2        1
CLIC2         1
Name: count, Length: 2069, dtype: int64
EXON
16/16    874
4/10     600
11/27    557
3/3      372
10/24    369
        ... 
19/43      1
36/47      1
19/47      1
2/45       1
30/53      1
Name: count, Length: 2983, dtype: int64


In [6]:
#dealing with missing values
#delete the null values for LoFtool since machine learning models need input and output
df = df.dropna(subset = 'LoFtool')
df.info()
df.to_csv('loftool_full.csv', index = False)

<class 'pandas.core.frame.DataFrame'>
Index: 37220 entries, 5 to 39692
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CHROM             37220 non-null  object 
 1   POS               37220 non-null  int64  
 2   REF               37220 non-null  object 
 3   ALT               37220 non-null  object 
 4   AF_ESP            37220 non-null  float64
 5   AF_EXAC           37220 non-null  float64
 6   AF_TGP            37220 non-null  float64
 7   MC                37210 non-null  object 
 8   IMPACT            37220 non-null  object 
 9   SYMBOL            37220 non-null  object 
 10  Feature           37220 non-null  object 
 11  EXON              37220 non-null  object 
 12  cDNA_position     37220 non-null  float64
 13  CDS_position      37220 non-null  float64
 14  Protein_position  37220 non-null  int64  
 15  Amino_acids       37220 non-null  object 
 16  Codons            37220 non-null  object 
 17

In [7]:
#input taken from claude.ai and chatgpt 

import pandas as pd 
import numpy as np 
import category_encoders as ce
from category_encoders import TargetEncoder

categorical_cols = ['CHROM', 'REF', 'ALT', 'MC', 'IMPACT', 'SYMBOL', 'Feature', 'EXON', 'Amino_acids', 'Codons']
transformed_cols = []
target = 'LoFtool'
for col in categorical_cols:
    encoder = TargetEncoder(cols = col, smoothing = 10)
    encoder.fit(X=df[col], y=df[target])
    transformed_col = encoder.transform(df[col])
    transformed_col.name = col + '_encoded'
    print(transformed_col)
    transformed_cols.append(transformed_col) 
transformed_df = pd.concat(transformed_cols, axis = 1)
encoded_df = pd.merge(df, transformed_df, left_index=True, right_index=True)
encoded_df.info()

encoded_df = encoded_df.select_dtypes(exclude=['object'])
print("after i take out object columns:")
encoded_df.info()

for col in encoded_df.columns:
    encoded_df = encoded_df.rename(columns={col: col.replace('_y', '')})
encoded_df.info()

encoded_df.to_csv('df_loftool.csv', index = False)

          CHROM
5      0.275599
6      0.275599
7      0.275599
8      0.275599
9      0.275599
...         ...
39684  0.146512
39688  0.146512
39689  0.146512
39690  0.146512
39692  0.146512

[37220 rows x 1 columns]
            REF
5      0.406165
6      0.332054
7      0.335588
8      0.332054
9      0.332054
...         ...
39684  0.332054
39688  0.332054
39689  0.406165
39690  0.332054
39692  0.335588

[37220 rows x 1 columns]
            ALT
5      0.387783
6      0.334983
7      0.335324
8      0.334983
9      0.334983
...         ...
39684  0.334983
39688  0.334983
39689  0.363932
39690  0.334983
39692  0.387783

[37220 rows x 1 columns]
             MC
5      0.361955
6      0.361955
7      0.361955
8      0.361955
9      0.361955
...         ...
39684  0.361955
39688  0.323842
39689  0.323842
39690  0.323842
39692  0.361955

[37220 rows x 1 columns]
         IMPACT
5      0.362895
6      0.362895
7      0.362895
8      0.362895
9      0.362895
...         ...
39684  0.362895
