In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os.path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

import time
import joblib
from sklearn.model_selection import cross_val_score

import get_metrics
import lazypredict
from lazypredict.Supervised import LazyClassifier

import sklearn
import xgboost
import lightgbm

from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from random import shuffle
from sklearn.metrics import classification_report, confusion_matrix
#from sklearn.metrics import plot_roc_curve

In [3]:
# Sequencing-run accessions analysed in this notebook. Each name selects the per-run
# input files read in the following cells, and the per-run lists built from it are
# zipped together later, so the order here must stay stable.
srr_names = [
    # ERR* runs
    'ERR1831349', 'ERR1831346', 'ERR1831347', 'ERR1831348', 'ERR1831350',
    'ERR1831351', 'ERR1831352', 'ERR1831353', 'ERR1905889', 'ERR1905890',
    # SRR* runs
    'SRR14724463', 'SRR14724473', 'SRR14724483', 'SRR14724493',
    'SRR14724503', 'SRR14724513', 'SRR2106342', 'SRR2106344',
    # DRR* runs
    'DRR189730', 'DRR189731', 'DRR189732',
]

In [4]:
# Load the pickled per-run SNV metrics DataFrames, one per accession in srr_names
# (list order follows srr_names).
# NOTE(review): the pickles are presumably written by an earlier step of this workflow;
# pd.read_pickle can execute arbitrary code, so only load files this project produced.
# TODO: downcasting numeric columns at load time would reduce memory, but only after the
# object-typed numeric columns have been cast (see the astype cell further down).
srr_snv_dfs = []
for srr_name in srr_names:
    pkl_path = f'results/outputs/pickle/{srr_name}_snv_metrics.pkl'
    # Append directly instead of binding a loop-local `df`, which would shadow the
    # notebook's main `df` defined later.
    srr_snv_dfs.append(pd.read_pickle(pkl_path))

In [5]:
# Build one GIAB SNV DataFrame per run from the VCFs under results/outputs/bedtools.
giab_snv_dfs = []
for srr_name in srr_names:
    giab_vcf = f'results/outputs/bedtools/{srr_name}_GIAB_variants_exome_intersection.vcf'
    # Index the parsed VCF by (CHROM, POS). set_index does not itself enforce uniqueness.
    giab_df = get_metrics.vcf_to_df(giab_vcf).set_index(["CHROM", "POS"])
    # Keep only SNVs: both get_metrics filters run on the same frame and their results
    # are stacked. NOTE(review): what snp_freq_not1 selects is not visible here - confirm
    # the two filters cannot return the same row twice.
    giab_snv_parts = [
        get_metrics.filter_for_snvs(giab_df),
        get_metrics.snp_freq_not1(giab_df),
    ]
    giab_snv_dfs.append(pd.concat(giab_snv_parts))

In [6]:
# Pair every run's SNV metrics frame with the GIAB frame at the same list position
# (both lists follow the order of srr_names) and merge each pair into one DataFrame.
joined_dfs = [
    get_metrics.join_datasets(srr_snv_df, giab_snv_df)
    for srr_snv_df, giab_snv_df in zip(srr_snv_dfs, giab_snv_dfs)
]

In [7]:
# Label artifacts for each run and report the number of loci per case.
#
# Case 1: GIAB and SRR carry the same variant at the same locus -> true variant (not artifact)
# Case 2: GIAB and SRR carry different variants at the same locus -> artifact
# Case 3: GIAB carries a variant, SRR has no variant at that locus
#    3.1: SRR has nothing at the locus (not artifact)
#    3.2: SRR has no variant because its reads match the reference genome
#         (marked "variant" in the original note - TODO clarify how this sub-case is labelled)
# Case 4: SRR carries a variant, GIAB has none at that locus -> artifact
#
# Each entry of artifact_cols holds one boolean per index:
# False means case 1, True means case 2 or 4.
# The "Case N: count" lines in the output come from extract_artifacts_column itself.
artifact_cols = []
for srr_name, joined_df in zip(srr_names, joined_dfs):
    print(f"Stats for {srr_name}")
    artifact_cols.append(get_metrics.extract_artifacts_column(joined_df))
    print()

Stats for ERR1831349
Case 1: 31052
Case 2: 1
Case 3: 1764
Case 4: 9381

Stats for ERR1831346
Case 1: 31218
Case 2: 0
Case 3: 1599
Case 4: 28157

Stats for ERR1831347
Case 1: 31163
Case 2: 0
Case 3: 1654
Case 4: 20005

Stats for ERR1831348
Case 1: 31183
Case 2: 0
Case 3: 1634
Case 4: 23751

Stats for ERR1831350
Case 1: 31150
Case 2: 0
Case 3: 1667
Case 4: 18249

Stats for ERR1831351
Case 1: 31159
Case 2: 2
Case 3: 1656
Case 4: 21227

Stats for ERR1831352
Case 1: 31200
Case 2: 1
Case 3: 1616
Case 4: 27328

Stats for ERR1831353
Case 1: 31167
Case 2: 1
Case 3: 1649
Case 4: 21628

Stats for ERR1905889
Case 1: 31204
Case 2: 1
Case 3: 1612
Case 4: 67932

Stats for ERR1905890
Case 1: 31227
Case 2: 3
Case 3: 1587
Case 4: 59447

Stats for SRR14724463
Case 1: 24232
Case 2: 0
Case 3: 1641
Case 4: 93208

Stats for SRR14724473
Case 1: 16130
Case 2: 0
Case 3: 1300
Case 4: 53557

Stats for SRR14724483
Case 1: 18053
Case 2: 0
Case 3: 1022
Case 4: 64341

Stats for SRR14724493
Case 1: 24105
Case 2: 1
Cas

In [8]:
# Assemble the per-run training frames: each run's SNV metrics joined (on the index)
# with its artifact flags, exposed as the IS_ARTIFACT column.
# REF and ALT are categorical, everything else is numerical. LSEQ and RSEQ are kept for
# now because they are still needed to check whether a C-A SNP is part of a CCG sequence.
dfs = [
    srr_snv_df.join(artifact_col.rename("IS_ARTIFACT"))
    for srr_snv_df, artifact_col in zip(srr_snv_dfs, artifact_cols)
]

In [9]:
# Concatenate the per-run dataframes (row-wise) to get some statistics about artifacts.
# NOTE(review): the run accession is not kept as a column or index level, so the same
# (CHROM, POS) coming from different runs becomes a duplicated index entry - confirm
# that nothing downstream assumes a unique index.
df = pd.concat(dfs)

In [10]:
# ['REF','ALT','AF','HIAF','HICNT','VD','SN','ADJAF','VARBIAS','LSEQ', 'RSEQ'] are objects.
# Convert the columns that are meant to be numerical to float or int (the dtypes are
# listed in the VCF file). REF and ALT stay as objects; VARBIAS, LSEQ and RSEQ are
# dropped in a later cell.
#
# Cast one column at a time rather than calling df.astype(cast_dtypes) on the whole
# frame: DataFrame.astype returns a copy of every column by default, which is costly on
# a multi-GB frame and is the likely cause of the kernel crashes seen here (astype
# itself is not deprecated).
cast_dtypes = {'AF':'float','HIAF':'float','HICNT':'int','VD':'int','SN':'float','ADJAF':'float'}
for column, target_dtype in cast_dtypes.items():
    df[column] = df[column].astype(target_dtype)

In [11]:
# Display the typed frame (pandas truncates it to the first/last rows and columns).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,REF,ALT,AF,HIAF,HICNT,VD,SN,ADJAF,VARBIAS,LSEQ,...,R_TTAA,R_GTAA,R_TATC,R_HOMO_POLY_A,R_HOMO_POLY_T,R_HOMO_POLY_G,R_HOMO_POLY_C,R_PALINDROME,R_HAIRPIN,IS_ARTIFACT
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,826893,G,A,1.00,1.00,40,40,80.00,0.05,6:34,GAAGAAGCCCCTGCCAAGGA,...,0,0,0,0,0,3,3,0,0,False
chr1,941119,A,G,1.00,1.00,8,8,16.00,0.38,6:2,GAGGGCGCATAGCCGGGGGG,...,0,0,0,0,0,0,5,0,0,False
chr1,944168,G,T,0.05,0.05,2,2,4.00,0.00,1:1,TGTTGAGGTTGCCGGGGGTA,...,0,0,0,3,0,4,0,4,0,True
chr1,944858,A,G,1.00,1.00,67,67,134.00,0.01,40:27,ATCAGGAAGAAGCCAGCCTT,...,0,0,0,0,0,0,0,4,0,False
chr1,946247,G,A,0.95,1.00,19,19,38.00,0.00,17:2,GTGGCTGTAGTACAAGGTCA,...,0,0,0,0,0,3,4,6,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,143629683,G,A,0.03,0.03,2,2,4.00,0.00,1:1,ATTGTTTAAGTACAGTAACT,...,0,0,0,3,3,0,0,8,0,True
chrX,143629724,T,A,0.04,0.04,2,2,4.00,0.00,1:1,AGGTGCCTGCTGAGATTTCC,...,0,0,0,3,0,0,0,4,0,True
chrX,143630393,T,A,0.03,0.03,2,2,4.00,0.00,1:1,TGGGAGTTTCACAGATAGCT,...,0,1,0,3,0,0,0,4,0,True
chrX,143631074,A,G,1.00,1.00,18,18,36.00,0.06,3:15,CTGCATTTGTCGAAGAAATC,...,0,0,0,4,0,3,0,0,5,True


In [12]:
# LSEQ and RSEQ are not needed for modeling any more, so remove them. VARBIAS is
# removed as well. NOTE(review): presumably because it holds "n:m" strings (e.g. 6:34)
# rather than a numeric feature - confirm.
df = df.drop(['LSEQ', 'RSEQ', 'VARBIAS'], axis=1)

In [13]:
# Display the frame after LSEQ, RSEQ and VARBIAS were dropped (truncated view).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,REF,ALT,AF,HIAF,HICNT,VD,SN,ADJAF,L_A,L_T,...,R_TTAA,R_GTAA,R_TATC,R_HOMO_POLY_A,R_HOMO_POLY_T,R_HOMO_POLY_G,R_HOMO_POLY_C,R_PALINDROME,R_HAIRPIN,IS_ARTIFACT
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,826893,G,A,1.00,1.00,40,40,80.00,0.05,35.00,5.00,...,0,0,0,0,0,3,3,0,0,False
chr1,941119,A,G,1.00,1.00,8,8,16.00,0.38,15.00,5.00,...,0,0,0,0,0,0,5,0,0,False
chr1,944168,G,T,0.05,0.05,2,2,4.00,0.00,10.00,30.00,...,0,0,0,3,0,4,0,4,0,True
chr1,944858,A,G,1.00,1.00,67,67,134.00,0.01,35.00,15.00,...,0,0,0,0,0,0,0,4,0,False
chr1,946247,G,A,0.95,1.00,19,19,38.00,0.00,25.00,25.00,...,0,0,0,0,0,3,4,6,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,143629683,G,A,0.03,0.03,2,2,4.00,0.00,35.00,40.00,...,0,0,0,3,3,0,0,8,0,True
chrX,143629724,T,A,0.04,0.04,2,2,4.00,0.00,15.00,30.00,...,0,0,0,3,0,0,0,4,0,True
chrX,143630393,T,A,0.03,0.03,2,2,4.00,0.00,25.00,30.00,...,0,1,0,3,0,0,0,4,0,True
chrX,143631074,A,G,1.00,1.00,18,18,36.00,0.06,30.00,30.00,...,0,0,0,4,0,3,0,0,5,True


In [None]:
# Checkpoint the frame while it still has separate L_* / R_* columns; it is reloaded
# with pd.read_pickle('ML_df.pkl') later in the notebook for the add-only variant.
df.to_pickle('ML_df.pkl')

# Try joining L and R features into 1 feature (ADD base content and k-mer counts; take the MAX for homopolymer, palindrome and hairpin sizes)

In [None]:
# Preview the first rows before the L_* / R_* columns are combined.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,REF,ALT,AF,HIAF,HICNT,VD,SN,ADJAF,L_A,L_T,...,R_TTAA,R_GTAA,R_TATC,R_HOMO_POLY_A,R_HOMO_POLY_T,R_HOMO_POLY_G,R_HOMO_POLY_C,R_PALINDROME,R_HAIRPIN,IS_ARTIFACT
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,826893,G,A,1.0,1.0,40,40,80.0,0.05,35.0,5.0,...,0,0,0,0,0,3,3,0,0,False
chr1,941119,A,G,1.0,1.0,8,8,16.0,0.38,15.0,5.0,...,0,0,0,0,0,0,5,0,0,False
chr1,944168,G,T,0.05,0.05,2,2,4.0,0.0,10.0,30.0,...,0,0,0,3,0,4,0,4,0,True
chr1,944858,A,G,1.0,1.0,67,67,134.0,0.01,35.0,15.0,...,0,0,0,0,0,0,0,4,0,False
chr1,946247,G,A,0.95,1.0,19,19,38.0,0.0,25.0,25.0,...,0,0,0,0,0,3,4,6,0,False


In [None]:
# Baseline column count, dtypes and memory usage before combining L_* / R_* features.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 541 entries, REF to IS_ARTIFACT
dtypes: bool(1), float64(12), int64(526), object(2)
memory usage: 7.4+ GB


In [None]:
# Combined content of each base: the L_<base> column plus the R_<base> column.
for base in ('A', 'T', 'C', 'G'):
    df[base] = df[f'L_{base}'].add(df[f'R_{base}'])

In [None]:
df.info() # +4 columns expected: A, T, C, G

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 545 entries, REF to G
dtypes: bool(1), float64(16), int64(526), object(2)
memory usage: 7.5+ GB


In [None]:
# Largest homopolymer size per base: row-wise maximum of the L_ and R_ columns.
for base in ('A', 'T', 'C', 'G'):
    side_cols = [f'L_HOMO_POLY_{base}', f'R_HOMO_POLY_{base}']
    df[f'HOMO_POLY_{base}'] = df[side_cols].max(axis=1)

In [None]:
df.info() # +4 columns expected: HOMO_POLY_A, HOMO_POLY_T, HOMO_POLY_C, HOMO_POLY_G

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 549 entries, REF to HOMO_POLY_G
dtypes: bool(1), float64(16), int64(530), object(2)
memory usage: 7.5+ GB


In [None]:
# Largest palindrome size: row-wise maximum of the L_ and R_ columns.
palindrome_side_cols = ['L_PALINDROME', 'R_PALINDROME']
df['PALINDROME'] = df[palindrome_side_cols].max(axis=1)

In [10]:
df.info() # +1 column expected: PALINDROME

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 550 entries, REF to PALINDROME
dtypes: bool(1), float64(16), int64(531), object(2)
memory usage: 7.5+ GB


In [11]:
# Largest hairpin loop size: row-wise maximum of the L_ and R_ columns.
hairpin_side_cols = ['L_HAIRPIN', 'R_HAIRPIN']
df['HAIRPIN'] = df[hairpin_side_cols].max(axis=1)

In [12]:
df.info() # +1 column expected: HAIRPIN

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 551 entries, REF to HAIRPIN
dtypes: bool(1), float64(16), int64(532), object(2)
memory usage: 7.5+ GB


In [13]:
# Combined count of every 4-mer: L_<kmer> plus R_<kmer>.
nt = ['A', 'T', 'C', 'G']
# All 4**4 = 256 four-letter combinations, first letter varying slowest:
# ['AAAA', 'AAAT', 'AAAC', ...]
kmers = [first + second + third + fourth
         for first in nt
         for second in nt
         for third in nt
         for fourth in nt]

# Columns are added one at a time, which keeps peak memory low on this very large frame.
for kmer in kmers:
    df[kmer] = df[f'L_{kmer}'].add(df[f'R_{kmer}'])

In [14]:
df.info() # +256 columns expected: one combined column per 4-mer

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 807 entries, REF to GGGG
dtypes: bool(1), float64(16), int64(788), object(2)
memory usage: 11.0+ GB


In [15]:
# Preview the frame now that the combined columns (ending with the 4-mers) are present.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,REF,ALT,AF,HIAF,HICNT,VD,SN,ADJAF,L_A,L_T,...,GGTC,GGTG,GGCA,GGCT,GGCC,GGCG,GGGA,GGGT,GGGC,GGGG
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,826893,G,A,1.0,1.0,40,40,80.0,0.05,35.0,5.0,...,0,0,0,0,0,0,1,0,0,0
chr1,941119,A,G,1.0,1.0,8,8,16.0,0.38,15.0,5.0,...,0,0,0,0,0,1,0,0,1,3
chr1,944168,G,T,0.05,0.05,2,2,4.0,0.0,10.0,30.0,...,0,1,0,0,1,0,0,2,1,3
chr1,944858,A,G,1.0,1.0,67,67,134.0,0.01,35.0,15.0,...,0,0,0,0,0,0,0,0,0,0
chr1,946247,G,A,0.95,1.0,19,19,38.0,0.0,25.0,25.0,...,1,1,0,1,0,0,0,1,0,0


In [18]:
# Remove all the L and R features since we have combined features now

# Remove L and R content of each base.
# All eight columns are dropped in one call: every DataFrame.drop builds a new frame,
# so dropping inside a loop repeats that work per base on a multi-GB frame.
# (`axis=1` is omitted because it is ignored when `columns=` is given.)
base_side_cols = [side + base for base in ['A', 'T', 'C', 'G'] for side in ('L_', 'R_')]
df.drop(columns=base_side_cols, inplace=True)

In [19]:
df.info() # -8 columns expected: L_/R_ content of A, T, C, G

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 799 entries, REF to GGGG
dtypes: bool(1), float64(8), int64(788), object(2)
memory usage: 10.9+ GB


In [20]:
# Remove L and R homopoly size of each base.
# One drop call for all eight columns avoids rebuilding the large frame once per base.
# (`axis=1` is omitted because it is ignored when `columns=` is given.)
homopoly_side_cols = [
    side + 'HOMO_POLY_' + base
    for base in ['A', 'T', 'C', 'G']
    for side in ('L_', 'R_')
]
df.drop(columns=homopoly_side_cols, inplace=True)

In [21]:
df.info() # -8 columns expected: L_/R_ HOMO_POLY_ of A, T, C, G

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 791 entries, REF to GGGG
dtypes: bool(1), float64(8), int64(780), object(2)
memory usage: 10.8+ GB


In [22]:
# The per-side palindrome sizes are superseded by the combined PALINDROME column.
palindrome_side_cols = ['L_PALINDROME', 'R_PALINDROME']
df.drop(columns=palindrome_side_cols, inplace=True)

In [23]:
df.info() # -2 columns expected: L_PALINDROME, R_PALINDROME

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 789 entries, REF to GGGG
dtypes: bool(1), float64(8), int64(778), object(2)
memory usage: 10.8+ GB


In [24]:
# The per-side hairpin loop sizes are superseded by the combined HAIRPIN column.
hairpin_side_cols = ['L_HAIRPIN', 'R_HAIRPIN']
df.drop(columns=hairpin_side_cols, inplace=True)

In [25]:
df.info() # -2 columns expected: L_HAIRPIN, R_HAIRPIN

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 787 entries, REF to GGGG
dtypes: bool(1), float64(8), int64(776), object(2)
memory usage: 10.8+ GB


In [26]:
# Remove L and R of each kmer type.
# Collect all 512 per-side k-mer columns and drop them in a single call: each
# DataFrame.drop builds a new frame, so dropping inside the loop rebuilt this ~11 GB
# frame 256 times. (`axis=1` is omitted because it is ignored when `columns=` is given.)
kmer_side_cols = [side + kmer for kmer in kmers for side in ('L_', 'R_')]
df.drop(columns=kmer_side_cols, inplace=True)

In [27]:
df.info() # -512 columns expected: L_/R_ columns of all 256 4-mers

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1834281 entries, ('chr1', '826893') to ('chrX', '143712338')
Columns: 275 entries, REF to GGGG
dtypes: bool(1), float64(8), int64(264), object(2)
memory usage: 3.8+ GB


In [29]:
# Save the variant in which each L_/R_ pair was merged into one column
# (sum for base content and k-mer columns, max for the size columns).
df.to_pickle('ML_df_joined_LR_feats.pkl') 

# Try joining L and R features into 1 feature BY ADDING

In [21]:
# Start over from the checkpoint written by df.to_pickle('ML_df.pkl'), i.e. the frame
# that still has separate L_* / R_* columns.
# NOTE(review): read_pickle can execute arbitrary code; only load files this project wrote.
df = pd.read_pickle('ML_df.pkl')

In [22]:
# Check that the reloaded frame still has the separate L_* / R_* columns.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,REF,ALT,AF,HIAF,HICNT,VD,SN,ADJAF,L_A,L_T,...,R_TTAA,R_GTAA,R_TATC,R_HOMO_POLY_A,R_HOMO_POLY_T,R_HOMO_POLY_G,R_HOMO_POLY_C,R_PALINDROME,R_HAIRPIN,IS_ARTIFACT
CHROM,POS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,826893,G,A,1.0,1.0,40,40,80.0,0.05,35.0,5.0,...,0,0,0,0,0,3,3,0,0,False
chr1,941119,A,G,1.0,1.0,8,8,16.0,0.38,15.0,5.0,...,0,0,0,0,0,0,5,0,0,False
chr1,944168,G,T,0.05,0.05,2,2,4.0,0.0,10.0,30.0,...,0,0,0,3,0,4,0,4,0,True
chr1,944858,A,G,1.0,1.0,67,67,134.0,0.01,35.0,15.0,...,0,0,0,0,0,0,0,4,0,False
chr1,946247,G,A,0.95,1.0,19,19,38.0,0.0,25.0,25.0,...,0,0,0,0,0,3,4,6,0,False


In [23]:
# Combined content of each base: the L_<base> column plus the R_<base> column.
for base in ('A', 'T', 'C', 'G'):
    df[base] = df[f'L_{base}'].add(df[f'R_{base}'])

In [24]:
# Combined homopolymer size per base: L_ value plus R_ value (this variant sums
# instead of taking the maximum).
for base in ('A', 'T', 'C', 'G'):
    left_size = df[f'L_HOMO_POLY_{base}']
    right_size = df[f'R_HOMO_POLY_{base}']
    df[f'HOMO_POLY_{base}'] = left_size + right_size

In [30]:
# Add palindrome size: L_ value plus R_ value (a sum, not a maximum, in this variant).
df['PALINDROME'] = df['L_PALINDROME'].add(df['R_PALINDROME'])

In [26]:
# Add hairpin loop size: L_ value plus R_ value.
df['HAIRPIN'] = df['L_HAIRPIN'].add(df['R_HAIRPIN'])

In [27]:
# Combined count of every 4-mer: L_<kmer> plus R_<kmer>.
nt = ['A', 'T', 'C', 'G']
# All 256 four-letter combinations, first letter varying slowest:
# ['AAAA', 'AAAT', 'AAAC', ...]
kmers = [first + second + third + fourth
         for first in nt
         for second in nt
         for third in nt
         for fourth in nt]

# One column per iteration keeps peak memory low on this very large frame.
for kmer in kmers:
    df[kmer] = df[f'L_{kmer}'].add(df[f'R_{kmer}'])

In [32]:
# Remove all the L and R features since we have combined features now

# Remove L and R content of each base.
# All eight columns are dropped in one call: every DataFrame.drop builds a new frame,
# so dropping inside a loop repeats that work per base on a multi-GB frame.
# (`axis=1` is omitted because it is ignored when `columns=` is given.)
base_side_cols = [side + base for base in ['A', 'T', 'C', 'G'] for side in ('L_', 'R_')]
df.drop(columns=base_side_cols, inplace=True)

In [33]:
# Remove L and R homopoly size of each base.
# One drop call for all eight columns avoids rebuilding the large frame once per base.
# (`axis=1` is omitted because it is ignored when `columns=` is given.)
homopoly_side_cols = [
    side + 'HOMO_POLY_' + base
    for base in ['A', 'T', 'C', 'G']
    for side in ('L_', 'R_')
]
df.drop(columns=homopoly_side_cols, inplace=True)

In [34]:
# The per-side palindrome sizes are superseded by the combined PALINDROME column.
palindrome_side_cols = ['L_PALINDROME', 'R_PALINDROME']
df.drop(columns=palindrome_side_cols, inplace=True)

In [35]:
# The per-side hairpin loop sizes are superseded by the combined HAIRPIN column.
hairpin_side_cols = ['L_HAIRPIN', 'R_HAIRPIN']
df.drop(columns=hairpin_side_cols, inplace=True)

In [None]:
# Remove L and R of each kmer type.
# Collect all 512 per-side k-mer columns and drop them in a single call: each
# DataFrame.drop builds a new frame, so dropping inside the loop rebuilt the whole
# multi-GB frame 256 times. (`axis=1` is omitted because it is ignored when `columns=`
# is given.)
kmer_side_cols = [side + kmer for kmer in kmers for side in ('L_', 'R_')]
df.drop(columns=kmer_side_cols, inplace=True)

In [None]:
df.info() # 275 columns expected, the same count the max-combined frame ended with

In [None]:
# Save the variant in which every L_/R_ pair was merged by addition.
df.to_pickle('ML_df_add_LR_feats.pkl') 