In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline


Let's open our sequencing metrics for our runs over several month long period. Our goal will be to clean up the data as best we can and save a new csv we can work with.

In [50]:
seq = pd.read_csv('Export_Runs_2022-06-01_10_06_23.csv',sep=',',low_memory=False)
seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5839 entries, 0 to 5838
Columns: 2761 entries, Run Name to Basecaller Sbx
dtypes: bool(5), float64(2591), int64(4), object(161)
memory usage: 122.8+ MB


In [40]:
seq.head()


Unnamed: 0,Run Name,Run Status,Type of Subsampling,Station,Analysis Start Date/Time,Run Group,User Name,Token,Master Mix,Complex,...,Percentile90 Mt Align Procession Length Align Hqmt,Percentile90 Mt Align Procession Length Good Mt,Percentile90 Mt Align Procession Length Qc Align Hqmt,Percentile90 Mt Align Procession Length Qc Mt,Percentile90 Mt Align Procession Length Unthreaded At Start Or End Of Bright Cycle Align Hqmt,Percentile90 Mt Base Call Read Length Unthreaded At Start Or End Of Bright Cycle Align Hqmt,Num Full Length Mt,Num Full Length Mt Cells,Basecaller Sxb,Basecaller Sbx
0,220510_SEA-HTP_03_fm-034_WWY13R06C05_cycle01,Done,ACAP full,fm-034,2022-05-12 06:35:04,SEA-HTP,Svetlana Kritzer,220510_066,,,...,,,,,,,,,,
1,220510_SEA-HTP_04_fm-035_WWY19R03C04_cycle01,Done,ACAP full,fm-035,2022-05-12 06:34:33,SEA-HTP,Svetlana Kritzer,220510_068,,,...,,,,,,,,,,
2,220510_SEA-HTP_03_fm-028_WWY13R04C08_cycle01,Done,ACAP full,fm-028,2022-05-12 06:34:02,SEA-HTP,Svetlana Kritzer,220510_067,,,...,,,,,,,,,,
3,220510_SEA-HTP_02_fm-027_WWY13R05C02_cycle01,Done,ACAP full,fm-027,2022-05-12 06:33:02,SEA-HTP,Svetlana Kritzer,220510_065,,,...,,,,,,,,,,
4,220511_CDI-HTP_02_fm-004_WWV21R01C05_cycle01,Done,ACAP full,fm-004,2022-05-12 05:40:07,CDI-HTP,Diego Moreno Tezanos,220511_018,,,...,,,,,,,,,,


In [51]:
#drop all NaN columns
seq.replace("", float("NaN"), inplace=True)
seq.dropna(how='all', axis=1, inplace=True)

In [63]:
#drop columns that have only one unique value
nunique = seq.nunique()
cols_to_drop = nunique[nunique == 1].index
seq = seq.drop(cols_to_drop, axis=1)
seq.head()


Unnamed: 0,Run Name,Type of Subsampling,Station,Analysis Start Date/Time,Run Group,User Name,Token,Template,Num Total Cells,Pct Cells Sampled,...,Sampling Rate,Single Pore Sbx State,Acap Parameters Hash,Avg Mt Base 1 2 Distance Align Hqmt,Avg Mt Base 2 3 Distance Align Hqmt,Avg Mt Base 3 4 Distance Align Hqmt,Avg Mt Hist 1 2 Distance Align Hqmt,Avg Mt Hist 2 3 Distance Align Hqmt,Avg Mt Hist 3 4 Distance Align Hqmt,Num Proto Pores
0,220510_SEA-HTP_03_fm-034_WWY13R06C05_cycle01,ACAP full,fm-034,2022-05-12 06:35:04,SEA-HTP,Svetlana Kritzer,220510_066,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
1,220510_SEA-HTP_04_fm-035_WWY19R03C04_cycle01,ACAP full,fm-035,2022-05-12 06:34:33,SEA-HTP,Svetlana Kritzer,220510_068,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
2,220510_SEA-HTP_03_fm-028_WWY13R04C08_cycle01,ACAP full,fm-028,2022-05-12 06:34:02,SEA-HTP,Svetlana Kritzer,220510_067,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
3,220510_SEA-HTP_02_fm-027_WWY13R05C02_cycle01,ACAP full,fm-027,2022-05-12 06:33:02,SEA-HTP,Svetlana Kritzer,220510_065,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
4,220511_CDI-HTP_02_fm-004_WWV21R01C05_cycle01,ACAP full,fm-004,2022-05-12 05:40:07,CDI-HTP,Diego Moreno Tezanos,220511_018,Strep,2097152.0,0.5,...,,,,,,,,,,


In [66]:
#drop columns that are outdated
cols_to_drop = [x for x in seq.columns if 'deprecate' in x]
seq = seq.drop(cols_to_drop, axis=1)
seq.head()


Unnamed: 0,Run Name,Type of Subsampling,Station,Analysis Start Date/Time,Run Group,User Name,Token,Template,Num Total Cells,Pct Cells Sampled,...,Sampling Rate,Single Pore Sbx State,Acap Parameters Hash,Avg Mt Base 1 2 Distance Align Hqmt,Avg Mt Base 2 3 Distance Align Hqmt,Avg Mt Base 3 4 Distance Align Hqmt,Avg Mt Hist 1 2 Distance Align Hqmt,Avg Mt Hist 2 3 Distance Align Hqmt,Avg Mt Hist 3 4 Distance Align Hqmt,Num Proto Pores
0,220510_SEA-HTP_03_fm-034_WWY13R06C05_cycle01,ACAP full,fm-034,2022-05-12 06:35:04,SEA-HTP,Svetlana Kritzer,220510_066,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
1,220510_SEA-HTP_04_fm-035_WWY19R03C04_cycle01,ACAP full,fm-035,2022-05-12 06:34:33,SEA-HTP,Svetlana Kritzer,220510_068,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
2,220510_SEA-HTP_03_fm-028_WWY13R04C08_cycle01,ACAP full,fm-028,2022-05-12 06:34:02,SEA-HTP,Svetlana Kritzer,220510_067,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
3,220510_SEA-HTP_02_fm-027_WWY13R05C02_cycle01,ACAP full,fm-027,2022-05-12 06:33:02,SEA-HTP,Svetlana Kritzer,220510_065,HG001-maternal-v1,2097152.0,0.5,...,,,,,,,,,,
4,220511_CDI-HTP_02_fm-004_WWV21R01C05_cycle01,ACAP full,fm-004,2022-05-12 05:40:07,CDI-HTP,Diego Moreno Tezanos,220511_018,Strep,2097152.0,0.5,...,,,,,,,,,,


In [2]:
seq.to_csv('filtered_seq_data.csv')

NameError: name 'seq' is not defined

In [3]:
seq = pd.read_csv('filtered_seq_data.csv',low_memory=False)
seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5839 entries, 0 to 5838
Columns: 1080 entries, Unnamed: 0 to Num Proto Pores
dtypes: float64(935), int64(5), object(140)
memory usage: 48.1+ MB


In [93]:
seq.head()

Unnamed: 0.1,Unnamed: 0,Run Name,Type of Subsampling,Station,Analysis Start Date/Time,Run Group,User Name,Token,Template,Num Total Cells,...,Sampling Rate,Single Pore Sbx State,Acap Parameters Hash,Avg Mt Base 1 2 Distance Align Hqmt,Avg Mt Base 2 3 Distance Align Hqmt,Avg Mt Base 3 4 Distance Align Hqmt,Avg Mt Hist 1 2 Distance Align Hqmt,Avg Mt Hist 2 3 Distance Align Hqmt,Avg Mt Hist 3 4 Distance Align Hqmt,Num Proto Pores
0,0,220510_SEA-HTP_03_fm-034_WWY13R06C05_cycle01,ACAP full,fm-034,2022-05-12 06:35:04,SEA-HTP,Svetlana Kritzer,220510_066,HG001-maternal-v1,2097152.0,...,,,,,,,,,,
1,1,220510_SEA-HTP_04_fm-035_WWY19R03C04_cycle01,ACAP full,fm-035,2022-05-12 06:34:33,SEA-HTP,Svetlana Kritzer,220510_068,HG001-maternal-v1,2097152.0,...,,,,,,,,,,
2,2,220510_SEA-HTP_03_fm-028_WWY13R04C08_cycle01,ACAP full,fm-028,2022-05-12 06:34:02,SEA-HTP,Svetlana Kritzer,220510_067,HG001-maternal-v1,2097152.0,...,,,,,,,,,,
3,3,220510_SEA-HTP_02_fm-027_WWY13R05C02_cycle01,ACAP full,fm-027,2022-05-12 06:33:02,SEA-HTP,Svetlana Kritzer,220510_065,HG001-maternal-v1,2097152.0,...,,,,,,,,,,
4,4,220511_CDI-HTP_02_fm-004_WWV21R01C05_cycle01,ACAP full,fm-004,2022-05-12 05:40:07,CDI-HTP,Diego Moreno Tezanos,220511_018,Strep,2097152.0,...,,,,,,,,,,


In [3]:
#grab dataframe of data with no nan
seq_fill=seq.fillna(seq.mean()) 
seq_fill.head()
seq_nonan=seq_fill[seq_fill.columns[~seq_fill.isna().any()]]

#normalize 
scaler = MinMaxScaler() 
seq_num=seq_nonan.select_dtypes(np.number)
arr_scaled = scaler.fit_transform(seq_num) 
seq_num = pd.DataFrame(arr_scaled, columns=seq_num.columns,index=seq_num.index)


  seq_fill=seq.fillna(seq.mean())


In [11]:
#saving the names of variables having variance more than a threshold value
variance=seq_num.var()
columns =seq_num.columns
variable = [ ]

for i in range(0,len(variance)):
    if variance[i]>=0.006: #setting the threshold as 1%
        variable.append(columns[i])


In [12]:
#add back in string features
string_col = seq_nonan.select_dtypes(exclude=np.number).columns
for i in string_col:
    variable.append(i)

In [16]:
#save cleaned dataset
seq_nonan[variable].to_csv('clean_seq_data.csv')

In [4]:
seq = pd.read_csv('clean_seq_data.csv',low_memory=False)
seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5839 entries, 0 to 5838
Columns: 425 entries, Unnamed: 0 to Is Single Point Experiment Sbx
dtypes: float64(383), int64(5), object(37)
memory usage: 18.9+ MB


In [5]:
cols_to_drop = [x for x in seq.columns if 'Unnamed' in x]
seq = seq.drop(cols_to_drop, axis=1)
seq.head()

Unnamed: 0,Pct Cells Sampled,Num Cells Processed,Median Sequencing Lifetime,Total UBF Size,P Folder Number,GPU Processing Used,Bilayers (3.0+),Activethinning Num Cells,Electroporation Num Cells,Hz,...,Created At,Labcodes Version,SBX Pore Stock Name,SBX Pore Name,Tag Order,Target Reference Genome,Run Type,Waveform Params File,Reagent Params File,Is Single Point Experiment Sbx
0,0.5,20971.0,3907.726637,339226300000.0,1.0,0.021526,1796536.0,20971.0,20971.0,0.192338,...,2022-05-11 23:35:21,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.0555555555555555
1,0.5,20971.0,3907.726637,339226300000.0,1.0,0.021526,4.0,20971.0,20971.0,0.192338,...,2022-05-11 23:34:50,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.0555555555555555
2,0.5,20971.0,3907.726637,339226300000.0,1.0,0.021526,5.0,20971.0,20971.0,0.192338,...,2022-05-11 23:34:15,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.0555555555555555
3,0.5,20971.0,3907.726637,339226300000.0,1.0,0.0,1930847.0,20971.0,20971.0,0.192338,...,2022-05-11 23:33:09,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.0555555555555555
4,0.5,20971.0,3907.726637,33341860000.0,0.0,0.021526,1899953.0,20971.0,20971.0,0.307117,...,2022-05-11 22:42:40,6.0.3,210824_PORE_03,P-0445,CTAG,not_genomic,Both,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.0555555555555555


In [21]:
seq.columns

Index(['Pct Cells Sampled', 'Num Cells Processed',
       'Median Sequencing Lifetime', 'Total UBF Size', 'P Folder Number',
       'GPU Processing Used', 'Bilayers (3.0+)', 'Activethinning Num Cells',
       'Electroporation Num Cells', 'Hz',
       ...
       'Created At', 'Labcodes Version', 'SBX Pore Stock Name',
       'SBX Pore Name', 'Tag Order', 'Target Reference Genome', 'Run Type',
       'Waveform Params File', 'Reagent Params File',
       'Is Single Point Experiment Sbx'],
      dtype='object', length=423)

Using domain knownledge, we define 3 metrics used to predict sequencing yield. These are, High quality Mts (number of cells sequencing), Read Length and Accuracy (by percent identical). 

In [41]:
y_col=['Num Align Hqmt','Mode Mt Base Call Read Length Align Hqmt', 'Avg Mt Align Edit Percent Identical']
y=seq[y_col]
scaler = MinMaxScaler() 
arr_scaled = scaler.fit_transform(y) 
y = pd.DataFrame(arr_scaled, columns=y.columns,index=y.index)

In [32]:
#normalize numerical features
X= seq.drop(y_col, axis=1)
scaler = MinMaxScaler() 
X_num=X.select_dtypes(np.number)
arr_scaled = scaler.fit_transform(X_num) 
X_num = pd.DataFrame(arr_scaled, columns=X_num.columns,index=X_num.index)
X_num.head()

Unnamed: 0,Pct Cells Sampled,Num Cells Processed,Median Sequencing Lifetime,Total UBF Size,P Folder Number,GPU Processing Used,Bilayers (3.0+),Activethinning Num Cells,Electroporation Num Cells,Hz,...,Avg Mt Qc Align Edit Percent Identical Full Read Bed Include Filename Is Good Mt,Avg Mt Qc Align Edit Percent Identical Full Read Bed Include Filename Is Qc Mt,Avg Mt Base Call Read Length Bed Exclude Filename Is Qc Align Hqmt,Avg Mt Base Call Read Length Bed Include Filename Is Qc Align Hqmt,Avg Mt Base Call Read Length Bed Low Coverage Filename Is Qc Align Hqmt,Total Cycle Time (sec),Avg Mt Base Call Read Length Bed Other Coverage Filename Is Align Hqmt,Frac Oc Bump,Frac Oc Med Filt Bump,Avg Mt Qc Bed Gtoa Mismatches Highq Rate
0,0.002506,0.080002,0.53943,0.038936,0.000883,0.021526,0.2402763,0.079914,0.079914,0.007602,...,0.946363,0.935756,0.596296,0.560295,0.591228,0.347073,0.642192,0.081276,0.11825,0.11399
1,0.002506,0.080002,0.53943,0.038936,0.000883,0.021526,6.687208e-07,0.079914,0.079914,0.007602,...,0.946363,0.935756,0.596296,0.560295,0.591228,0.347471,0.642192,0.081276,0.11825,0.11608
2,0.002506,0.080002,0.53943,0.038936,0.000883,0.021526,8.02465e-07,0.079914,0.079914,0.007602,...,0.946363,0.935756,0.596296,0.560295,0.591228,0.346569,0.642192,0.081276,0.11825,0.110262
3,0.002506,0.080002,0.53943,0.038936,0.000883,0.0,0.2582396,0.079914,0.079914,0.007602,...,0.946363,0.935756,0.596296,0.560295,0.591228,0.334814,0.642192,0.081276,0.11825,0.108398
4,0.002506,0.080002,0.53943,0.003752,0.0,0.021526,0.2541078,0.079914,0.079914,0.085803,...,0.946363,0.935756,0.596296,0.560295,0.591228,0.220495,0.642192,0.000947,0.019888,0.271412


In [56]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_selection import f_classif

In [43]:
corr_matrix = X_num.corr()
print(corr_matrix)

                                                    Pct Cells Sampled  \
Pct Cells Sampled                                            1.000000   
Num Cells Processed                                         -0.042991   
Median Sequencing Lifetime                                   0.009612   
Total UBF Size                                               0.741245   
P Folder Number                                              0.039378   
...                                                               ...   
Total Cycle Time (sec)                                       0.146141   
Avg  Mt Base Call Read Length  Bed Other Covera...           0.006463   
Frac Oc Bump                                                -0.008740   
Frac Oc Med Filt Bump                                       -0.021748   
Avg Mt Qc Bed Gtoa Mismatches Highq Rate                     0.006169   

                                                    Num Cells Processed  \
Pct Cells Sampled                               

In [46]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
print(to_drop)

['Activethinning Num Cells', 'Electroporation Num Cells', 'Normalization Stop Time Median', 'Normalization Stop Time P10', 'Normalization Stop Time P90', 'Num Single Pores Deactivated By Tagflow Manual', 'Num Total Cells Deactivated', 'Reference Window Upper Bound', 'Sequencing Lifetime Mean', 'Sequencing Lifetime P10', 'Sequencing Lifetime P90', 'Wet Test Num Cells', 'Num Align High Quality Reads', 'High Quality Reads Tn', 'High Quality Reads Tp', 'Ubfc Subsamp Cells', 'Lifetime After Tag Flow Align Hqr P10', 'Lifetime After Tag Flow Align Hqr Median', 'Sequencing Lifetime Align Hqr P90', 'Sequencing Lifetime Align Hqr P10', 'Bilayerformation Bilayers', 'Cell Mask Protobilayers Sum', 'Drytest Num Cells', 'Ubfc Subsamp Percent', 'File Date', 'Gpu Subsampling Percent', 'Align Mean Called Homopolymer 5 A', 'Align Median Called Homopolymer 5 G', 'Align Median Called Homopolymer 5 T', 'Align Median Called Homopolymer 4 A', 'Align Median Called Homopolymer 4 C', 'Align Median Called Homopol

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [49]:
X_drop = X_num.drop(to_drop, axis=1)

In [50]:
X_drop.head()

Unnamed: 0,Pct Cells Sampled,Num Cells Processed,Median Sequencing Lifetime,Total UBF Size,P Folder Number,GPU Processing Used,Bilayers (3.0+),Hz,Lifetime After Tag Flow Hqr P10,Lifetime After Tag Flow P10,...,Mode Mt Base Call Read Length Unthreaded At Start Or End Of Bright Cycle,Std Mt Base Call Read Length Unthreaded At Start Or End Of Bright Cycle,Num Good Mt,Avg Mt Base Call Read Length Bed Exclude Filename Is Qc Mt,Avg Mt Base Call Read Length Bed High Coverage Filename Is Align Hqmt,Avg Mt Base Call Read Length Bed Include Filename Mt End Adapter Found,Avg Mt Qc Align Edit Percent Identical Full Read Bed Include Filename Is Good Mt,Frac Oc Bump,Frac Oc Med Filt Bump,Avg Mt Qc Bed Gtoa Mismatches Highq Rate
0,0.002506,0.080002,0.53943,0.038936,0.000883,0.021526,0.2402763,0.007602,0.059566,0.088863,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.11825,0.11399
1,0.002506,0.080002,0.53943,0.038936,0.000883,0.021526,6.687208e-07,0.007602,0.053566,0.081759,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.11825,0.11608
2,0.002506,0.080002,0.53943,0.038936,0.000883,0.021526,8.02465e-07,0.007602,0.068816,0.063036,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.11825,0.110262
3,0.002506,0.080002,0.53943,0.038936,0.000883,0.0,0.2582396,0.007602,0.048325,0.053535,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.11825,0.108398
4,0.002506,0.080002,0.53943,0.003752,0.0,0.021526,0.2541078,0.085803,0.167676,0.370771,...,0.580311,0.338184,0.098752,0.607122,0.663887,0.694284,0.946363,0.000947,0.019888,0.271412


In [51]:
X_str=X.select_dtypes(exclude=np.number)
X_str

Unnamed: 0,Run Name,Type of Subsampling,Station,Analysis Start Date/Time,Run Group,User Name,Token,Template,Flow Cell Type,Analysis End Date/Time,...,Created At,Labcodes Version,SBX Pore Stock Name,SBX Pore Name,Tag Order,Target Reference Genome,Run Type,Waveform Params File,Reagent Params File,Is Single Point Experiment Sbx
0,220510_SEA-HTP_03_fm-034_WWY13R06C05_cycle01,ACAP full,fm-034,2022-05-12 06:35:04,SEA-HTP,Svetlana Kritzer,220510_066,HG001-maternal-v1,[POR] FMI GP615 C276 4x2M Qbert2M,2022-05-12 09:07:02,...,2022-05-11 23:35:21,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.05555555555555555
1,220510_SEA-HTP_04_fm-035_WWY19R03C04_cycle01,ACAP full,fm-035,2022-05-12 06:34:33,SEA-HTP,Svetlana Kritzer,220510_068,HG001-maternal-v1,[POR] FMI GP615 C276 4x2M Qbert2M,2022-05-12 09:53:36,...,2022-05-11 23:34:50,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.05555555555555555
2,220510_SEA-HTP_03_fm-028_WWY13R04C08_cycle01,ACAP full,fm-028,2022-05-12 06:34:02,SEA-HTP,Svetlana Kritzer,220510_067,HG001-maternal-v1,[POR] FMI GP615 C276 4x2M Qbert2M,2022-05-12 09:30:15,...,2022-05-11 23:34:15,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.05555555555555555
3,220510_SEA-HTP_02_fm-027_WWY13R05C02_cycle01,ACAP full,fm-027,2022-05-12 06:33:02,SEA-HTP,Svetlana Kritzer,220510_065,HG001-maternal-v1,[POR] FMI GP615 C276 4x2M Qbert2M,2022-05-12 11:35:28,...,2022-05-11 23:33:09,6.0.3,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.05555555555555555
4,220511_CDI-HTP_02_fm-004_WWV21R01C05_cycle01,ACAP full,fm-004,2022-05-12 05:40:07,CDI-HTP,Diego Moreno Tezanos,220511_018,Strep,[POR] FMI GP615 C276 4x2M Qbert2M,2022-05-12 19:41:56,...,2022-05-11 22:42:40,6.0.3,210824_PORE_03,P-0445,CTAG,not_genomic,Both,/home/genia/projects/labcodes_branch/NSParamet...,/home/genia/projects/labcodes_branch/NSParamet...,0.05555555555555555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5834,211203_SEA-HTP_01_fm-024_WXX00R00C00_cycle01,1k,fm-024,2022-01-28 06:22:51,SEA-HTP,Cynthia Cech,210428_000,HG001-maternal-v1,FKM 8M B0 (DaPro),2022-01-28 12:49:58,...,2022-01-27 22:22:52,5.3.6,200618_01,P-0320,CTAG,HG001-maternal-v1,GPU ONLY,/home/genia/projects/labcodes_branch/parameter...,/home/genia/projects/labcodes_branch/parameter...,0.05555555555555555
5835,220104_SEA-HTP_03_fm-033_WVN16R02C05_cycle01,1k,fm-033,2022-01-28 06:22:21,SEA-HTP,Svetlana Kritzer,220104_026,HG001-maternal-v1,FKM 8M B0 (DaPro),2022-01-28 07:58:18,...,2022-01-27 22:22:22,5.3.6,200618_01,P-0320,CTAG,HG001-maternal-v1,ACAP ONLY,/home/genia/projects/labcodes_branch/parameter...,/home/genia/projects/labcodes_branch/parameter...,0.05555555555555555
5836,220127_TI-RED-HTP_01_fm-050_WXX00R00C00_cycle01,ACAP full,fm-050,2022-01-28 05:57:03,TI-RED-HTP,Masa Absec,210428_000,HG001-maternal-v1,FKM 8M B0 (DaPro),2022-01-28 09:00:24,...,2022-01-27 21:57:05,5.3.6,200618_01,P-0320,CTAG,HG001-maternal-v1,GPU ONLY,/home/genia/projects/labcodes_branch/parameter...,/home/genia/projects/labcodes_branch/parameter...,0.05555555555555555
5837,220125_TI-RED-HTP_01_fm-050_WXX00R00C00_cycle0...,ACAP full,fm-050,2022-01-27 17:05:52,TI-RED-HTP,Masa Absec,210428_000,HG001-maternal-v1,FKM 8M B0 (DaPro),2022-01-27 21:39:03,...,2022-01-27 09:05:54,5.3.6,200618_01,P-0320,CTAG,HG001-maternal-v1,GPU ONLY,/home/genia/projects/labcodes_branch/parameter...,/home/genia/projects/labcodes_branch/parameter...,0.05555555555555555


In [69]:
X_ohc = X_str.copy()


In [70]:
X_ohc_f = pd.DataFrame()
# for all categorical variables we selected
def top_x(df2,variable,top_x_labels):
    for label in top_x_labels:
        X_ohc_f[variable+'_'+label] = np.where(df2[variable]==label,1,0)

for feat in X_ohc.columns:
    # make list with top 3 variables
    top_3 = [x for x in X_str[feat].value_counts().sort_values(ascending=False).head(3).index]
    top_3

    #encode Nighborhood into the 3 most frequent categories
    top_x(X_ohc,feat,top_3)
    # display data
    X_ohc_f.head()

  X_ohc_f[variable+'_'+label] = np.where(df2[variable]==label,1,0)


In [76]:
#concatenate one hot encoded features and filtered numerical features
X_full = pd.concat([X_ohc_f, X_drop], sort=False, axis=1)  
X_full

Unnamed: 0,Run Name_201203_TI-RED-HTP_03_fm-024_WWI06R04C04_cycle01,Run Name_201120_SEA-HTP_03_fm-032_WWA05R05C04_cycle01,Run Name_211207_SEA-HTP_01_fm-032_WXX00R00C00_cycle01,Type of Subsampling _ACAP full,Type of Subsampling _1k,Type of Subsampling _0.5p,Station_fm-032,Station_fm-024,Station_fm-034,Analysis Start Date/Time_2022-03-24 01:20:01,...,Mode Mt Base Call Read Length Unthreaded At Start Or End Of Bright Cycle,Std Mt Base Call Read Length Unthreaded At Start Or End Of Bright Cycle,Num Good Mt,Avg Mt Base Call Read Length Bed Exclude Filename Is Qc Mt,Avg Mt Base Call Read Length Bed High Coverage Filename Is Align Hqmt,Avg Mt Base Call Read Length Bed Include Filename Mt End Adapter Found,Avg Mt Qc Align Edit Percent Identical Full Read Bed Include Filename Is Good Mt,Frac Oc Bump,Frac Oc Med Filt Bump,Avg Mt Qc Bed Gtoa Mismatches Highq Rate
0,0,0,0,1,0,0,0,0,1,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.113990
1,0,0,0,1,0,0,0,0,0,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.116080
2,0,0,0,1,0,0,0,0,0,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.110262
3,0,0,0,1,0,0,0,0,0,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.108398
4,0,0,0,1,0,0,0,0,0,0,...,0.580311,0.338184,0.098752,0.607122,0.663887,0.694284,0.946363,0.000947,0.019888,0.271412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5834,0,0,0,0,1,0,0,1,0,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.271412
5835,0,0,0,0,1,0,0,0,0,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.271412
5836,0,0,0,1,0,0,0,0,0,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.271412
5837,0,0,0,1,0,0,0,0,0,0,...,0.486503,0.342082,0.120325,0.607122,0.663887,0.694284,0.946363,0.081276,0.118250,0.271412


In [77]:
#Save final cleaned X and Y data sets
X_full.to_csv('X_seq.csv')
y.to_csv('Y_seq.csv')

In summary, this dataset had a large number of similar features as well as a mix of categorical and numerical data types. The approach used to clean the data was to replace the missing values with the mean, normalize the numerical features, and trim down the number of features. Methods used for feature selection includes filtering by variance and correlation. Finally, categorical features were one hot encoded, but limited to the top 3 selections to keep the number of features low. 