This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

To do:

1) Write unit tests for iFeatureOmega
2) time trial for different descriptors - DONE
3) Figure out how to append meso and thermo descriptors
4) Make sure protein length is in training
5) try changing subject_align_len to subject_align_len/m_protein_len
6) log(ratio) for proteins approacing zero or infinity
    take distributions to assess for this.
    Update -- taking log and then droping infinites and NaN improves score to 0.86 <-- cannot repeat this

In [44]:
import duckdb

con = duckdb.connect(path_to_db)
cmd = """SELECT list_of_column_names FROM fafsa_final"""
df = con.execute(cmd).df()

NameError: name 'path_to_db' is not defined

In [None]:
output_df.to_parquet(path)
cmd = f"""CREATE OR REPLACE TABLE table_name AS SELECT * FROM '{path}'"""
con.execute(cmd)

 protein_pair_cmd = """CREATE OR REPLACE TABLE fafsa_protein_pairs AS
                          SELECT 
                          meso_pid,
                          thermo_pid,
                          bit_score,
                          local_gap_compressed_percent_id,
                          scaled_local_query_percent_id,
                          scaled_local_symmetric_percent_id,
                          query_align_len,
                          query_align_cov,
                          subject_align_len,
                          subject_align_cov,
                          FROM protein_pairs USING SAMPLE 100
"""

In [93]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [94]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [95]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [96]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    df = dataframe.reset_index()
    
    def compute_descriptor_ratio(fasta_files, descriptors=[]):
        """
        Generates dictionary of descriptors for each of the two input sequences.
        Computes the difference between each instance of a descriptor.
        
        Parameters
        ----------
        List of two fasta files (str) and list of descriptors (str).

        Returns
        -------
        Dictionary with difference between descriptors for each of the 
        input sequences.
        """
        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)

        feature_dict = {}

        for key in desc_a:
                        
            if 'GDPC type 2' in key:
                feature_dict[key] = np.log(desc_a[key] - desc_b[key])
                
            elif 'CKSAAGP type 2' in key:
                feature_dict[key] = np.log(desc_a[key] - desc_b[key])
                
            else:
                feature_dict[key] = (desc_a[key] / desc_b[key])

        return feature_dict
    
    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)

    for desc in descriptors:

        feature_dict[desc].index = feature_dict[desc].index.astype(int)
        features = feature_dict[desc].reset_index()

        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    return df

In [97]:
#iFeature properties

protein = iFeatureOmegaCLI.iProtein('meso_50k.fasta')
protein.display_feature_types()


        ----- Available feature types ------        
        
        AAC                                                Amino acid composition
        EAAC                                               Enhanced amino acid composition
        CKSAAP type 1                                      Composition of k-spaced amino acid pairs type 1 - normalized
        CKSAAP type 2                                      Composition of k-spaced amino acid pairs type 2 - raw count
        DPC type 1                                         Dipeptide composition type 1 - normalized
        DPC type 2                                         Dipeptide composition type 2 - raw count
        TPC type 1                                         Tripeptide composition type 1 - normalized
        TPC type 2                                         Tripeptide composition type 1 - raw count
        CTDC                                               Composition
        CTDT                                      

In [98]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [99]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [100]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [101]:
target = pd.read_csv('protein_match_50k')

In [102]:
target.drop(columns=["Unnamed: 0"], inplace=True)

In [103]:
target['protein_match'].value_counts()

Yes    40296
No      8554
Name: protein_match, dtype: int64

In [104]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = target[target['protein_match'] == 'Yes']
minority_class = target[target['protein_match'] == 'No']

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

In [105]:
balanced_data

Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
48351,101743711,Yes,0.666667
18370,41958716,Yes,0.666667
7120,47936077,Yes,1.000000
21689,145409638,Yes,0.666667
1854,29480676,Yes,0.400000
...,...,...,...
48833,70996712,No,0.285714
48839,14456723,No,0.250000
48845,78849058,No,0.250000
48847,161110219,No,0.250000


In [106]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(17108, 32)

In [107]:
"""
this list comes from a combination of reading through the features and determining which might be useful
and timing some of the feature generations. those that took more than 30ish seconds were eliminated
Also removed those that have really high dimensionality (>4000)
"""

feature_list = ['AAC', 'GAAC', 'DistancePair',
               'CTDC', 'CTDT', 'CTDD', 'CTriad', 'GDPC type 1', 'GDPC type 2',
                'CKSAAGP type 1', 'CKSAAGP type 2', 'PseKRAAC type 2', 'PseKRAAC type 3A','PseKRAAC type 7',
                'PseKRAAC type 9', 'Geary','APAAC', 'QSOrder']

In [108]:
# df = create_new_dataframe(df, 'seq_50k.fasta', descriptors= ['EGAAC', 'CKSAAGP type 1', 'CKSAAGP type 2',
#                                                             'GDPC type 1', 'GDPC type 2', 'TPC type 1', 'TPC type 2',
#                                                             'GTPC type 1', 'GTPC type 2', 'DPC type 1', 'DPC type 2',
#                                                             'Moran', 'Geary', 'NMBroto', 'AC', 'CC', 'ACC',
#                                                             'PAAC', 'APAAC'])

In [109]:
df = create_new_dataframe(df, ['seq_50k_a.fasta', 'seq_50k_b.fasta'], descriptors=[feature for feature in feature_list])

File imported successfully.
File imported successfully.


  df = pd.merge(
  df = pd.merge(
  df = pd.merge(
  df = pd.merge(
  df = pd.merge(
  df = pd.merge(
  df = pd.merge(
  df = pd.merge(


Split data into dev and test, and then split that into train and validation.

In [110]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0', 'thermo_index','m_protein_seq', 't_protein_seq', 
                       'm_protein_desc', 't_protein_desc'])

In [111]:
df=df.drop(columns=['index_y', 'index_x'])
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0.333333,0.267380,0.254453,148,0.791444,163,0.791262,127,206,187,...,0.982467,1.768440,0.884220,1.326330,0.986566,0.936489,1.095793,0.991611,0.979039,1.031788
1,0.268482,0.190083,0.189300,248,0.683196,271,0.740437,132,366,363,...,1.094398,0.926029,0.771691,0.956897,0.996233,0.932948,1.078083,1.006552,0.982711,1.012484
2,0.333333,0.295082,0.291498,218,0.893443,221,0.884000,158,250,244,...,0.757039,0.000000,1.419449,0.716893,1.047843,0.959824,0.992939,1.031137,0.980542,0.989400
3,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,0.840951,1.324497,0.378428,1.000131,1.035213,0.999638,1.027001,0.972217,1.003861,1.025202
4,0.331915,0.245283,0.242613,239,0.751572,238,0.732308,202,325,318,...,0.398850,0.864175,0.439943,1.639541,1.072840,0.907991,1.008479,1.081410,0.899625,1.023937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.331707,0.294372,0.282158,202,0.874459,218,0.868526,232,251,231,...,1.335836,0.445279,1.781114,0.805742,1.058559,0.895396,1.073475,0.988015,0.990166,1.023015
17104,0.304878,0.188679,0.163666,166,0.626415,170,0.491329,125,346,265,...,0.838998,0.671198,1.957662,1.082578,0.993621,0.976925,1.090877,0.991223,0.994509,1.015135
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,271,353,...,1.685137,11.234249,0.561712,1.235767,1.037673,0.938086,1.001577,1.014846,0.996361,0.989802
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,331,338,...,1.093541,3.107960,0.690658,0.897855,0.961526,1.004391,1.030476,1.010615,1.009159,0.981011


In [86]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'm_protein_len', 't_protein_len',
       ...
       'QSOrder_Grantham.Xr.T', 'QSOrder_Grantham.Xr.W',
       'QSOrder_Grantham.Xr.Y', 'QSOrder_Grantham.Xr.V',
       'QSOrder_Schneider.Xd.1', 'QSOrder_Schneider.Xd.2',
       'QSOrder_Schneider.Xd.3', 'QSOrder_Grantham.Xd.1',
       'QSOrder_Grantham.Xd.2', 'QSOrder_Grantham.Xd.3'],
      dtype='object', length=1035)

In [50]:
df=df.drop(columns=['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'm_protein_len', 't_protein_len'])

In [51]:
df

Unnamed: 0,protein_match,AAC_A,AAC_C,AAC_D,AAC_E,AAC_F,AAC_G,AAC_H,AAC_I,AAC_K,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,Yes,1.271459,0.000000,1.445448,0.957334,1.424034,0.642422,0.254292,0.762876,0.881545,...,0.730477,0.996105,0.996105,1.261733,0.980941,1.056692,0.967775,0.910832,1.147138,0.966570
1,Yes,1.072934,0.000000,0.431090,1.149573,1.245370,1.204314,inf,0.766382,1.293269,...,0.849359,2.807602,0.656323,1.323584,0.950250,1.057052,0.957343,0.921186,1.106142,0.987225
2,Yes,1.348332,1.816901,0.961889,0.778672,0.613204,0.965553,0.218028,1.557344,0.915718,...,1.368059,inf,0.619853,0.879467,0.996096,1.021678,0.951924,1.040048,0.978314,0.983529
3,No,1.716476,0.430400,1.434667,0.843922,1.229714,0.938051,0.807000,0.896667,0.239111,...,0.840951,1.324497,0.378428,1.000131,1.035213,0.999638,1.027001,0.972217,1.003861,1.025202
4,Yes,0.559396,4.251412,1.400465,0.890772,1.700565,0.818791,1.020339,3.231073,5.526836,...,1.199196,inf,0.782085,0.857770,0.973425,1.016259,1.037198,0.967060,0.991860,1.044636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,Yes,0.804878,1.000000,1.636364,0.777778,0.800000,0.666667,0.900000,1.200000,2.000000,...,0.987935,inf,1.637893,1.021024,1.107671,0.930671,0.953020,1.141282,0.910084,0.959902
17104,No,1.632058,1.224044,0.504018,0.612022,1.224044,1.550455,3.672131,0.544019,0.612022,...,1.182575,inf,0.000000,1.429841,0.912084,0.981403,1.017698,0.930389,0.962463,1.113306
17105,No,1.056813,0.868389,1.915563,0.754127,1.860833,0.844919,0.868389,0.592083,0.781550,...,1.685137,11.234249,0.561712,1.235767,1.037673,0.938086,1.001577,1.014846,0.996361,0.989802
17106,No,0.932353,0.170191,1.340257,0.970091,1.225378,1.152909,0.714804,2.135128,1.633837,...,1.093541,3.107960,0.690658,0.897855,0.961526,1.004391,1.030476,1.010615,1.009159,0.981011


In [142]:
#maybe divide all of the AAC/GAAC columns by the protein length 

In [87]:
df.shape

(17108, 1035)

In [112]:
df = df.replace([np.inf, -np.inf], np.nan)

In [114]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
                                    ..
QSOrder_Schneider.Xd.2               0
QSOrder_Schneider.Xd.3               0
QSOrder_Grantham.Xd.1                0
QSOrder_Grantham.Xd.2                0
QSOrder_Grantham.Xd.3                0
Length: 304, dtype: int64


array([0])

In [107]:
df.columns[36:]

Index(['QSOrder_Grantham.Xd.2', 'QSOrder_Grantham.Xd.3'], dtype='object')

In [110]:
log_cols = df.columns[36:]
df[log_cols] = df[log_cols].apply(lambda x: np.log(x))

In [113]:
df = df.dropna(axis=1, how='any')

In [88]:
nan_counts = df.isna().sum()
nan_counts

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
                                    ..
QSOrder_Schneider.Xd.2               0
QSOrder_Schneider.Xd.3               0
QSOrder_Grantham.Xd.1                0
QSOrder_Grantham.Xd.2                0
QSOrder_Grantham.Xd.3                0
Length: 1035, dtype: int64

In [115]:
df.shape

(17108, 304)

Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [123]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


100%|██████████| 20/20 [00:07<00:00,  2.83it/s]


['type3A_T1.G.1_T1.G.1_gap2',
 'CTDC_solventaccess.G1',
 'Geary_BEGF750102.lag2',
 'type7_T1.G.2_T1.G.2_gap2',
 'type9_T1.G.1_T1.G.1_gap2',
 'CTDC_secondarystruct.G1',
 'type2_T1.G.1_T1.G.1_gap2',
 'APAAC_Pc1.G',
 'CTDD_normwaalsvolume.3.residue50',
 'CTDT_secondarystruct.Tr1221',
 'APAAC_Pc1.V',
 'QSOrder_Schneider.Xd.3',
 'CTDC_hydrophobicity_ARGP820101.G2',
 'CTDD_hydrophobicity_ENGD860101.3.residue75',
 'CTDD_charge.2.residue0',
 'CTDC_hydrophobicity_ENGD860101.G2',
 'CTDD_secondarystruct.1.residue100',
 'CTDD_secondarystruct.1.residue0',
 'APAAC_Pc2.Hydrophobicity.1',
 'GDPC_alphaticr.alphaticr_x']

In [124]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,APAAC_Pc1.V,QSOrder_Schneider.Xd.3,CTDC_hydrophobicity_ARGP820101.G2,CTDD_hydrophobicity_ENGD860101.3.residue75,CTDD_charge.2.residue0,CTDC_hydrophobicity_ENGD860101.G2,CTDD_secondarystruct.1.residue100,CTDD_secondarystruct.1.residue0,APAAC_Pc2.Hydrophobicity.1,GDPC_alphaticr.alphaticr_x
0,0.333333,0.267380,0.254453,148,0.791444,163,0.791262,127,206,187,...,1.458032,1.095793,0.974189,0.895744,0.907767,0.960391,1.000000,0.907767,-0.418672,1.017965
1,0.268482,0.190083,0.189300,248,0.683196,271,0.740437,132,366,363,...,1.036049,1.078083,1.111504,1.147146,0.991803,1.248937,1.002762,0.991803,0.721630,1.405023
2,0.333333,0.295082,0.291498,218,0.893443,221,0.884000,158,250,244,...,0.767388,0.992939,0.847094,0.986844,0.976000,0.903319,1.012448,0.976000,27.063868,0.948021
3,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,1.320507,1.027001,1.152081,0.958679,0.717333,1.120330,1.003731,0.717333,-0.547390,1.328926
4,0.331915,0.245283,0.242613,239,0.751572,238,0.732308,202,325,318,...,1.753038,1.008479,1.048352,0.990743,0.978462,0.881497,1.000000,0.978462,0.579979,1.188936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.331707,0.294372,0.282158,202,0.874459,218,0.868526,232,251,231,...,0.897562,1.073475,0.965212,0.970336,0.920319,1.057680,1.008734,0.920319,-0.310949,1.070204
17104,0.304878,0.188679,0.163666,166,0.626415,170,0.491329,125,346,265,...,1.287880,1.090877,1.117224,0.944200,0.765896,0.968633,0.988439,0.765896,0.340536,0.869565
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,271,353,...,1.112860,1.001577,1.018956,1.170603,1.302583,1.089045,0.995440,1.302583,-3.313974,0.835708
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,331,338,...,0.872070,1.030476,0.857453,1.008222,1.021148,0.976361,1.015015,1.021148,-2.922809,0.940892


In [125]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(14541, 31)
(2567, 31)
(12359, 31)
(2182, 31)


In [126]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [127]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(14541, 30) (2567, 30) (14541, 1) (2567, 1)


In [128]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [139]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [140]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier(n_estimators=150, max_depth=None, max_samples=0.5,
                                                max_features=0.5, min_weight_fraction_leaf=0.000215,
                                               min_samples_split=10)

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [141]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.775435380384968
['Yes' 'No' 'Yes' ... 'Yes' 'Yes' 'Yes']


In [142]:
%matplotlib notebook

In [143]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()