This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

To do:

1) Write unit tests for iFeatureOmega
2) time trial for different descriptors - DONE
3) Figure out how to append meso and thermo descriptors
4) Make sure protein length is in training
5) try changing subject_align_len to subject_align_len/m_protein_len
6) log(ratio) for proteins approacing zero or infinity
    take distributions to assess for this.
    Update -- taking log and then droping infinites and NaN improves score to 0.86 <-- cannot repeat this

In [44]:
import duckdb

con = duckdb.connect(path_to_db)
cmd = """SELECT list_of_column_names FROM fafsa_final"""
df = con.execute(cmd).df()

NameError: name 'path_to_db' is not defined

In [None]:
output_df.to_parquet(path)
cmd = f"""CREATE OR REPLACE TABLE table_name AS SELECT * FROM '{path}'"""
con.execute(cmd)

 protein_pair_cmd = """CREATE OR REPLACE TABLE fafsa_protein_pairs AS
                          SELECT 
                          meso_pid,
                          thermo_pid,
                          bit_score,
                          local_gap_compressed_percent_id,
                          scaled_local_query_percent_id,
                          scaled_local_symmetric_percent_id,
                          query_align_len,
                          query_align_cov,
                          subject_align_len,
                          subject_align_cov,
                          FROM protein_pairs USING SAMPLE 100
"""

In [197]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [198]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [199]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [200]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    def compute_descriptor_ratio(fasta_files, descriptors=[]):
        """
        Generates dictionary of descriptors for each of the two input sequences.
        Computes the difference between each instance of a descriptor.
        
        Parameters
        ----------
        List of two fasta files (str) and list of descriptors (str).

        Returns
        -------
        Dictionary with difference between descriptors for each of the 
        input sequences.
        """
        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)

        feature_dict = {}

        for key in desc_a:
            
            if 'AAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            elif 'GAAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            else:
                feature_dict[key] = desc_a[key] / desc_b[key]

        return feature_dict
    
    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict[desc].index = feature_dict[desc].index.astype(int)
        features = feature_dict[desc].reset_index()

        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    return df

In [201]:
#iFeature properties

# protein = iFeatureOmegaCLI.iProtein('meso_50k.fasta')
# protein.display_feature_types()

In [202]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [203]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [204]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [205]:
target = pd.read_csv('protein_match_50k')

In [206]:
df

Unnamed: 0.1,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,bit_score_16s,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len
0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,875,...,1153.0,27.5,50.0,22.5,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,MPSQITESERIELAERFERDALPLLDQLYSAALRMTRNPADAEDLV...,ECF RNA polymerase sigma factor SigK,sigma-70 family RNA polymerase sigma factor,206,202
1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,11324,...,1014.0,25.0,54.0,29.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,response regulator transcription factor,response regulator transcription factor,233,237
2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,875,...,1138.0,28.0,50.0,22.0,MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAE...,MTPEQIFSGQTAIVTGGASGIGAATVEHIARRGGRVFSVDLSYDSP...,SDR family oxidoreductase,SDR family oxidoreductase,287,252
3,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,875,...,1077.0,28.0,50.0,22.0,MTSGLWERVLDGVWVTIQLLVLSALLATAVSFVVGIARTHRLWIVR...,MAMSRRKRGQLARGIQYAILVIVVVVLALLADWGKIGKAFFDWEAA...,ectoine/hydroxyectoine ABC transporter permeas...,amino acid ABC transporter permease,234,269
4,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,9827,...,991.0,30.0,50.0,20.0,MIISLRRGLRFIRFIVFFAALVYLFYHVLDLFNGWISPVDQYQMPT...,MKRMVWRTLKVFIIFIACTLLFYFGLRFMHLEYEQFHRYEPPEGPA...,YqzK family protein,YqzK family protein,80,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,14963,...,1055.0,28.0,52.5,24.5,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,aminoglycoside phosphotransferase family protein,serine/threonine protein kinase,271,353
49996,49996,0.417989,0.389163,0.392060,190,0.935961,187,0.935000,314,7134,...,1083.0,26.5,52.5,26.0,MFRTGVKAEIGRSLAVVGEAEDVERAVRVVLEQRPDVVLLDVHLPG...,MILEAEPDIVVVGEAGDGEKAVEEARALQPDVVLMDIRMPRKDGVE...,response regulator transcription factor,response regulator transcription factor,200,203
49997,49997,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,7134,...,991.0,28.0,52.5,24.5,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,331,338
49998,49998,0.334764,0.331915,0.329810,232,0.987234,230,0.966387,281,11324,...,907.0,29.0,54.0,25.0,MSESHAGALLSVRGLTAGYGGATALDGVSLTVAAGETVALLGANGA...,MSLLTTSGLTRHFSGIHAVEGVDFTLEAGEIRALIGSNGAGKTTLV...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,238,235


In [207]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = target[target['protein_match'] == 'Yes']
minority_class = target[target['protein_match'] == 'No']

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

#drop Unnamed: 0 and Jaccard_Score
balanced_data = balanced_data.drop(columns=['Unnamed: 0', 'Jaccard_Score'])

In [208]:
balanced_data

Unnamed: 0,prot_pair_index,protein_match
26912,109825691,Yes
36053,122428150,Yes
24472,55395266,Yes
34803,126051739,Yes
37800,137998796,Yes
...,...,...
48833,70996712,No
48839,14456723,No
48845,78849058,No
48847,161110219,No


In [209]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(17108, 31)

In [210]:
"""
this list comes from a combination of reading through the features and determining which might be useful
and timing some of the feature generations. those that took more than 30ish seconds were eliminated
Also removed those that have really high dimensionality (>4000)
"""

feature_list = ['AAC', 'GAAC', 'DistancePair',
               'CTDC', 'CTDT', 'CTDD', 'CTriad', 'GDPC type 1', 'GDPC type 2',
                'CKSAAGP type 1', 'CKSAAGP type 2', 'PseKRAAC type 2', 'PseKRAAC type 3A','PseKRAAC type 7',
                'PseKRAAC type 9', 'Geary','APAAC', 'QSOrder']

In [211]:
# df = create_new_dataframe(df, 'seq_50k.fasta', descriptors= ['EGAAC', 'CKSAAGP type 1', 'CKSAAGP type 2',
#                                                             'GDPC type 1', 'GDPC type 2', 'TPC type 1', 'TPC type 2',
#                                                             'GTPC type 1', 'GTPC type 2', 'DPC type 1', 'DPC type 2',
#                                                             'Moran', 'Geary', 'NMBroto', 'AC', 'CC', 'ACC',
#                                                             'PAAC', 'APAAC'])

In [212]:
df = create_new_dataframe(df, ['seq_50k_a.fasta', 'seq_50k_b.fasta'], descriptors=['AAC'])

File imported successfully.
File imported successfully.


In [213]:
df['norm_bit_score_m'] = df['bit_score']/df['m_protein_len']
df['norm_bit_score_t'] = df['bit_score']/df['t_protein_len']

Split data into dev and test, and then split that into train and validation.

In [214]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference', 
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'thermo_index','m_protein_seq', 't_protein_seq', 
                       'm_protein_desc', 't_protein_desc'])

In [222]:
assert (df.filter(like='index|Unnamed').shape)[1] == 0

In [218]:
df = df.drop(columns=df.columns[df.columns.str.contains('index|Unnamed|pancakes')])

KeyError: 'index_x'

In [73]:
df=df.drop(columns=['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'm_protein_len', 't_protein_len'])

In [26]:
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,AAC_W,AAC_Y,index,GAAC_alphatic,GAAC_aromatic,GAAC_postivecharge,GAAC_negativecharge,GAAC_uncharge,norm_bit_score_m,norm_bit_score_t
0,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,355,387,...,0.002817,-0.018110,180139301,0.030913,-0.031284,-0.035775,-0.021691,0.057837,0.394366,0.361757
1,0.334630,0.299652,0.299130,253,0.881533,256,0.888889,206,288,287,...,0.003448,-0.013998,85953105,-0.002190,0.006666,-0.007150,-0.000121,0.002795,0.715278,0.717770
2,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,0.003797,-0.019073,37128170,0.064287,-0.009299,0.002657,0.011143,-0.068788,0.128000,0.178439
3,0.336100,0.312741,0.302804,241,0.930502,247,0.894928,218,276,259,...,-0.007960,0.010394,9904825,0.066672,0.001245,-0.015332,-0.000364,-0.052221,0.789855,0.841699
4,0.281588,0.257426,0.239631,276,0.910891,304,0.873563,176,348,303,...,0.007767,0.016701,64719917,-0.030272,0.013287,0.024127,0.003898,-0.011039,0.505747,0.580858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.300000,0.300000,0.104803,80,1.000000,79,0.208995,104,378,80,...,-0.004563,0.021892,4681374,-0.047156,0.009656,0.013294,-0.064352,0.088558,0.275132,1.300000
17104,0.331707,0.294372,0.282158,202,0.874459,218,0.868526,232,251,231,...,-0.004674,0.007278,142556425,-0.007106,-0.008468,0.012866,0.014936,-0.012228,0.924303,1.004329
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,271,353,...,0.034068,-0.003951,78849058,0.004045,0.047186,-0.022182,0.030858,-0.059908,0.195572,0.150142
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,331,338,...,0.006105,-0.005667,161110219,0.022408,0.003772,-0.027333,0.014337,-0.013184,0.842900,0.825444


In [19]:
#maybe divide all of the AAC/GAAC columns by the protein length 

In [27]:
df.shape

(17108, 39)

In [75]:
df = df.replace([np.inf, -np.inf], np.nan)

In [28]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
subject_align_len                    0
subject_align_cov                    0
bit_score                            0
m_protein_len                        0
t_protein_len                        0
protein_match                        0
AAC_A                                0
AAC_C                                0
AAC_D                                0
AAC_E                                0
AAC_F                                0
AAC_G                                0
AAC_H                                0
AAC_I                                0
AAC_K                                0
AAC_L                                0
AAC_M                                0
AAC_N                                0
AAC_P                                0
AAC_Q                                0
AAC_R                    

array([0])

In [39]:
assert nan_counts.unique() == [0]

In [107]:
df.columns[36:]

Index(['QSOrder_Grantham.Xd.2', 'QSOrder_Grantham.Xd.3'], dtype='object')

In [110]:
log_cols = df.columns[36:]
df[log_cols] = df[log_cols].apply(lambda x: np.log(x))

In [77]:
df = df.dropna(axis=1, how='any')

In [51]:
nan_counts = df.isna().sum()
nan_counts

protein_match            0
AAC_A                    0
AAC_C                  434
AAC_D                    3
AAC_E                    3
AAC_F                   20
AAC_G                    0
AAC_H                   31
AAC_I                    4
AAC_K                   58
AAC_L                    1
AAC_M                    1
AAC_N                   49
AAC_P                    7
AAC_Q                   20
AAC_R                    1
AAC_S                    0
AAC_T                    2
AAC_V                    0
AAC_W                  392
AAC_Y                   48
index                    0
GAAC_alphatic            0
GAAC_aromatic            2
GAAC_postivecharge       0
GAAC_negativecharge      0
GAAC_uncharge            0
dtype: int64

In [25]:
df.shape

(2210, 56)

Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [26]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


100%|██████████| 20/20 [00:00<00:00, 48.64it/s]


['QSOrder_Schneider.Xr.A',
 'QSOrder_Grantham.Xd.3',
 'QSOrder_Grantham.Xr.R',
 'QSOrder_Schneider.Xd.1',
 'AAC_A',
 'QSOrder_Schneider.Xd.3',
 'QSOrder_Grantham.Xr.A',
 'AAC_I',
 'QSOrder_Schneider.Xr.R',
 'AAC_S',
 'GAAC_alphatic',
 'AAC_E',
 'QSOrder_Grantham.Xd.2',
 'AAC_N',
 'GAAC_negativecharge',
 'QSOrder_Schneider.Xr.L',
 'QSOrder_Schneider.Xr.G',
 'AAC_T',
 'GAAC_uncharge',
 'AAC_R']

In [34]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,GAAC_alphatic,AAC_E,QSOrder_Grantham.Xd.2,AAC_N,GAAC_negativecharge,QSOrder_Schneider.Xr.L,QSOrder_Schneider.Xr.G,AAC_T,GAAC_uncharge,AAC_R
0,0.283582,0.231707,0.240506,133,0.810976,134,0.881579,99,152,164,...,-0.015083,-0.013479,1.036564,0.006579,0.004974,1.329084,0.701461,0.003851,0.059050,-0.034981
1,0.268482,0.190083,0.189300,248,0.683196,271,0.740437,132,366,363,...,0.078422,-0.030461,0.982711,0.002394,-0.006345,0.920360,1.465759,0.010432,-0.035113,-0.030800
2,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,355,387,...,0.030913,-0.020017,0.978314,0.011479,-0.021691,0.886583,1.065708,0.018277,0.057837,-0.010125
3,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,0.064287,-0.009864,1.003861,0.005898,0.011143,1.139824,0.931587,-0.013581,-0.068788,0.016882
4,0.394495,0.346774,0.369099,108,0.870968,106,0.972477,130,109,124,...,-0.028781,-0.030334,0.917208,0.000000,-0.007547,1.961181,1.123222,0.013613,0.063332,-0.031444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,0.241758,0.134557,0.144975,181,0.553517,190,0.678571,107,280,327,...,0.049945,0.007678,1.001902,0.024509,0.004642,0.949185,1.416965,-0.051409,-0.086435,-0.043283
2206,0.321429,0.186391,0.223801,197,0.582840,192,0.853333,195,225,338,...,0.082104,-0.038396,0.931608,0.002998,-0.030901,1.624165,1.373086,0.014872,-0.030743,-0.005786
2207,0.290909,0.162437,0.154589,110,0.558376,112,0.516129,85,217,197,...,-0.074037,0.016959,0.948476,0.008281,0.038995,0.890999,0.712799,0.015626,0.022831,0.000327
2208,0.447368,0.387833,0.373626,229,0.870722,233,0.823322,448,283,263,...,-0.037660,-0.007833,0.961154,-0.009217,0.006610,0.900462,0.840163,-0.021430,0.020557,0.004958


In [29]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(14541, 39)
(2567, 39)
(12359, 39)
(2182, 39)


In [30]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [31]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(14541, 38) (2567, 38) (14541, 1) (2567, 1)


In [32]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [33]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [34]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier(n_estimators=150, max_depth=None, max_samples=0.5,
                                                max_features=0.5, min_weight_fraction_leaf=0.000215,
                                               min_samples_split=10)

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [35]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.770852428964253
['No' 'Yes' 'No' ... 'Yes' 'Yes' 'Yes']


In [36]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fbe5e04d490>