In [1]:
import pandas as pd 
import numpy as np 
import re

import torch 
import torch.nn as nn

def load_fixed_dataframe(original = "train.csv", updated = "train_updates_20220929.csv", was_fixed = False):
    """Load the training CSV and apply the corrections listed in the update CSV.

    Rows of the update file are interpreted as follows:

    * a row whose ``pH`` is missing marks its ``seq_id`` as bad; every
      training row with that ``seq_id`` is dropped;
    * a row whose ``pH`` is present supplies replacement ``pH`` and ``tm``
      values for the training rows with the same ``seq_id``. When a
      ``seq_id`` occurs several times in the update file, the first non-null
      value of each column is used.

    Parameters
    ----------
    original : str
        Path of the training CSV; it must provide ``seq_id``, ``pH`` and
        ``tm`` columns.
    updated : str
        Path of the update CSV; it must provide ``seq_id``, ``pH`` and
        ``tm`` columns.
    was_fixed : bool, default False
        When true, add a boolean ``was_fixed`` column flagging the rows whose
        ``seq_id`` appears in the update file.

    Returns
    -------
    pandas.DataFrame
        The corrected training frame, re-indexed from 0 after bad rows were
        removed.
    """
    df = pd.read_csv(original)
    updated_df = pd.read_csv(updated)

    missing_ph = updated_df["pH"].isna()
    # One row of replacement values per seq_id; groupby(...).first() keeps the
    # first non-null entry of each column within a group.
    corrections = updated_df[~missing_ph].groupby("seq_id")[["pH", "tm"]].first()
    bad_seqs = updated_df.loc[missing_ph, "seq_id"].to_list()

    df = df[~df["seq_id"].isin(bad_seqs)].reset_index(drop = True)

    # Overwrite tm and pH in one vectorised pass instead of visiting every row
    # in Python. Rows without a correction keep their current values.
    needs_fix = df["seq_id"].isin(corrections.index)
    if needs_fix.any():
        for column in ("tm", "pH"):
            replacement = df["seq_id"].map(corrections[column])
            df[column] = replacement.where(needs_fix, df[column])

    if was_fixed:
        # Bad rows are already gone, so in practice only corrected rows are flagged.
        df["was_fixed"] = df["seq_id"].isin(bad_seqs + corrections.index.to_list())
    return df

# Training data comes from train.csv with the published corrections applied
# (default paths of load_fixed_dataframe); the test set is read unchanged.
train_df = load_fixed_dataframe()
test_df = pd.read_csv("test.csv")

def return_amino_acid_df(df):
    """Add one count column per amino-acid letter to ``df`` and return it.

    For each of the 20 one-letter codes below, a column named after the
    letter is written to ``df``. It holds the number of case-insensitive
    matches of that letter in the row's ``protein_sequence`` value.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.
    """
    residue_letters = "ACDEFGHIKLMNPQRSTVWY"
    sequences = df["protein_sequence"]

    for letter in residue_letters:
        df[letter] = sequences.str.count(letter, flags=re.IGNORECASE)
    return df

# Per-letter count columns for both splits (train first, then test).
train_df, test_df = return_amino_acid_df(train_df), return_amino_acid_df(test_df)

# Sequence length as an extra feature: number of characters in each sequence string.
for frame in (train_df, test_df):
    frame["protein_length"] = frame["protein_sequence"].apply(len)

In [2]:
from sklearn.model_selection import train_test_split

# Features are every column except the target and the identifier/raw-text columns;
# the target is the "tm" column.
X = train_df.drop(columns=["tm", "seq_id", "protein_sequence", "data_source"])
y = train_df["tm"]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Sanity report on the training split.
# The number of targets is the width of y, not its rank: a 1-D target is a
# single target, while a 2-D target has one target per column.
n_targets = 1 if y_train.ndim == 1 else y_train.shape[1]

print("Feature Shapes: ", X_train.shape, "Target Shapes: ", y_train.shape)
print("# of Feature Dimensions: ", X_train.ndim, "\n# of Target Dimensions: ", y_train.ndim)
print("Number of Features: ", X_train.shape[1], "Number of Targets: ", n_targets)

Feature Shapes:  (23184, 22) Target Shapes:  (23184,)
# of Feature Dimensions:  2 
# of Target Dimensions:  1
Number of Features:  22 Number of Targets:  1


In [11]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr

# Gradient-boosted regressor on the tabular features.
# NOTE(review): tree_method="gpu_hist" presumably needs a CUDA-capable GPU and
# may be deprecated in newer XGBoost releases -- confirm against the installed version.
model = XGBRegressor(
    n_estimators=150,
    max_depth=8,
    learning_rate=0.1,
    tree_method="gpu_hist",
)
model.fit(X_train, y_train)

# Predictions on the rows the model was fitted on and on the held-out rows.
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_test)

# Mean absolute error, in the units of the target.
train_mae = mean_absolute_error(y_train, y_pred_train)
print("Mean Absolute error on Train data is:{}".format(train_mae))
val_mae = mean_absolute_error(y_test, y_pred_val)
print("Mean Absolute error on Val data is:{}".format(val_mae))

# Spearman rank correlation between predictions and targets; the whole result
# object (correlation and p-value) is printed.
train_corr = spearmanr(y_pred_train, y_train)
print("Training Correlation Value: {}".format(train_corr))
val_corr = spearmanr(y_pred_val, y_test)
print("Validation Correlation Value: {}".format(val_corr))

Mean Absolute error on Train data is:3.77765233635162
Mean Absolute error on Val data is:5.574443554972829
Training Correlation Value: SpearmanrResult(correlation=0.8238017859126157, pvalue=0.0)
Validation Correlation Value: SpearmanrResult(correlation=0.57679874273538, pvalue=0.0)
