# In [None]:
import numpy as np
import re
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from joblib import parallel_backend
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Alphabet used for one-hot encoding: a symbol's position in this string is
# its column index in the encoded matrix (presumably amino-acid one-letter
# codes -- TODO confirm).
vocab = "ARNDCQEGHILKMFPSTWYVXU"

def one_hot_pad_seqs(s, length, vocab=vocab):
    """One-hot encode ``s`` into a fixed-size, flattened vector.

    The sequence is truncated to ``length`` symbols; shorter sequences are
    padded with all-zero rows. Symbols that are not in ``vocab`` also produce
    an all-zero row instead of raising ``KeyError``, so a stray letter in the
    input does not abort feature extraction.

    Args:
        s: Iterable of symbols (typically a string).
        length: Number of positions to encode.
        vocab: Ordered collection of known symbols; defaults to the
            module-level ``vocab``.

    Returns:
        A 1-D float array of size ``length * len(vocab)``, laid out
        position-major (all columns of position 0, then position 1, ...).
    """
    column_of = {symbol: col for col, symbol in enumerate(vocab)}
    embedded = np.zeros([length, len(vocab)])
    # zip() with range(length) stops at whichever is shorter, which gives
    # both truncation and implicit zero padding.
    for row, symbol in zip(range(length), s):
        col = column_of.get(symbol)
        if col is not None:  # unknown symbols keep their all-zero row
            embedded[row, col] = 1
    return embedded.flatten()

def get_seq(df, length=100):
    """Build the feature matrix for every sequence in ``df``.

    Each row is the flattened one-hot encoding returned by
    ``one_hot_pad_seqs`` followed by one extra feature: the number of
    non-overlapping occurrences of the substring ``"GKYI"`` divided by the
    sequence length (0.0 for an empty sequence).

    Args:
        df: Object exposing a ``sequence`` column of strings
            (``df.sequence.values.tolist()`` is iterated).
        length: Number of positions to one-hot encode per sequence.

    Returns:
        A NumPy array with one feature row per sequence.
    """
    X = []
    for s in df.sequence.values.tolist():
        one_hot_encoded = one_hot_pad_seqs(s, length)
        # The loaders in this file strip every non-letter, which can leave an
        # empty string; guard the division so that case yields 0.0 instead of
        # raising ZeroDivisionError.
        # NOTE(review): this counts the 4-letter substring "GKYI", not the
        # individual letters G/K/Y/I -- confirm that is the intended feature.
        gkyi_content = s.count("GKYI") / len(s) if s else 0.0
        X.append(np.concatenate([one_hot_encoded, [gkyi_content]]))
    return np.array(X)

def load_train_data(path, val_split=False):
    """Read the training CSV and normalise its ``sequence`` column.

    Every sequence is upper-cased and stripped of any character outside A-Z.

    Args:
        path: Location handed to ``pd.read_csv``.
        val_split: When true, return an 80/20 ``(train, val)`` split produced
            by ``train_test_split`` with ``random_state=42`` instead of the
            single cleaned frame.

    Returns:
        The cleaned DataFrame, or a ``(train, val)`` tuple when ``val_split``
        is true.
    """
    def strip_non_letters(raw):
        # remove special characters
        return re.sub(r"[^A-Z]", "", raw.upper())

    frame = pd.read_csv(path)
    frame.sequence = frame.sequence.apply(strip_non_letters)

    if not val_split:
        return frame

    train, val = train_test_split(frame, test_size=0.2, random_state=42)
    return train, val

def load_test_data(path):
    """Read the test CSV and normalise its ``sequence`` column.

    Every sequence is upper-cased and stripped of any character outside A-Z.

    Args:
        path: Location handed to ``pd.read_csv``.

    Returns:
        The DataFrame with its ``sequence`` column cleaned.
    """
    non_letters = re.compile(r"[^A-Z]")
    table = pd.read_csv(path)
    # remove special characters
    cleaned = table.sequence.apply(lambda raw: non_letters.sub("", raw.upper()))
    table.sequence = cleaned
    return table

def main(
    train_path="/Users/hinagaur/Documents/MS_BINF/ML/kaggle/train.csv",
    test_path="/Users/hinagaur/Documents/MS_BINF/ML/kaggle/test.csv",
    output_path="prediction.csv",
):
    """Tune an XGBoost regressor on the training set and write test predictions.

    Steps: load and featurise both CSVs, run a randomized hyperparameter
    search, report 5-fold cross-validated RMSE for the best estimator, report
    R-squared/MAE/MSE/RMSE on the training data, then predict the test set and
    write ``id,target`` rows to ``output_path``.

    Args:
        train_path: CSV with ``sequence`` and ``target`` columns.
        test_path: CSV with ``id`` and ``sequence`` columns.
        output_path: Destination of the prediction CSV.
    """
    train = load_train_data(train_path, val_split=False)
    test = load_test_data(test_path)

    print(len(train), len(test))

    train_X, test_X = get_seq(train), get_seq(test)

    train_y = np.array(train.target.values.tolist())
    test_id = test.id.values.tolist()

    # Define the hyperparameter grid
    param_grid = {
        'max_depth': [3, 6, 9, 12],
        'learning_rate': [0.1, 0.01, 0.001, 0.0001],
        'n_estimators': [20, 30, 40, 60],
    }

    # Initialize XGBoost regressor
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

    # Initialize Randomized Search with threading
    with parallel_backend('threading', n_jobs=-1):
        # Seeded like the other randomized steps in this file so the sampled
        # parameter combinations are reproducible between runs.
        random_search = RandomizedSearchCV(
            estimator=xgb_reg,
            param_distributions=param_grid,
            n_iter=10,
            cv=3,
            random_state=42,
        )

        # Train XGBoost model with Randomized Search
        random_search.fit(train_X, train_y)

    # Best parameters
    print("Best Parameters:", random_search.best_params_)

    best_model = random_search.best_estimator_

    # Cross-validation. Without an explicit ``scoring`` argument,
    # cross_val_score falls back to the estimator's own score (R^2 for
    # regressors), so the value printed as "Mean RMSE" was not an RMSE.
    # scikit-learn reports the RMSE scorer negated; flip the sign back.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_rmse_scores = cross_val_score(
        best_model,
        train_X,
        train_y,
        cv=kfold,
        scoring="neg_root_mean_squared_error",
    )
    cv_rmse = -neg_rmse_scores
    print("Cross-validation Scores:", cv_rmse)
    print("Mean RMSE:", np.mean(cv_rmse))

    # The metrics below are computed on train_X, the same data the search was
    # fit on. Predict once and reuse the result for every metric.
    train_pred = best_model.predict(train_X)

    # Calculate R-squared
    r_squared = r2_score(train_y, train_pred)
    print("R-squared:", r_squared)

    # Calculate MAE
    mae = mean_absolute_error(train_y, train_pred)
    print("MAE:", mae)

    # Calculate MSE
    mse = mean_squared_error(train_y, train_pred)
    print("MSE:", mse)

    # Calculate RMSE
    rmse = np.sqrt(mse)
    print("RMSE:", rmse)

    # Make predictions on test set using the best model from Randomized Search
    test_y = random_search.predict(test_X)

    # Write predictions to file
    with open(output_path, "w") as f:
        f.write("id,target\n")
        f.writelines(
            f"{row_id},{pred}\n" for row_id, pred in zip(test_id, test_y)
        )

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
