### Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import time
from mrmr import mrmr_regression
sns.set_style('whitegrid')
set_config(transform_output = "pandas")

### Reading the training data

In [2]:
train_prep=pd.read_csv('./train_prep1.csv')
target_1=pd.read_csv('./target_1.csv')

In [3]:
len(train_prep.columns)

41

In [4]:
train_prep=train_prep.drop(["Unnamed: 0"],axis=1)

In [5]:
target_1=target_1.drop(['Unnamed: 0'],axis=1)

In [6]:
def select_train_split(X,y,k):
    selected_features = mrmr_regression(X=X, y=y, K=k)
    train_X, test_X,train_y,test_y= train_test_split(
    X[selected_features],y, train_size=0.95, shuffle=False)
    return train_X,test_X,train_y,test_y

In [7]:
def generate_random_search(X,y,k):
    train_X,test_X,train_y,test_y=select_train_split(X,y,k)
    train, val, trainy, valy = train_test_split(
    train_X, train_y, train_size=0.9,shuffle=False
)
    xgb_params = {
    "n_estimators": [10, 50, 100, 200],
    "subsample": [0.6, 0.8, 1],
    "learning_rate": [0.01, 0.1, 0.5, 1],
    "gamma": [0.01, 0.1, 1, 5],
    "alpha": [0, 0.1, 0.5]
}

    xgb_regressor = xgb.XGBRegressor()
    random_search = RandomizedSearchCV(estimator=xgb_regressor, param_distributions=xgb_params,
                                   scoring='neg_mean_squared_error', n_iter=10, cv=3, verbose=1)
    random_search.fit(train, trainy)
    return random_search

In [8]:
k_list = [20, 25, 30, 35]
results = []
for k in k_list:
    score = generate_random_search(train_prep, target_1,k).best_score_
    results.append({'k_value': k, 'score': score})
df = pd.DataFrame(results)
df=df['score'].sort_values(ascending=False)


100%|██████████| 20/20 [00:58<00:00,  2.93s/it]


Fitting 3 folds for each of 10 candidates, totalling 30 fits


100%|██████████| 25/25 [00:58<00:00,  2.34s/it]


Fitting 3 folds for each of 10 candidates, totalling 30 fits


100%|██████████| 30/30 [01:00<00:00,  2.02s/it]


Fitting 3 folds for each of 10 candidates, totalling 30 fits


100%|██████████| 35/35 [01:26<00:00,  2.48s/it]


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [10]:
results

[{'k_value': 20, 'score': -55889.60508412733},
 {'k_value': 25, 'score': -47321.22890232056},
 {'k_value': 30, 'score': -55145.836657613334},
 {'k_value': 35, 'score': -42574.11596227713}]

In [11]:
score = generate_random_search(train_prep, target_1,40).best_score_
print("score :",score)

100%|██████████| 40/40 [01:14<00:00,  1.86s/it]


Fitting 3 folds for each of 10 candidates, totalling 30 fits
score : -40843.05399054429


In [12]:
random_search = generate_random_search(train_prep, target_1,40)

100%|██████████| 40/40 [01:40<00:00,  2.51s/it]


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
import xgboost as xgb

dtrain = xgb.DMatrix(train, label=trainy)
dval = xgb.DMatrix(val, label=valy)
dtest = xgb.DMatrix(test_X, label=test_y)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',  
    'max_depth': 5, 
    'learning_rate': 0.1, 
    'subsample': 0.8,  
    'colsample_bytree': 0.8  
}

num_round = 1000
watchlist = [(dtrain, 'train'), (dval, 'validation')] 
bst = xgb.train(params, dtrain, num_round, evals=watchlist)
y_pred = bst.predict(dtest)
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(test_y, y_pred, squared=False)
print(f"Root Mean Squared Error on Test Set: {rmse}")