In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_columns', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# The competition data is mounted under one of two dataset slugs. Try the first
# location and fall back to the second only when the files are actually missing;
# any other failure (parse error, interrupt, ...) is allowed to surface.
try:
    train = pd.read_csv('/kaggle/input/housingpricesadvancedregressiontechniques/train.csv')
    test = pd.read_csv('/kaggle/input/housingpricesadvancedregressiontechniques/test.csv')
except FileNotFoundError:
    train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
    test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Name of the column the models below are trained to predict.
target = 'SalePrice'

train

In [None]:
def get_one_hots(df, cat_cols):
    """Append one-hot (dummy) columns for ``cat_cols`` to ``df``.

    Args:
        df (pd.DataFrame): Input frame.
        cat_cols (list): Names of the categorical columns to encode.

    Returns:
        pd.DataFrame: ``df`` followed by the dummy columns, with every column whose
        values exactly duplicate an earlier column removed (first occurrence kept).
    """
    # Force an integer dtype: pandas >= 2.0 produces boolean dummies by default, and
    # boolean columns are not matched by ``select_dtypes(np.number)``.
    one_hots = pd.get_dummies(df[cat_cols], dtype='uint8')
    df = pd.concat([df, one_hots], axis=1)
    # Transpose so that columns become rows, letting ``duplicated`` compare them by value.
    return df.loc[:, ~df.T.duplicated(keep='first')]

def prep_one_hots(train, test, cat_cols):
    """One-hot encode ``cat_cols`` consistently across the train and test frames.

    Both frames are stacked before encoding so that they end up with the same
    dummy columns, then split again at the original train length.

    Returns:
        tuple: (train part, test part); the test part gets a fresh 0-based index.
    """
    n_train = len(train)
    stacked = pd.concat([train, test]).reset_index(drop=True)
    encoded = get_one_hots(stacked, cat_cols)
    train_part = encoded[:n_train]
    test_part = encoded[n_train:].reset_index(drop=True)
    return train_part, test_part

# Fraction of missing values per column of the raw training frame.
percent_missing = train.isnull().sum() / len(train)
# Columns that are more than half empty. NOTE(review): this list is only printed
# below; nothing in this cell actually drops these columns — confirm that is intended.
drop_cols = percent_missing[percent_missing > 0.5].index.to_list()

print(f'\nColumns with > 0.5 values missing: {drop_cols}\n')

# Numeric columns of the raw training frame, captured before one-hot encoding.
num_cols = list(train.select_dtypes(np.number).columns)

# Everything that is not numeric is treated as a categorical feature.
cat_feats = list(train.select_dtypes(exclude=np.number).columns)

# Encode train and test together so both receive the same dummy columns.
train,test = prep_one_hots(train, test, cat_feats)

# Keep numeric columns only, which removes the original categorical columns.
# NOTE(review): the dummy columns survive this filter only if get_dummies produced
# a numeric dtype (newer pandas versions default to bool) — confirm.
train = train.select_dtypes(np.number)
test = test.select_dtypes(np.number)

# Save the column names for the one hot cols for later
one_hot_cols = train.columns[~train.columns.isin(num_cols)]

train.info()
test.info()

In [None]:
# Display the training frame as left by the encoding / numeric-selection cell above.
train

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

class CustomScaler:
    """Fit a scikit-learn style scaler on one frame and reuse it on another.

    Args:
        inp_df (pd.DataFrame): Frame the scaler is fitted on.
        predictors (list): Columns passed to the scaler.
        target (str): Column copied, unscaled, from ``inp_df`` into the scaled frame.
    """

    def __init__(self, inp_df, predictors, target):
        self._inp_df = inp_df
        self._predictors = predictors
        self._target = target

    def _as_frame(self, values, index):
        """Wrap scaler output in a DataFrame that keeps the source frame's row index."""
        return pd.DataFrame(
            values,
            columns=self.scaler_obj.get_feature_names_out(),
            index=index,
        )

    def set_scaled_df(self, scaler):
        """Instantiate ``scaler`` (a class, not an instance), fit it and store the scaled frame.

        The target column is written last, so when the target is also listed in
        ``predictors`` its scaled values are replaced by the original ones.
        """
        self.scaler_obj = scaler()
        scaled_values = self.scaler_obj.fit_transform(self._inp_df[self._predictors])
        # Keep the input index: the target assignment below aligns on index labels,
        # so a fresh 0..n-1 index would fill the target with NaN for any other index.
        scaled_df = self._as_frame(scaled_values, self._inp_df.index)
        scaled_df[self._target] = self._inp_df[self._target]
        self._scaled_df = scaled_df

    def get_scaled_df(self):
        """Return the frame produced by :meth:`set_scaled_df`."""
        return self._scaled_df

    def scaled_test_df(self, test_df):
        """Transform ``test_df[predictors]`` with the already fitted scaler and store it."""
        scaled_values = self.scaler_obj.transform(test_df[self._predictors])
        self._scaled_test_df = self._as_frame(scaled_values, test_df.index)

    def get_scaled_test_df(self):
        """Return the frame produced by :meth:`scaled_test_df`."""
        return self._scaled_test_df
    

# Standardise the numeric columns. The scaler is fitted on the training frame only;
# the fitted scaler is reused for the test frame further down.
cs = CustomScaler(train, num_cols, target)
cs.set_scaled_df(StandardScaler)

# Put the (unscaled) one-hot columns next to the scaled numeric columns.
scaled_train = cs.get_scaled_df()
scaled_train = pd.concat([scaled_train,train[one_hot_cols]], axis=1)

# Test Set scaling
# set a dummy Target for input into the Scaler
# NOTE(review): the dummy/drop steps imply that num_cols contains the target column,
# i.e. the scaler is fitted on the target as well — confirm this is intended.
test[target] = 0

# scale the test set
cs.scaled_test_df(test)
scaled_test = cs.get_scaled_test_df()

# drop the dummy Target
scaled_test = scaled_test.drop(target, axis=1)

# combine the hot columns to the scaled test
scaled_test = pd.concat([scaled_test,test[one_hot_cols]], axis=1)
scaled_test

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF 


def nan_regression(df: pd.DataFrame, predictors: list, mvc: str, thresh: float) -> pd.Series:
    """
    Fill the missing values of one column with predictions from a Ridge regression.

    The model is fitted on the rows where ``mvc`` is present. Its predictions are
    only used when the R^2 on those same rows exceeds ``thresh``.

    Args:
        df (pd.DataFrame): Input dataframe.
        predictors (list): List of predictor variable column names.
        mvc (str): Column name with missing values to be filled.
        thresh (float): Minimum training R^2 required before predictions are used.

    Returns:
        pd.Series: ``df[mvc]`` with missing values filled when the score is high
        enough, otherwise ``df[mvc]`` unchanged.
    """
    missing = df[mvc].isnull()
    if not missing.any():
        # Nothing to impute; also avoids asking the model to predict on zero rows.
        return df[mvc]

    cols = predictors + [mvc]
    # Explicit copy: this frame is written to below, and writing into a filtered
    # slice of ``df`` would be a chained assignment.
    has_nans = df.loc[missing, cols].copy()
    no_nans = df.loc[~missing, cols]
    X = no_nans[predictors]
    y = no_nans[mvc]

    lr = Ridge()
    lr.fit(X, y)
    train_score = lr.score(X, y)
    print(f'{mvc} training score: {train_score :.3f}\n')

    if train_score > thresh:
        pred = lr.predict(has_nans[predictors])
        # Negative predictions are replaced by the observed mean, then everything is
        # rounded to whole numbers.
        # NOTE(review): that suits non-negative count-like columns; confirm it is
        # still appropriate when the frame passed in has already been standardised.
        has_nans[mvc] = np.round(np.where(pred < 0, np.mean(y), pred))
        return pd.concat([no_nans[mvc], has_nans[mvc]], axis=0).sort_index()

    print(f'Score for {mvc} too low for filling missing data')
    return df[mvc]
    
    
def clean_data(train, test, target):
    """Impute numeric gaps by regression, then drop every column that still has NaNs.

    Train and test are stacked so both are cleaned identically. Each numeric column
    with missing values (other than the target) is passed to ``nan_regression`` with
    the complete numeric columns as predictors. Any column that still contains a
    missing value afterwards is removed; this includes the target whenever it is
    absent or NaN for the test rows.

    Args:
        train (pd.DataFrame): Training frame.
        test (pd.DataFrame): Test frame.
        target (str): Target column name; columns whose name contains it are never imputed.

    Returns:
        tuple: (clean train, clean test). The train part is an independent copy so
        callers can add columns to it; the test part has a fresh 0-based index.
    """
    split_idx = len(train)
    df = pd.concat([train, test]).reset_index(drop=True)

    # Columns with at least one missing value, excluding anything named after the target.
    missing_val_cols = [col for col in df.columns[df.isnull().any()] if target not in col]
    # NOTE(review): _get_numeric_data is a private pandas helper, kept here so the set
    # of columns treated as numeric stays exactly what it was.
    num_cols = df._get_numeric_data().columns
    no_missing_val_cols = list(num_cols[(~df[num_cols].isnull().any())])

    for mvc in missing_val_cols:
        if not np.issubdtype(df[mvc].dtype, np.number):
            # Non-numeric gaps are not imputed; the column is dropped below.
            continue
        predictors = [col for col in no_missing_val_cols if mvc not in col]
        replacement = nan_regression(df, predictors, mvc, 0.5)
        df[mvc] = np.nan if replacement is None else replacement

    df = df.dropna(axis=1)

    if df[[col for col in df.columns if target not in col]].isna().sum().sum() > 0:
        print('\nMore cleaning to do')
        print(df.isna().sum())
    else:
        print('\nCleaning complete')

    # Copy the train part: returning a bare row slice makes later column assignments
    # on it raise SettingWithCopyWarning.
    return df[:split_idx].copy(), df[split_idx:].reset_index(drop=True)


# Impute / drop columns with missing values on the scaled train and test frames.
scaled_clean_train, scaled_clean_test = clean_data(scaled_train, scaled_test, target)
## scaled_clean_train = scaled_clean_train[(scaled_clean_train.abs() < 3) | (scaled_clean_train.columns == target)].dropna()

# Re-attach the unscaled target from the training frame (clean_data drops every
# column that has a missing value in the stacked train+test frame).
scaled_clean_train[target] = train[target]

# Numeric feature names without the target.
# NOTE(review): `preds` is rebound to model predictions in a later cell before this
# list is read anywhere in the visible notebook.
preds = list(scaled_clean_train.select_dtypes(np.number).columns) 
preds = [col for col in preds if target not in col]

# drop Id column
if 'Id' in scaled_clean_train.columns:
    scaled_clean_train = scaled_clean_train.drop('Id', axis=1)
if 'Id' in scaled_clean_test.columns:
    scaled_clean_test = scaled_clean_test.drop('Id', axis=1)
    

In [None]:
# Missing-value count per column of the cleaned test frame, largest first.
scaled_clean_test.isna().sum().sort_values(ascending=False)

In [None]:
# Pairwise correlations between all features (target excluded).
all_feats = [col for col in scaled_clean_train.columns if target not in col]
corr_df = scaled_clean_train[all_feats].corr()

corr_threshold = 0.6
size = 50

# Keep only strong, non-trivial correlations: |r| above the threshold but below 1;
# rows and columns left without any such value are removed from the plot.
abs_corr = corr_df.abs()
strong_corr = corr_df[(abs_corr > corr_threshold) & (abs_corr < 1)]
strong_corr = strong_corr.dropna(how='all', axis=1).dropna(how='all', axis=0)

plt.figure(figsize=(size, size))
sns.heatmap(strong_corr, annot=True);

In [None]:
def remove_collinear_features(df_model, target_var, threshold=0.2, verbose=0):
    '''
    Objective:
        For every pair of features whose absolute correlation is at least the
        threshold, drop the one whose correlation with the target (dependent)
        variable is lower. Removing collinear features can help a model
        to generalize and improves the interpretability of the model.

    Inputs:
        df_model: dataframe holding the features and the target column
        target_var: name of the target (dependent) variable column
        threshold: pairs with |correlation| >= this value lose one feature
        verbose: truthy value enables log printing

    Output:
        copy of df_model without the dropped features (the target is kept)
    '''

    # Correlation matrix of the features only. ``columns=`` is used because the
    # positional axis argument of DataFrame.drop was removed in pandas 2.0.
    corr_matrix = df_model.drop(columns=target_var).corr()
    features = corr_matrix.columns
    drop_cols = []
    target_corr_cache = {}

    def target_corr(name):
        # Correlation of one feature with the target, computed once per feature.
        if name not in target_corr_cache:
            target_corr_cache[name] = df_model[name].corr(df_model[target_var])
        return target_corr_cache[name]

    # Visit each unordered pair once: column i+1 against rows 0..i.
    for i in range(len(features) - 1):
        col_name = features[i + 1]
        for j in range(i + 1):
            row_name = features[j]
            val = abs(corr_matrix.iloc[j, i + 1])

            # NaN correlations compare False and are skipped.
            if val >= threshold:
                if verbose:
                    print(f"{col_name} | {row_name} | {round(val, 2)}")
                col_value_corr = target_corr(col_name)
                row_value_corr = target_corr(row_name)
                if verbose:
                    print(f"{col_name}: {np.round(col_value_corr, 3)}")
                    print(f"{row_name}: {np.round(row_value_corr, 3)}")
                # NOTE(review): this compares signed correlations, so a strongly
                # negative relation with the target counts as "lower" — confirm
                # whether absolute values were intended.
                if col_value_corr < row_value_corr:
                    dropped = col_name
                else:
                    dropped = row_name
                drop_cols.append(dropped)
                if verbose:
                    print(f"dropped: {dropped}")
                    print("-----------------------------------------------------------------------------")

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    df_model = df_model.drop(columns=list(drops))
    if verbose:
        print("dropped columns: ")
        print(list(drops))
        print("-----------------------------------------------------------------------------")
        print("used columns: ")
        print(df_model.columns.tolist())

    return df_model

### Remove correlated features: one feature of every pair with |corr| >= 0.6 is dropped
# (the value passed below; an earlier version of this note said 0.7)
# Dormann, C. F., J. Elith, S. Bacher, et al. 2013. Collinearity
scaled_clean_train = remove_collinear_features(scaled_clean_train, target, 0.6)
scaled_clean_train

In [None]:
from sklearn.model_selection import train_test_split

# training data split percentage OOS to IS
train_split_pct_OOS = 0.3

# split the training data into IS and OOS
# A fixed random_state keeps the split (and every metric derived from it) identical
# across kernel restarts.
train_IS, train_OOS = train_test_split(
    scaled_clean_train,
    test_size=train_split_pct_OOS,
    shuffle=True,
    random_state=123,
)


In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the in-sample (IS) rows.
model = LinearRegression()
model.fit(train_IS.drop(target, axis=1), train_IS[target])

# Predict the target for the out-of-sample (OOS) rows.
predictions = model.predict(train_OOS.drop(target, axis=1))

# Predictions whose magnitude exceeds one million are treated as broken and set to 0.
predictions[np.abs(predictions) > 1000000] = 0

oos_actual = train_OOS[target]

# Actual vs. predicted, followed by the distribution of the errors.
plt.scatter(oos_actual, predictions)
plt.show()
plt.hist(oos_actual - predictions)
plt.show()

from sklearn import metrics

mae = metrics.mean_absolute_error(oos_actual, predictions)
mse = metrics.mean_squared_error(oos_actual, predictions)

print('Mean Absolute Error: ' + str(mae))

print('Mean Squared Error: ' + str(mse))

print('Root Mean Squared Error: ' + str(np.sqrt(mse)))

In [None]:
import statsmodels.api as sm

def get_lin_reg(X, y):
    """
    Fit an ordinary least squares (OLS) regression of ``y`` on ``X`` plus an intercept.

    Parameters:
        X (array-like): Input feature matrix.
        y (array-like): Target variable.

    Returns:
        statsmodels.regression.linear_model.RegressionResultsWrapper: Fitted results;
        rows containing missing values are dropped before fitting.
    """
    design = sm.add_constant(X)  # intercept column
    return sm.OLS(y, design, missing='drop').fit()


# Backward elimination: repeatedly refit OLS and drop the feature with the largest
# p-value until every remaining feature is significant at `p_thresh`.
X = scaled_clean_train.drop(target, axis=1)
y = scaled_clean_train[target]

p_thresh = 0.01
verbose = 1

start_r2 = get_lin_reg(X, y).rsquared_adj

print(f'Starting | n_features: {len(X.columns)} | adj_r2: {start_r2 : .4f}')
while True:
    results = get_lin_reg(X, y)
    features = list(X.columns)

    # Only real features are candidates for removal. The intercept is excluded from
    # the ranking, so an insignificant intercept can no longer cause a significant
    # feature to be dropped in its place.
    feature_pvalues = results.pvalues.drop('const', errors='ignore')
    max_p = feature_pvalues.max()

    # `not (>)` also stops cleanly when no features are left or max_p is NaN.
    if not (max_p > p_thresh):
        break

    drop = feature_pvalues.idxmax()
    X = X.drop(drop, axis=1)

    if verbose:
        print(f'Dropping: {drop} {max_p : .3f}')

print(f'Finished | n_features: {len(features)} | adj_r2: {results.rsquared_adj : .4f}\n')
print(results.summary())
final_features = X.columns

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns


class CustomRegression:
    """Fit a scikit-learn style regressor and collect OLS-style diagnostics.

    The instance works on a private copy of the data it is given; the columns
    ``y_pred`` and ``resids`` are added to that copy, never to the caller's frame.

    Args:
        data (pd.DataFrame): Frame holding predictors and target.
        predictors (list-like): Predictor column names.
        target (str): Target column name.
    """

    def __init__(self, data, predictors, target):
        self.set_data(data, predictors, target)

    def set_data(self, data, predictors, target):
        """Replace the working data set (used for fitting and for later prediction)."""
        # Private copy: set_resids() adds columns, which must not leak into `data`.
        self._data = data.copy()
        self._X = self._data[predictors]
        self._y = self._data[target]
        self._predictors = list(predictors)
        self._target = target
        self._n = len(self._data)
        self._p = len(self._predictors)

    def fit_reg(self, reg):
        """Fit ``reg`` (an estimator with fit/predict/score, intercept_ and coef_) and compute all diagnostics."""
        self._model = reg
        self._model.fit(self._X, self._y)
        self.results(self._X, self._y)

    def predict(self, data, predictors, target):
        """Score the already fitted model on a new frame and recompute all diagnostics for it."""
        self.set_data(data, predictors, target)
        self.results(self._X, self._y)

    def results(self, X, y):
        """Compute R^2, residuals, coefficient table, SSR, log-likelihood and residual checks."""
        self._r2 = self._model.score(X, y)
        self.set_adj_r2()
        self.set_resids()
        self.set_results_df(X, y)
        # set_fstat() is available but not run by default, as before.
        self._ssr = np.sum(self.get_resids()**2)
        self.set_log_likelihood()
        self.resids_assumptions()

    def get_data(self):
        """Return the working frame, including the ``y_pred`` and ``resids`` columns."""
        return self._data

    def get_r2(self):
        return self._r2

    def set_adj_r2(self):
        """Adjusted R^2 for n observations and p predictors."""
        self._adj_r2 = 1 - (1 - self._r2)*(self._n - 1) / (self._n - self._p - 1)

    def set_resids(self):
        """Store predictions and residuals (observed minus predicted) on the working frame."""
        self._data['y_pred'] = self._model.predict(self._X)
        # Plain signed difference; taking absolute values first gives wrong residuals
        # whenever the observed and predicted values differ in sign.
        self._data['resids'] = self._data[self._target] - self._data['y_pred']

    def get_resids(self):
        return self._data['resids'].values

    def set_results_df(self, X, y):
        """Build the coefficient table: estimate, standard error, t value, p value, ~95% interval."""
        from scipy.stats import t

        feature_names = ['intercept'] + self._predictors

        params = np.append(self._model.intercept_, self._model.coef_)
        predictions = self._model.predict(X)

        # Design matrix with a leading column of ones for the intercept.
        X2 = np.append(np.ones((len(X), 1)), X, axis=1)
        dof = len(X2) - len(X2[0])
        MSE = (np.sum((y - predictions)**2)) / dof

        # pinv keeps this usable when X'X is singular.
        var_b = MSE * (np.linalg.pinv(np.dot(X2.T, X2)).diagonal())
        sd_b = np.sqrt(var_b)
        ts_b = params / sd_b

        # Two-sided p value from the survival function.
        p_values = 2 * t.sf(np.abs(ts_b), dof)

        results_df = pd.DataFrame(
            {
                'coef': np.round(params, 4),
                'std_err': np.round(sd_b, 3),
                't_value': np.round(ts_b, 3),
                'p_value': np.round(p_values, 3),
            },
            index=feature_names,
        )
        # Interval approximated as estimate +/- 2 standard errors.
        results_df['0.025'] = results_df["coef"] - (2 * results_df["std_err"])
        results_df['0.975'] = results_df["coef"] + (2 * results_df["std_err"])
        self._results_df = results_df

    def get_results_df(self, digits=4):
        return self._results_df.round(digits)

    def set_fstat(self):
        """Overall F statistic of the regression and its p value.

        Uses F(p, n - p - 1), the reference distribution for
        (R^2 / p) / ((1 - R^2) / (n - p - 1)).
        """
        from scipy.stats import f as f_dist

        dof_resid = self._n - self._p - 1
        self._fstat = (self._r2 / (1 - self._r2)) * (dof_resid / self._p)
        self._fstat_pval = f_dist.sf(self._fstat, self._p, dof_resid)

    def get_fstat(self):
        return self._fstat

    def summary(self, digits=4):
        """Display the metric, coefficient and error tables (relies on the notebook's ``display``)."""
        display(pd.DataFrame([self.reg_metrics(digits)], index=['metrics']))
        display(self.get_results_df(digits))
        display(pd.DataFrame([self.error_metrics(digits)], index=['errors']))

    ### Assumptions
    def resids_normal(self, alpha=0.05):
        """Anderson-Darling normality test on the residuals; True when p > ``alpha``."""
        from statsmodels.stats.diagnostic import normal_ad

        self._ad_crit_val, self._ad_p_value = normal_ad(self._data['resids'])
        return bool(self._ad_p_value > alpha)

    def resids_autocorr(self):
        """Durbin-Watson check; True when the statistic falls outside the (1.5, 2.5) band."""
        from statsmodels.stats.stattools import durbin_watson

        self._durbinWatson = durbin_watson(self._data['resids'])
        return not (1.5 < self._durbinWatson < 2.5)

    def resids_assumptions(self):
        """Run and print both residual checks; True when residuals look normal and not autocorrelated."""
        normal_assumption = self.resids_normal()
        autocorr_assumption = self.resids_autocorr()
        print(f'residuals normally distributed: {normal_assumption}')
        print(f'residuals autocorrelated:       {autocorr_assumption}')
        return bool(normal_assumption and not autocorr_assumption)

    def reg_metrics(self, digits=4):
        """Fit statistics as a dict, rounded to ``digits``."""
        statistics = {
            'n_obs' : self._n,
            'n_pred' : self._p,
            'ssr' : round(self._ssr, digits),
            'log_likelihood' : round(self._ll, digits),

            'r2' : round(self._r2, digits),
            'adj_r2' : round(self._adj_r2, digits),

            'ad' : round(self._ad_crit_val, digits),
            'P(ad)' : round(self._ad_p_value, digits),
            'durbin_watson' : round(self._durbinWatson, digits),
        }
        return statistics

    def error_metrics(self, digits=4):
        """Error measures of ``y_pred`` against the target as a dict, rounded to ``digits``."""
        from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error

        self.set_AIC_BIC()

        y_true = self._y
        y_pred = self._data['y_pred']

        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        mape = mean_absolute_percentage_error(y_true, y_pred)
        medae = median_absolute_error(y_true, y_pred)
        # The log error is only defined for strictly positive values; report NaN
        # otherwise instead of failing on the NaNs that np.log would produce.
        if (np.asarray(y_true) > 0).all() and (np.asarray(y_pred) > 0).all():
            log_rmse = np.sqrt(mean_squared_error(np.log(y_true), np.log(y_pred)))
        else:
            log_rmse = np.nan

        self._error_metrics = {
            'MSE': round(mse, digits),
            'RMSE': round(rmse, digits),
            'Log_RMSE' : round(log_rmse, digits),
            'MAE': round(mae, digits),
            'MedAE' : round(medae, digits),
            'MAPE': round(mape, digits),
            'AIC' : round(self._AIC, digits),
            'BIC' : round(self._BIC, digits),
        }
        return self._error_metrics

    def set_log_likelihood(self):
        """Gaussian log-likelihood evaluated from the residual sum of squares."""
        n = self._n
        residuals = self._data['resids']
        self._ll = -(n * 1/2) * (1 + np.log(2 * np.pi)) - (n / 2) * np.log(residuals.dot(residuals) / n)

    def get_log_likelihood(self):
        return self._ll

    def set_AIC_BIC(self):
        """AIC and BIC with k = number of predictors + 1 (intercept)."""
        ll = self.get_log_likelihood()
        n = self._n
        k = self._p + 1

        self._AIC = (-2 * ll) + (2 * k)
        self._BIC = (-2 * ll) + (k * np.log(n))

    ### Plotting
    def plot_resid_hist(self):
        """Histogram (with KDE) of the residuals."""
        sns.displot(self._data['resids'], kde=1).set(title='Residuals')
        plt.show()

    def plot_residuals(self, std=True):
        """Plot residuals against fitted value and against row index.

        With ``std=True`` the residuals are standardised and guide lines are drawn
        at 0 and +/-3. Returns the plotted residuals whose absolute value exceeds 3.
        """
        fig, ax = plt.subplots(2, 1, figsize=(14, 7))
        if std:
            resids = (self._data['resids'] - np.mean(self._data['resids'])) / np.std(self._data['resids'], ddof=1)
        else:
            resids = self._data['resids']
        ax[0].scatter(self._data['y_pred'], resids, alpha=0.75)
        ax[0].set_xlabel('Fitted value')
        ax[0].set_ylabel('Residual')
        ax[0].set_title('Residuals plot')

        ax[1].scatter(self._data.index, resids, alpha=0.75)
        ax[1].set_xlabel('Index')
        ax[1].set_ylabel('Residual')

        if std:
            for i in range(2):
                ax[i].axhline(0, c='k', ls='--')
                ax[i].axhline(3, c='r', ls='--')
                ax[i].axhline(-3, c='r', ls='--')
        plt.show()
        return resids[np.abs(resids) > 3]

    def plot_scatter(self):
        """Joint plot of predicted against observed target values."""
        sns.jointplot(data=self._data, x='y_pred', y=self._target)
        plt.show()

In [None]:
# Disabled experiment: the whole cell is one string literal, so none of it runs.
# It would fit a Lasso through CustomRegression on the IS split and show the diagnostics.
'''scaled_clean_train[target] = train[target]#np.log(scaled_clean_train[target])

cr = CustomRegression(train_IS, X.columns, y.name)
cr.fit_reg(Lasso())
cr.summary()
cr.plot_residuals()
cr.plot_scatter()'''

In [None]:
# Disabled experiment (string literal, not executed): diagnostics of the fitted
# CustomRegression model on the OOS split.
'''cr.predict(train_OOS,X.columns, y.name)
cr.summary()
cr.plot_residuals()
cr.plot_scatter()'''

In [None]:
# Disabled experiment (string literal, not executed): raw predictions of the wrapped
# model on the cleaned test frame.
'''
cr._model.predict(scaled_clean_test[X.columns])

#cr.summary()'''

In [None]:
# Disabled experiment (string literal, not executed): build a submission frame from
# CustomRegression predictions.
# NOTE(review): `cr.get_data()['']` uses an empty column key, which looks like a
# placeholder (the class stores predictions under 'y_pred') — fix before re-enabling.
'''scaled_clean_test[target] = 0
cr.predict(scaled_clean_test, X.columns, y.name)
cr.summary()

y_pred = cr.get_data()['']
y_pred = y_pred.rename(target)
df = test.join(y_pred)
#df[['Id', 'y_pred']]
df[['Id', target]]'''

In [None]:
# Disabled (string literal, not executed): would write the CustomRegression submission.
# NOTE(review): selects 'y_pred', whereas the disabled cell before it renames the
# prediction column to the target name — confirm the column before re-enabling.
'''df[['Id', 'y_pred']].to_csv('submission.csv', index=False)'''

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = scaled_clean_train.drop(target, axis=1)
y = scaled_clean_train[target]

# Split the data. A fixed random_state keeps the train/validation rows, and therefore
# the early-stopping point, identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=123
)

dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "hist"}
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

# Upper bound on boosting rounds; early stopping normally ends training sooner.
n = 10000

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=50,
   # Stop once the last eval set ("validation") has not improved for 100 rounds
   early_stopping_rounds=100
)

In [None]:
# 10-fold cross-validation of the boosted-tree configuration.
# RMSE matches the squared-error regression objective; "error" is the binary
# classification error rate and is meaningless for a price target.
# Early stopping (same patience as the training cell above) avoids running all
# 10000 rounds on every fold.
cv_results = xgb.cv(dtrain=dtrain_reg, params=params,
                  nfold=10, num_boost_round=10000,
                  metrics="rmse", early_stopping_rounds=100,
                  as_pandas=True, seed=123)

cv_results

In [None]:
X = scaled_clean_train.drop(target, axis=1)
y = scaled_clean_train[target]

# Create the training and test sets
# (in this cell the split is only used for X_train.columns; y_test feeds the
#  commented-out metric at the bottom)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor( objective='reg:squarederror', n_estimators=100, seed=123, subsample=0.5)

# Fit the regressor on ALL labelled rows (X, y), not only on the X_train split
xg_reg.fit(X, y)


# Predict the competition test rows (scaled_clean_test), for which no labels exist: preds
preds = xg_reg.predict(scaled_clean_test[X_train.columns])

# Compute the rmse: rmse
# NOTE(review): the disabled lines below compare y_test (hold-out split) with preds
# (competition test rows), i.e. two different sets of rows — rework before re-enabling.
#log_rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(preds)))
#print(f"RMSE: {log_rmse}")

In [None]:

# Predict sale prices for the competition test rows and write the submission file.
submission_features = scaled_clean_test[X_train.columns]
test['SalePrice'] = xg_reg.predict(submission_features)

submission = test[['Id', 'SalePrice']]
submission.to_csv('submission.csv', index=False)

In [None]:

# Sweep the L1 penalty (alpha) of a linear booster and record the log-RMSE on the
# hold-out split for each value.
DM_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
DM_test =  xgb.DMatrix(X_test, y_test, enable_categorical=True)

params = {"booster":"gblinear", "objective":"reg:squarederror"}

# alpha values 20, 40, ..., 1000
alphas = range(20, 1020, 20)
vals = []

for alpha in tqdm(alphas):
    params["alpha"] = alpha
    # NOTE: rebinds xg_reg (previously the fitted XGBRegressor) to a Booster
    xg_reg = xgb.train(dtrain = DM_train, params=params, num_boost_round=1000)
    preds = xg_reg.predict(DM_test)

    # np.log returns NaN for non-positive predictions — TODO confirm the linear
    # booster cannot produce them here, or guard this line.
    log_rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(preds)))
    #print(f"alpha: {alpha} \t| RMSE: {log_rmse : .4f}")
    vals.append([alpha, log_rmse])

In [None]:
# Tabulate the sweep results, indexed by alpha.
rmse_df = pd.DataFrame(vals, columns=['alpha', 'log_rmse'])
rmse_df = rmse_df.set_index('alpha')
# NOTE(review): nlargest lists the ten WORST alphas (highest error); if the goal is
# to pick the best alpha, nsmallest is presumably what was meant — confirm.
rmse_df['log_rmse'].nlargest(10) #[['alpha', 'log_rmse']].plot(secondary_y=['alpha'])

In [None]:
# Perform cross-validation: cv_results
# `params` is whatever the alpha sweep left behind: the gblinear booster with the last
# alpha tried (1000 when the cells are run in order).
cv_results = xgb.cv(dtrain=DM_train, params=params, nfold=10, num_boost_round=10, metrics='rmse', as_pandas=True, seed=123)

# Print cv_results
cv_results

In [None]:
# Sweep the L2 penalty (lambda) of a depth-4 tree booster with 5-fold CV on all
# labelled rows, keeping the mean test RMSE of the final boosting round per value.
dmatrix = xgb.DMatrix(data=X, label=y)

reg_params = [0.01, 0.1, 1, 10, 100, 1000, 10000]

params = {"objective":"reg:squarederror","max_depth":4}
rmses_l2 = []

for reg in tqdm(reg_params):
    params["lambda"] = reg
    cv_results_rmse = xgb.cv(
        dtrain=dmatrix,
        params=params,
        nfold=5,
        num_boost_round=100,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )
    final_round_rmse = cv_results_rmse["test-rmse-mean"].iloc[-1]
    rmses_l2.append(final_round_rmse)

print("Best rmse as a function of l2:")
print(pd.DataFrame({"l2": reg_params, "rmse": rmses_l2}))