In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import median_absolute_error, make_scorer

from sklearn.model_selection import cross_validate, KFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import RidgeCV, PassiveAggressiveRegressor, LinearRegression, TheilSenRegressor, HuberRegressor, RANSACRegressor, Lasso, ElasticNet, Lars, LassoLars, OrthogonalMatchingPursuit, BayesianRidge, ARDRegression, TweedieRegressor, PoissonRegressor, GammaRegressor, LassoLars
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

experiment = 'generate_all_feats_remove_high_corr_feats'

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('training_data.csv')
artificial = pd.read_csv('Artificial_Crystals_Dataset.csv')
mineral = pd.read_csv('Mineral_Dataset_Supplementary_Info.csv')
test = pd.read_csv('validation_data.csv')

train.head()

Unnamed: 0,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average,Hardness
0,100.0,0.841611,10.0,4.8,20.612526,11.0881,2.766,1.732,0.86,0.49607,0.91457,6.0
1,100.0,7.558488,10.0,4.8,20.298893,12.04083,2.755,1.631,0.91,0.492719,0.7176,6.5
2,76.0,8.885992,15.6,5.6,33.739258,12.0863,2.828,1.788,0.864,0.481478,1.50633,2.5
3,100.0,8.795296,10.0,4.8,20.213349,10.9485,2.648,1.626,0.936,0.489272,0.78937,6.0
4,116.0,9.577996,11.6,4.8,24.988133,11.82448,2.766,1.682,0.896,0.492736,1.86481,6.0


In [3]:
# Rename hardness in artifical dataset and drop columns not required
artificial.rename(columns={'Hardness (Mohs)': 'Hardness'}, inplace=True)
artificial.drop(['Formula', 'Crystal structure', 'Unnamed: 0'], axis=1, inplace=True)

artificial.head()

Unnamed: 0,Hardness,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average
0,4.5,167.0,23.907992,18.555556,5.0,41.609136,11.693844,2.938889,1.711111,0.884444,0.47783,2.656444
1,4.0,14.0,1.740168,4.666667,1.333333,8.773227,11.614333,1.903333,1.31,0.68,0.82599,0.580056
2,2.5,102.0,8.511159,4.434783,3.304348,8.440584,13.176622,2.672609,1.37913,0.53087,0.71385,0.37005
3,5.5,78.0,8.109328,13.0,5.333333,27.448814,11.8264,2.96,1.625,0.813333,0.488163,1.351555
4,6.5,164.0,19.921324,14.909091,5.090909,32.012361,11.255573,2.881818,1.640909,0.841818,0.48348,1.811029


In [4]:
# Drop unneeded columns
mineral.drop('Unnamed: 0', axis=1, inplace=True)

mineral.head()

Unnamed: 0,Hardness,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average
0,2.3,110.0,23.0,36.666667,2.666667,82.598467,8.504133,2.146667,2.006667,1.253333,0.456803,7.666667
1,5.5,406.0,30.472136,9.902439,4.682927,19.81318,11.456151,2.700244,1.676829,0.868293,0.522909,0.743223
2,5.5,406.0,30.472464,10.410256,4.923077,20.931371,11.541405,2.75359,1.703846,0.894359,0.497498,0.781345
3,5.5,476.0,61.142136,11.609756,4.682927,23.659644,11.487395,2.763659,1.714634,0.84878,0.519474,1.491272
4,5.5,476.0,61.142464,12.205128,4.923077,24.975089,11.574251,2.820256,1.74359,0.873846,0.493887,1.567755


In [5]:
train.shape, artificial.shape, mineral.shape

((9368, 12), (52, 12), (622, 12))

In [6]:
# train = pd.concat([train, artificial, mineral], axis=0)
# train.shape

In [7]:
# Constants
TARGET = 'Hardness'

In [8]:
# Selecting numerical variables from the train dataset, excluding 'id' and TARGET
num_var = train.drop([TARGET], axis=1).select_dtypes(include=np.number).columns
print(num_var)

Index(['allelectrons_Total', 'density_Total', 'allelectrons_Average',
       'val_e_Average', 'atomicweight_Average', 'ionenergy_Average',
       'el_neg_chi_Average', 'R_vdw_element_Average', 'R_cov_element_Average',
       'zaratio_Average', 'density_Average'],
      dtype='object')


In [9]:
len(num_var)

11

In [10]:
# # Combining train and test datasets for comparative analysis
# # 'Source' column is added to label data from each dataset
# df = pd.concat([
#     train[num_var].assign(Source='Train'), 
#     test[num_var].assign(Source='Test')
# ], axis=0, ignore_index=True)

In [11]:
# Check for outliers
def outlier_thresholds(dataframe, col_name, q1=0.01, q3=0.99):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    IQR = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * IQR
    low_limit = quartile1 - 1.5 * IQR
    print(f'Feature name: {col_name}')
    print(f'Low Limit: {low_limit}')
    print(f'Upper Limit: {up_limit}')
    print()
    return low_limit, up_limit


def remove_outlier(dataframe, col_name):
    """
    Example Usage:
    for col in num_cols:
    new_df = remove_outlier(titanic, col)
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    df_without_outliers = dataframe[~((dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit))]    
    return df_without_outliers

def cap_outliers(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)

    dataframe[col_name] = np.where(dataframe[col_name] > up_limit, up_limit, 
                                   np.where(dataframe[col_name] < low_limit, low_limit, dataframe[col_name]))
    return dataframe

df_outliers = train.copy()
df_outliers_test = test.copy()

for col in num_var:
    df_outliers = remove_outlier(df_outliers, col)

df_outliers.columns

In [12]:
train = df_outliers.copy()

In [13]:
scaler = StandardScaler()

standard_scaler = scaler.fit_transform(train[num_var])
train_scaled = pd.DataFrame(standard_scaler, columns=num_var)

test_scaled = scaler.transform(test[num_var])
test_scaled = pd.DataFrame(test_scaled, columns=num_var)
train_scaled.head()

In [14]:
# test_scaled.head()

# FEATURE ENGINEERING

In [15]:
epsilon = 1e-10

def get_hist_cum(df, feature):
	value_count = df[feature].value_counts()
	value_count = value_count.reset_index()
	value_count.sort_values(by='index', inplace=True, axis=0)
	value_count[f'cum_count_{feature}'] = value_count.cumsum()
	return value_count[['index', f'cum_count_{feature}']]

def feat_engineering(df, numerical_features):
    df_new = df.copy()

    # Transformation of numerical features
    for feature in numerical_features:
        # Log transformation (add small value to avoid log(0))
        df_new[f"{feature}_log"] = np.log(df[feature] + 1e-10)

        # Square root transformation
        df_new[f"{feature}_sqrt"] = np.sqrt(df[feature])

        # Polynomial transformations
        df_new[f"{feature}_degree2"] = df[feature]**2
        df_new[f"{feature}_degree3"] = df[feature]**3
        
        # Value count as a feature (Count encoding)
        value_counts = df[feature].value_counts()
        df_new[f"{feature}_counts"] = df[feature].map(value_counts)
        
        # Division of feature and its corresponding value count (both ways)
        # which disperse the distribution of different values with the same counts.
        df_new[f'{feature}_count_ratio'] = df_new[feature] / df_new[f"{feature}_counts"]
        df_new[f'{feature}_count_ratio'] = df_new[f"{feature}_counts"] / df_new[feature]
        
        # Multiplication of feature and its corresponding value count
        # which disperse the distribution of different values with the same counts.
        df_new[f'{feature}_count_ratio'] = df_new[feature] * df_new[f"{feature}_counts"]
        
        # # Cumulative Histogram Feature
        # cumsum = df[[feature]].merge(get_hist_cum(df, feature), left_on=[feature], right_on=['index'], how='left')
        # df_new[f"{feature}_cumcum_ratio"] = np.log(df[f"{feature}_counts"]) / (cumsum[f'cum_count_{feature}'] / len(df))
        # df_new[f"{feature}_cumsum_ratio_sub"] = np.log(df[f"{feature}_counts"]) / (1 - cumsum[f'cum_count_{feature}'] / len(df))
        
        # Distance Ratio (mean, max_dist, min_dist, median_dist, mode_dist)
        mean_dist = np.abs(df[feature] - df[feature].mean())
        max_dist = np.abs(df[feature] - df[feature].max())
        min_dist = np.abs(df[feature] - df[feature].min())
        median_dist = np.abs(df[feature] - df[feature].median())
        mode_dist = np.abs(df[feature] - df[feature].mode())
        
        df_new[f"{feature}_mean_ratio"] = np.log(df_new[f"{feature}_counts"]) / (mean_dist + epsilon)
        df_new[f"{feature}_max_ratio"] = np.log(df_new[f"{feature}_counts"]) / (max_dist + epsilon)
        df_new[f"{feature}_min_ratio"] = np.log(df_new[f"{feature}_counts"]) / (min_dist + epsilon)
        df_new[f"{feature}_median_ratio"] = np.log(df_new[f"{feature}_counts"]) / (median_dist + epsilon)
        # df_new[f"{feature}_mode_ratio"] = np.log(df_new[f"{feature}_counts"]) / (mode_dist + epsilon)

    # Pairwise numerical feature operations
    for i in range(len(numerical_features)):
        for j in range(i, len(numerical_features)):
            if i != j:
                feat1 = numerical_features[i]
                feat2 = numerical_features[j]

                # Multiplication
                df_new[f"{feat1}_x_{feat2}"] = df[feat1] * df[feat2]

                # Addition
                df_new[f"{feat1}_plus_{feat2}"] = df[feat1] + df[feat2]

                # Subtraction (both ways)
                df_new[f"{feat1}_minus_{feat2}"] = df[feat1] - df[feat2]
                df_new[f"{feat2}_minus_{feat1}"] = df[feat2] - df[feat1]

                # Ratio (both ways)
                df_new[f"{feat1}_div_{feat2}"] = df[feat1] / (df[feat2] + epsilon)
                df_new[f"{feat2}_div_{feat1}"] = df[feat2] / (df[feat1] + epsilon)

                # Mean
                df_new[f"Mean_{feat1}_{feat2}"] = df[[feat1, feat2]].mean(axis=1)

                # Standard Deviation
                df_new[f"Std_{feat1}_{feat2}"] = df[[feat1, feat2]].std(axis=1)

                # Percentage Sum (both ways)
                df_new[f"percent_sum_{feat1}_{feat2}"] = (df[feat1] / (df[feat1] + df[feat2] + epsilon)) * 100
                df_new[f"percent_sum_{feat2}_{feat1}"] = (df[feat2] / (df[feat1] + df[feat2] + epsilon)) * 100
                
                # Compare value of features
                df_new[f"{feat1}_gt_{feat2}"] = (df[feat1] > df[feat2]).astype(int)


    return df_new

In [16]:
train = feat_engineering(train, num_var)
test = feat_engineering(test, num_var)
train.shape

(9368, 727)

In [20]:
# Remove correlated features (leaving just 1 of each pair)
correlation_matrix = train.corr().abs()

# Select upper triangle of correlation matrix
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than a threshold (e.g., 0.9 in this case)
to_drop = [column for column in upper.columns if any(upper[column] >= 0.9)]

# Drop features
df_reduced = train.drop(to_drop, axis=1)

print(f"Dropped {len(to_drop)} highly correlated features. New shape of the dataset is {df_reduced.shape}")

Dropped 536 highly correlated features. New shape of the dataset is (9368, 191)


In [21]:
train = df_reduced.copy()

In [27]:
# Selecting numerical variables from the train dataset, excluding 'id' and TARGET
num_var = train.drop([TARGET], axis=1).select_dtypes(include=np.number).columns
len(num_var)

190

In [28]:
other_cols = [col for col in train.columns if col not in num_var]
other_cols

['Hardness']

In [29]:
train.columns[train.isna().any()].tolist()

[]

# MODEL CROSS-VALIDATION

In [32]:
MLA = [
	# Trial Models
	MLPRegressor(random_state=5),
	# TheilSenRegressor(random_state=5),
	HuberRegressor(),
	RANSACRegressor(random_state=5),
	Lasso(random_state=5),
	ElasticNet(random_state=5),
	Lars(random_state=5),
	LassoLars(random_state=5),
	OrthogonalMatchingPursuit(),
	BayesianRidge(),
	ARDRegression(),
    TweedieRegressor(power=1.5, alpha=0.5),
    PoissonRegressor(alpha=0.5),
    GammaRegressor(alpha=0.5),
    LassoLars(alpha=0.1, random_state=5),

	# GLM
	LinearRegression(),
	PassiveAggressiveRegressor(random_state=5),
	RidgeCV(),

	# Trees    
	DecisionTreeRegressor(random_state=5),
	ExtraTreeRegressor(random_state=5),

	XGBRegressor(random_state=5),
	LGBMRegressor(n_jobs=-1, random_state=5),
	CatBoostRegressor(random_state=5, verbose=False, early_stopping_rounds=100),
	
	# KNeighbors
	KNeighborsRegressor(),
	KNeighborsRegressor(n_neighbors=2),
	KNeighborsRegressor(n_neighbors=4),
	KNeighborsRegressor(n_neighbors=8),
	KNeighborsRegressor(n_neighbors=16),
	KNeighborsRegressor(n_neighbors=32),
	KNeighborsRegressor(n_neighbors=64),
	KNeighborsRegressor(n_neighbors=128),
	KNeighborsRegressor(n_neighbors=256),
	KNeighborsRegressor(n_neighbors=512),
	KNeighborsRegressor(n_neighbors=1024),

	# Ensemble Methods
	AdaBoostRegressor(random_state=5),
	BaggingRegressor(random_state=5),
	ExtraTreesRegressor(random_state=5),
	GradientBoostingRegressor(random_state=5),
	HistGradientBoostingRegressor(random_state=5),
	RandomForestRegressor(random_state=5),
    ]


# split dataset in cross-validation with splitter class
# cv_split could KFold, StratifiedKFold or RepeatedKFold depending on the problem
cv_split = KFold(n_splits=10, shuffle=True, random_state=5)
cv_split_trial = KFold(n_splits=3, shuffle=True, random_state=5) # For quick trials

In [33]:
# # create table to compare MLA metrics
# MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD', 'MLA Time']
# MLA_compare = pd.DataFrame(columns = MLA_columns)

# # create table to compare MLA predictions
# MLA_predict = {}

# # index through MLA and save performance to table
# row_index = 0
# scoring = median_abs_error_scorer = make_scorer(median_absolute_error, greater_is_better=False)

# for alg in MLA:

# 	# set name and parameters
# 	MLA_name = alg.__class__.__name__

# 	# Add suffix if name already exists
# 	suffix = 1
# 	original_MLA_name = MLA_name
# 	while MLA_compare['MLA Name'].str.contains(MLA_name).any():
# 		MLA_name = f"{original_MLA_name}_{suffix}"
# 		suffix += 1
		
# 	MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
# 	MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

# 	"""score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate"""

# 	cv_results = cross_validate(alg, train_scaled, train[TARGET], cv=cv_split, scoring=scoring, return_train_score=True)

# 	# Calculate mean time in seconds
# 	mean_fit_time = cv_results['fit_time'].mean()

# 	# Convert mean time to minutes and seconds
# 	minutes = int(mean_fit_time // 60)
# 	seconds = mean_fit_time % 60

# 	# Format the time and assign it
# 	MLA_compare.loc[row_index, 'MLA Time'] = f"{minutes} min {seconds:.2f} sec"
# 	MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean() * -1
# 	MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean() * -1
# 	#if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
# 	MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!

# 	print(f'Done with {MLA_name}')
# 	row_index+=1

# #print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
# MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = True, inplace = True)
# MLA_compare.to_csv(f'{experiment}_results.csv', index=False)
# MLA_compare

In [34]:
# sns.barplot(x='MLA Test Accuracy Mean', y = 'MLA Name', data = MLA_compare, color = 'm')

# #prettify using pyplot: https://matplotlib.org/api/pyplot_api.html
# plt.title('Machine Learning Algorithm Accuracy Score \n')
# plt.xlabel('Accuracy Score (%)')
# plt.ylabel('Algorithm')

# ENSEMBLE CROSS-VALIDATION

In [35]:
# Hill Climbing inspired by code from Kaggle
def hill_climbing(x, y):
    
    # Evaluating oof predictions
    scores = {}
    for col in x.columns:
        scores[col] = median_absolute_error(y, x[col])

    # Sorting the model scores in ascending order
    scores = {k: v for k, v in sorted(scores.items(), key = lambda item: item[1], reverse = False)}

    # Sort oof_df
    x = x[list(scores.keys())]

    # Initialize weights
    weights = {col: 1 if i == 0 else 0 for i, col in enumerate(x.columns)}

    STOP = False
    current_best_ensemble = x.iloc[:,0]
    MODELS = x.iloc[:,1:]
    weight_range = np.arange(-0.5, 0.51, 0.01) 
    history = [median_absolute_error(y, current_best_ensemble)]
    j = 0

    while not STOP:
        j += 1
        potential_new_best_cv_score = median_absolute_error(y, current_best_ensemble)
        k_best, wgt_best = None, None
        for k in MODELS:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * MODELS[k]
                cv_score = median_absolute_error(y, potential_ensemble)
                if cv_score < potential_new_best_cv_score:
                    potential_new_best_cv_score = cv_score
                    k_best, wgt_best = k, wgt

        if k_best is not None:
            # Update weights
            weights = {col: (1 - wgt_best) * weights[col] if col != k_best else wgt_best for col in weights}
            current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * MODELS[k_best]
            MODELS.drop(k_best, axis = 1, inplace = True)
            if MODELS.shape[1] == 0:
                STOP = True
            history.append(potential_new_best_cv_score)
        else:
            STOP = True
        
    hill_ens_pred = current_best_ensemble
    
    return hill_ens_pred, weights

In [36]:
def create_submission_file(ids, predictions, filename):
    submission = pd.DataFrame({'id': ids, 'Hardness': predictions})
    submission.to_csv(filename, index=False)

In [37]:
hill_climb_scores = []
hill_climb_weights = []

for i, (train_index, test_index) in enumerate(cv_split.split(train[num_var], train[TARGET])):
    X_train, X_test = train[num_var].iloc[train_index], train[num_var].iloc[test_index]
    y_train, y_test = train[TARGET].iloc[train_index], train[TARGET].iloc[test_index]

    print(f'Fold {i+1}')
    
    MLA_cv_train_preds = []
    MLA_cv_preds = []
    MLA_cv_preds_dict = {}
    MLA_names = []
    
    suffix = 1
    for alg in MLA:
        MLA_name = alg.__class__.__name__

        # Add suffix if name already exists

        original_MLA_name = MLA_name
        if MLA_name in MLA_names:
        # while MLA_cv_preds.str.contains(MLA_name).any():
            MLA_name = f"{original_MLA_name}_{suffix}"
            suffix += 1
        
        predictor = alg.fit(X_train, y_train)
        pred_train_result = predictor.predict(X_train)
        pred_result = predictor.predict(X_test)
        # pred_result_rounded = np.round(pred_result, 1)

        MLA_cv_train_preds.append(pred_train_result)
        MLA_cv_preds.append(pred_result)
        MLA_cv_preds_dict[MLA_name] = pred_result
        MLA_names.append(MLA_name)

        print(f'Done with {MLA_name}')

    ##################
    ### Hill Climb ###
    ##################
    hill_climb_pred, hill_climb_weight = hill_climbing(pd.DataFrame(MLA_cv_preds_dict), y_test)
    hill_climb_score = median_absolute_error(y_test, hill_climb_pred)
    hill_climb_scores.append(hill_climb_score)
    hill_climb_weights.append(hill_climb_weight)
    print(f'The Fold {i+1} Hill Climb is {hill_climb_score}')
    print(f'The Fold {i+1} weight is {hill_climb_weight}')
    print()

print()
print(f'The Hill Climbing CV score is ==> {np.mean(hill_climb_scores)}')
print(f'The Hill Climbing weights are ==> {hill_climb_weights}')

Fold 1
Done with MLPRegressor
Done with HuberRegressor
Done with RANSACRegressor
Done with Lasso
Done with ElasticNet
Done with Lars
Done with LassoLars
Done with OrthogonalMatchingPursuit
Done with BayesianRidge
Done with ARDRegression
Done with TweedieRegressor
Done with PoissonRegressor
Done with GammaRegressor
Done with LassoLars_1
Done with LinearRegression
Done with PassiveAggressiveRegressor
Done with RidgeCV
Done with DecisionTreeRegressor
Done with ExtraTreeRegressor
Done with XGBRegressor
Done with LGBMRegressor
Done with CatBoostRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor_2
Done with KNeighborsRegressor_3
Done with KNeighborsRegressor_4
Done with KNeighborsRegressor_5
Done with KNeighborsRegressor_6
Done with KNeighborsRegressor_7
Done with KNeighborsRegressor_8
Done with KNeighborsRegressor_9
Done with KNeighborsRegressor_10
Done with KNeighborsRegressor_11
Done with AdaBoostRegressor
Done with BaggingRegressor
Done with ExtraTreesRegressor
Done wi

In [39]:
# Get average weights for the models from all the folds

average_values = {}

for model in hill_climb_weights:
    for key, value in model.items():
        if key in average_values:
            average_values[key] += value
        else:
            average_values[key] = value

num_models = len(hill_climb_weights)
average_values = {k: v / num_models for k, v in average_values.items()}

# Ensure the new weights sum up to 1
sum = 0

for k, v in average_values.items():
    sum += v

average_values, sum, len(average_values)

({'LGBMRegressor': 0.24778727745583562,
  'CatBoostRegressor': 0.08803116146799868,
  'HistGradientBoostingRegressor': 0.3044357054165955,
  'GradientBoostingRegressor': 0.0010092000000002277,
  'ExtraTreesRegressor': 0.2630229363967982,
  'RandomForestRegressor': 1.0000000000121718e-05,
  'XGBRegressor': 0.11519987248697357,
  'BaggingRegressor': 0.0238777966450002,
  'DecisionTreeRegressor': 0.11428145572000022,
  'ExtraTreeRegressor': 0.0030198000000003235,
  'KNeighborsRegressor': 0.018177011635860083,
  'KNeighborsRegressor_3': 0.0019994040000001303,
  'KNeighborsRegressor_4': 0.0009900000000001735,
  'KNeighborsRegressor_5': 0.007603431590800116,
  'LinearRegression': 1.7763568394002464e-16,
  'BayesianRidge': 1.3322676295501814e-16,
  'KNeighborsRegressor_2': 0.008403714150600188,
  'KNeighborsRegressor_6': -0.003008315059999804,
  'OrthogonalMatchingPursuit': -0.037007380079999416,
  'KNeighborsRegressor_7': 0.001019898000000176,
  'ElasticNet': 1.3322676295501826e-16,
  'Lasso

In [40]:
# Create an ordered dictionary based on the order of models in MLA
ordered_average_values = {}
final_mlas = []

for model_name in MLA_names:        
    if model_name in average_values:
        ordered_average_values[model_name] = average_values[model_name]
    else:
        # Handle case where a model might not be in average_values
        ordered_average_values[model_name] = None

# Now ordered_average_values has the averages in the same order as MLA
ordered_average_values

{'MLPRegressor': 2.664535259100363e-16,
 'HuberRegressor': -0.004555642013999905,
 'RANSACRegressor': 2.2204460492503037e-16,
 'Lasso': -0.020814595401599707,
 'ElasticNet': 1.3322676295501826e-16,
 'Lars': 0.0,
 'LassoLars': 1.332267629550176e-16,
 'OrthogonalMatchingPursuit': -0.037007380079999416,
 'BayesianRidge': 1.3322676295501814e-16,
 'ARDRegression': 1.3322676295501767e-16,
 'TweedieRegressor': -0.046828741360897305,
 'PoissonRegressor': -0.005049999999999808,
 'GammaRegressor': 1.3322676295501777e-16,
 'LassoLars_1': 1.3322676295501777e-16,
 'LinearRegression': 1.7763568394002464e-16,
 'PassiveAggressiveRegressor': 1.7763568394002387e-16,
 'RidgeCV': 1.3322676295501816e-16,
 'DecisionTreeRegressor': 0.11428145572000022,
 'ExtraTreeRegressor': 0.0030198000000003235,
 'XGBRegressor': 0.11519987248697357,
 'LGBMRegressor': 0.24778727745583562,
 'CatBoostRegressor': 0.08803116146799868,
 'KNeighborsRegressor': 0.018177011635860083,
 'KNeighborsRegressor_2': 0.008403714150600188,


In [41]:
# Get the ordered weights values as a list to be used for final submission
hill_climb_final_weights = []

for value in ordered_average_values.values():
    hill_climb_final_weights.append(value)

hill_climb_final_weights

[2.664535259100363e-16,
 -0.004555642013999905,
 2.2204460492503037e-16,
 -0.020814595401599707,
 1.3322676295501826e-16,
 0.0,
 1.332267629550176e-16,
 -0.037007380079999416,
 1.3322676295501814e-16,
 1.3322676295501767e-16,
 -0.046828741360897305,
 -0.005049999999999808,
 1.3322676295501777e-16,
 1.3322676295501777e-16,
 1.7763568394002464e-16,
 1.7763568394002387e-16,
 1.3322676295501816e-16,
 0.11428145572000022,
 0.0030198000000003235,
 0.11519987248697357,
 0.24778727745583562,
 0.08803116146799868,
 0.018177011635860083,
 0.008403714150600188,
 0.0019994040000001303,
 0.0009900000000001735,
 0.007603431590800116,
 -0.003008315059999804,
 0.001019898000000176,
 1.3322676295501807e-16,
 1.3322676295501814e-16,
 -0.0019699999999997757,
 -0.03169744165095519,
 -0.047936549399014726,
 0.0238777966450002,
 0.2630229363967982,
 0.0010092000000002277,
 0.3044357054165955,
 1.0000000000121718e-05]

In [42]:
test_predictions = []
# Make predictions on test set
for alg in MLA:
        MLA_name = alg.__class__.__name__
                
        predictor = alg.fit(train[num_var], train[TARGET])
        pred_result = predictor.predict(test[num_var])

        test_predictions.append(pred_result)
        print(f'Done with {MLA_name}')

Done with MLPRegressor
Done with HuberRegressor
Done with RANSACRegressor
Done with Lasso
Done with ElasticNet
Done with Lars
Done with LassoLars
Done with OrthogonalMatchingPursuit
Done with BayesianRidge
Done with ARDRegression
Done with TweedieRegressor
Done with PoissonRegressor
Done with GammaRegressor
Done with LassoLars
Done with LinearRegression
Done with PassiveAggressiveRegressor
Done with RidgeCV
Done with DecisionTreeRegressor
Done with ExtraTreeRegressor
Done with XGBRegressor
Done with LGBMRegressor
Done with CatBoostRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with KNeighborsRegressor
Done with AdaBoostRegressor
Done with BaggingRegressor
Done with ExtraTreesRegressor
Done with GradientBoostingRegressor
Do

In [43]:
# Double check that the weights and predictions are the same length
len(test_predictions), len(hill_climb_final_weights)

(39, 39)

In [44]:
weighted_avg_predictions = np.average(test_predictions, axis=0, weights=hill_climb_final_weights)
weighted_avg_predictions

array([2.87838327, 4.30398754, 5.11344916, ..., 8.33793729, 3.93464921,
       5.86467159])

In [45]:
# # # Round to 1 decimal place
# rounded_predictions = np.round(weighted_avg_predictions, 1)
validation_score = median_absolute_error(test[TARGET], weighted_avg_predictions)
validation_score

0.7051947967319654

In [None]:
# weighted_avg_predictions = np.average(test_predictions, axis=0, weights=hill_climb_final_weights)
# create_submission_file(test['id'], weighted_avg_predictions, f'submission_{experiment}.csv')