In [58]:
# This notebook is the xgboot_predictor.ipynb from
# https://github.com/FergusOBoyle/sustainable-dev-goals-forecasting
# to which I have added:
# 1. Code to write intermediate results out to csv files.
# 2. An algorithm derived from running the csv data through
#    Microsoft's AutoML, which automatically generated a more optimized
#    model.  The code generated by AutoML needed some tweaking to run
#    outside of the Azure cloud, but it did
#    generate a better RMSE score nonetheless.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
# from sklearn.metrics.scorer import make_scorer
import xgboost as xgb 
from matplotlib import pyplot as plt

import sys
sys.path.append('..')
from utils import preprocess, missing, evaluate

### Globals and load data

In [59]:
# Name of the column that the models are trained to predict.
target = 'SI.POV.DDAY'
# Year to forecast.
predict_year=2010
# Percent of input indicators to use (set to 100 for the full set of input features).
percent = 50
if not 0 < percent <= 100:
    # 0 would divide by zero below and values above 100 would yield a zero slice step.
    raise ValueError("percent must be in the range (0, 100], got %r" % (percent,))

# Load the data from disk.
# NOTE(review): the path uses Windows-style separators, so it only resolves on Windows.
input_dir = '.\\..\\data\\'
data_input = "cleaned_data.pkl"
data = pd.read_pickle(input_dir + data_input)

# Optionally keep only a subset of the columns to reduce calculation time.
# For percentages below 100 every `step`-th column is taken so that the subset
# is spread across the whole range of variables.
if percent != 100:
    step = int(100/percent)
    data_new = data.iloc[:,::step].copy()
    # Add the target column if the stride did not happen to include it.
    if target not in data_new.columns:
        data_new[target] = data[target]
    data = data_new

print(data.shape[1], "indicators included")

307 indicators included


In [None]:
# jagNote: write out some intermediate data for inspection

# Save the pickled data back to a csv (for use in Azure).

import pickle as pkl
# import pandas as pd

input_dir2 = '.\\..\\data\\'
data_input2 = "cleaned_data.pkl"

# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with open(input_dir2 + data_input2, "rb") as f2:
    # Deliberately not named `object`: that would shadow the builtin for every later cell.
    cleaned_obj = pkl.load(f2)

df2 = pd.DataFrame(cleaned_obj)
df2.to_csv(input_dir2 + r'cleaned_data2.csv')



### Window data and preprocess

In [60]:
%time data_regressors, data_targets = \
        preprocess.window_data(data, lag=3,num_windows=10, step=1, predict_year=2010, \
                         target=target, impute_type='interpolation')

#Break up into training and testing data.

idx = pd.IndexSlice
data_train_regressors = data_regressors.loc[idx[:,2:10],:]
data_train_targets = data_targets.loc[idx[:,2:10],:]
data_test_regressors = data_regressors.loc[idx[:,1],:]
data_test_targets= data_targets.loc[idx[:,1],:]

#For Training, only consider windows that don't have a missing target as they offer nothing to training
#Therefore, remove those observations from both the training regressors and targets datasets.
data_train_regressors_subset = data_train_regressors[~np.isnan(list(data_train_targets.values.flatten()))]
data_train_targets_subset = data_train_targets[~np.isnan(list(data_train_targets.values.flatten()))]

#For testing, also remove windows with no target variable as it is impossible to measure preformance.
data_test_regressors_subset = data_test_regressors[~np.isnan(list(data_test_targets.values.flatten()))]
data_test_targets_subset = data_test_targets[~np.isnan(list(data_test_targets.values.flatten()))]

Wall time: 32.2 s


In [61]:
# Training inputs as a 2-D array and training targets flattened to 1-D.
X_train, y_train = (
    data_train_regressors_subset.values,
    data_train_targets_subset.values.ravel(),
)
# Test inputs as an array; the test targets are kept as the original pandas object.
X_test, y_test = (
    data_test_regressors_subset.values,
    data_test_targets_subset,
)

In [None]:

# jagNote: write out some intermediate data for inspection...

print(type(data_train_regressors_subset))
for subset in (data_train_regressors_subset, data_train_targets_subset,
               data_test_regressors_subset, data_test_targets_subset):
    print(subset.shape)

# Stack the training rows on top of the test rows.
xxx = pd.concat([data_train_regressors_subset, data_test_regressors_subset])
print(xxx.shape)

yyy = pd.concat([data_train_targets_subset, data_test_targets_subset])
print(yyy.shape)

# Put the targets to the right of the regressors. The targets first take over
# the regressors' index so that the rows are paired up by position.
xxxyyy = pd.concat([xxx, yyy.set_index(xxx.index)], axis=1)

# Write the combined frame out to a csv file.
input_dir5 = '.\\..\\data\\'
xxxyyy.to_csv(input_dir5 + r'combined_data5.csv')

In [None]:

# jagNote: Let's save tabular data that has some preprocessing and perhaps
# feature engineering back to csv files so we can use this data up in the
# Microsoft Azure cloud using AutoML...
# Note: we'll upload a combined training and test csv to Azure and allow
# AutoML to do its own splitting into training and test datasets.

print(type(X_train))
print(type(y_train))
print(type(X_test))
print(type(y_test))

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Combine the train and test arrays (training rows first).
X_combined = np.vstack((X_train, X_test))
print(X_combined.shape)
# Turn the 1-D training targets into a single column so they can be stacked on
# top of the test targets. -1 lets NumPy infer the row count, so the cell keeps
# working when the number of training rows changes.
y_train3 = y_train.reshape((-1, 1))
y_combined = np.vstack((y_train3, y_test))
print(y_combined.shape)

# Append the target as the last column.
xy_combined = np.concatenate((X_combined, y_combined),axis=1)

input_dir3 = '.\\..\\data\\'

# Write the numpy array back to csv.
np.savetxt(input_dir3 + r'combined_data3.csv', xy_combined, delimiter=",")


### XGBoost Model

#### Out-of-the-box using the Scikit-learn interface

In [62]:
# Baseline: XGBoost with everything but the objective, row subsampling and seed left at defaults.
XGB = xgb.XGBRegressor(
    objective='reg:squarederror',
    subsample=0.9,
    random_state=42,
)
XGB.fit(X_train, y_train)

# Score the fitted model on the held-out test windows.
predictions = XGB.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("RMSE of XGBoost out-of-the-box is:", np.sqrt(mse))

RMSE of XGBoost out-of-the-box is: 5.767635272547369


#### Tuning of the Algorithm

In [63]:

from sklearn.metrics import make_scorer
# Number of cross-validation folds used while tuning.
cv_folds = 5

# Scorer built from mean squared error; greater_is_better=False marks it as a
# loss, i.e. lower values are better.
scorer = make_scorer(mean_squared_error ,greater_is_better=False)

Step 1. Tune the number of estimators

In [64]:
# Starting point for tuning; only the number of boosting rounds is optimised in this step.
model = xgb.XGBRegressor(random_state=42,
                         objective='reg:squarederror',
                         max_depth=5,
                         min_child_weight = 1,
                         gamma = 0,
                         subsample=0.9,
                         colsample_bytree = 0.8,
                         scale_pos_weight = 1)

param = model.get_xgb_params()
data_matrix = xgb.DMatrix(X_train, label=y_train)
# Upper bound on boosting rounds for the cross-validation run.
# get_num_boosting_rounds() is used instead of get_params()['n_estimators']
# because the latter can be None when n_estimators was never set explicitly,
# and xgb.cv needs an integer.
max_boost_rounds = model.get_num_boosting_rounds()
# NOTE(review): if the reported optimum equals this cap, early stopping never
# triggered and a larger cap may find a better value.
cvresult = xgb.cv(param, data_matrix, num_boost_round=max_boost_rounds, nfold=cv_folds,
            metrics='rmse', early_stopping_rounds=50)
# Use the number of rounds returned by xgb.cv as the optimised number of estimators.
model.set_params(n_estimators=cvresult.shape[0])
print("Optimal number of estimators:", cvresult.shape[0])

Optimal number of estimators: 100


In [65]:
# Fit the model with the tuned number of estimators on the full training set ...
model.fit(X_train, y_train)

# ... and measure its error on the held-out test windows.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("RMSE of xgboost after tuning (step 1) is:", np.sqrt(mse))

RMSE of xgboost after tuning (step 1) is: 5.513444558601978


Step 2. Tune max_depth and min_child_weight

In [66]:
# Search grid: tree depth 3, 5, 7, 9 and minimum child weight 1, 3, 5.
params = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

# Cross-validated grid search; cv_folds keeps the fold count identical to the
# one used for the xgb.cv run.
grid_model = GridSearchCV(model, param_grid = params, scoring=scorer,
                        n_jobs=4, cv=cv_folds)
grid_model.fit(X_train,y_train)

# Score the best model using the test data.
model = grid_model.best_estimator_
model.fit( X_train,y_train)
predictions = model.predict(X_test)

mse= mean_squared_error(y_test, predictions)
print("RMSE of xgboost after tuning (step 2) is:", np.sqrt(mse))

RMSE of xgboost after tuning (step 2) is: 5.39043909814746


The result of our test shows that the performance of the model after tuning was actually worse than before. The model is generalising very poorly. This may reflect some kind of bias introduced when creating our training and test subsets. It would be worth looking at how I decided to discard, early on, any countries that did not have target values for the target year, 2010. It may have made more sense to window the data, split it into training and test subsets, and only then discard any observations that did not have a target value.

In [67]:
# Display the parameter combination selected by the most recent grid search.
grid_model.best_params_

{'max_depth': 5, 'min_child_weight': 5}

Step 3. Tune Gamma

In [68]:
# Search grid: gamma in 0.0, 0.1, 0.2, 0.3, 0.4.
params = {
 'gamma':[i/10.0 for i in range(0,5)]
}

# Cross-validated grid search; cv_folds keeps the fold count identical to the
# one used for the xgb.cv run.
grid_model = GridSearchCV(model, param_grid = params, scoring=scorer,
                        n_jobs=4, cv=cv_folds)
grid_model.fit(X_train,y_train)

# Score the best model using the test data.
model = grid_model.best_estimator_
model.fit( X_train,y_train)
predictions = model.predict(X_test)

mse= mean_squared_error(y_test, predictions)
print("RMSE of xgboost after tuning (step 3) is:", np.sqrt(mse))

RMSE of xgboost after tuning (step 3) is: 5.394199542667251


In [18]:
# Display the parameter combination selected by the most recent grid search.
grid_model.best_params_

{'gamma': 0.1}

Step 4. Tune Regularization

In [69]:
# Search grid for the reg_alpha regularisation term.
params = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1]
}

# Cross-validated grid search; cv_folds keeps the fold count identical to the
# one used for the xgb.cv run. Training scores are recorded as well.
grid_model = GridSearchCV(model, param_grid = params, scoring=scorer,
                        n_jobs=4, cv=cv_folds, return_train_score=True)
grid_model.fit(X_train,y_train)

# Score the best model using the test data.
model = grid_model.best_estimator_
model.fit( X_train,y_train)
predictions = model.predict(X_test)

mse= mean_squared_error(y_test, predictions)
print("RMSE of xgboost after tuning (step 4) is:", np.sqrt(mse))

RMSE of xgboost after tuning (step 4) is: 5.394199570122429


In [70]:
# Display the parameter combination selected by the most recent grid search.
grid_model.best_params_

{'reg_alpha': 1e-05}

In [71]:
# Search grid for the reg_lambda regularisation term.
params = {
 'reg_lambda':[1e-5, 1e-2, 0.1, 1]
}

# Cross-validated grid search; cv_folds keeps the fold count identical to the
# one used for the xgb.cv run. Training scores are recorded as well.
grid_model = GridSearchCV(model, param_grid = params, scoring=scorer,
                        n_jobs=4, cv=cv_folds, return_train_score=True)
grid_model.fit(X_train,y_train)

# Score the best model using the test data.
model = grid_model.best_estimator_
model.fit( X_train,y_train)
predictions = model.predict(X_test)

mse= mean_squared_error(y_test, predictions)
print("RMSE of xgboost after tuning (step 5) is:", np.sqrt(mse))

RMSE of xgboost after tuning (step 5) is: 5.513699313865063


### XGBoost Model with no Imputation

In [72]:
%time data_regressors, data_targets = \
        preprocess.window_data(data, lag=3,num_windows=10, step=1, predict_year=2010, \
                         target=target)

#Break up into training and testing data.

idx = pd.IndexSlice
data_train_regressors = data_regressors.loc[idx[:,2:10],:]
data_train_targets = data_targets.loc[idx[:,2:10],:]
data_test_regressors = data_regressors.loc[idx[:,1],:]
data_test_targets= data_targets.loc[idx[:,1],:]

#For Training, only consider windows that don't have a missing target as they offer nothing to training
#Therefore, remove those observations from both the training regressors and targets datasets.
data_train_regressors_subset = data_train_regressors[~np.isnan(list(data_train_targets.values.flatten()))]
data_train_targets_subset = data_train_targets[~np.isnan(list(data_train_targets.values.flatten()))]

#For testing, also remove windows with no target variable as it is impossible to measure preformance.
data_test_regressors_subset = data_test_regressors[~np.isnan(list(data_test_targets.values.flatten()))]
data_test_targets_subset = data_test_targets[~np.isnan(list(data_test_targets.values.flatten()))]

X_train_miss = data_train_regressors_subset.values
y_train_miss = data_train_targets_subset.values.ravel()
X_test_miss = data_test_regressors_subset.values
y_test_miss = data_test_targets_subset


Wall time: 605 ms


#### Out-of-the-box xgboost on data without imputation

Hand-tuning of the XGBoost model. I pick some important parameters and play around until I get a good result. I'm sure more accuracy could be obtained from this model by grid searching, but I think this is enough to illustrate that using an XGBoost model (or perhaps any tree-based predictive algorithm) without any imputation of the input data gives by far the best results of any of the models tried.

In [73]:
# Hand-picked hyperparameters, trained on the data that was windowed without imputation.
XGB = xgb.XGBRegressor(
    n_estimators=200,
    objective='reg:squarederror',
    max_depth=7,
    subsample=0.87,
    reg_lambda=0.2,
)
XGB.fit(X_train_miss, y_train_miss)

# Score on the matching (non-imputed) test windows.
predictions = XGB.predict(X_test_miss)
mse = mean_squared_error(y_test_miss, predictions)
print("RMSE of XGBoost out-of-the-box is:", np.sqrt(mse))

RMSE of XGBoost out-of-the-box is: 5.231545740351754


In [None]:
# jagNote: The code below was not in xgboot_predictor.ipynb.
# Instead, it was generated automatically by AutoML and then
# modified to run on my local PC in my Anaconda environment.


In [74]:
# from Microsoft ModelBuilder script.py
# Module-level logger used by the AutoML-generated helpers (build_model_pipeline logs through it).
import logging
logger = logging.getLogger("azureml.training.tabular")
# Let INFO-level messages through this logger.
logger.setLevel(logging.INFO)

In [75]:
# from Microsoft ModelBuilder script.py
def generate_preprocessor_config_0():
    """Create the preprocessing step paired with ensemble model 0.

    Returns:
        An unfitted sklearn.preprocessing.MaxAbsScaler constructed with copy=True.
    """
    from sklearn.preprocessing import MaxAbsScaler

    return MaxAbsScaler(copy=True)

In [76]:
# from Microsoft ModelBuilder script.py
def generate_algorithm_config_0():
    """Create the LightGBM regressor used as ensemble model 0.

    All hyperparameters are passed explicitly, as in the AutoML-generated script.

    Returns:
        An unfitted lightgbm.LGBMRegressor.
    """
    import lightgbm as lgb

    hyperparameters = dict(
        boosting_type='gbdt',
        class_weight=None,
        colsample_bytree=1.0,
        importance_type='split',
        learning_rate=0.1,
        max_depth=-1,
        min_child_samples=20,
        min_child_weight=0.001,
        min_split_gain=0.0,
        n_estimators=100,
        n_jobs=-1,
        num_leaves=31,
        objective=None,
        random_state=None,
        reg_alpha=0.0,
        reg_lambda=0.0,
        # NOTE(review): `silent` looks deprecated in recent LightGBM releases in
        # favour of `verbose` -- confirm against the installed version.
        silent=True,
        subsample=1.0,
        subsample_for_bin=200000,
        subsample_freq=0,
        verbose=-1,
    )
    return lgb.LGBMRegressor(**hyperparameters)

In [77]:
def generate_algorithm_config_1():
    """Create the extra-trees regressor used as ensemble model 1.

    This function used to be defined twice in the same cell; the second
    definition silently replaced the first. The two copies differed only in
    that the first one passed ``min_impurity_split=None``. Only the effective
    (second) variant is kept.

    Returns:
        An unfitted sklearn.ensemble.ExtraTreesRegressor.
    """
    from sklearn.ensemble import ExtraTreesRegressor

    # `min_impurity_split` is intentionally not passed: it was removed from
    # scikit-learn, and passing it raises a TypeError on current releases.
    # NOTE(review): criterion='mse' was renamed to 'squared_error' in
    # scikit-learn 1.0 and the old spelling was later removed -- switch the
    # value when upgrading scikit-learn.
    algorithm = ExtraTreesRegressor(
        bootstrap=False,
        ccp_alpha=0.0,
        criterion='mse',
        max_depth=None,
        max_features=0.5,
        max_leaf_nodes=None,
        max_samples=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=0.005080937188890647,
        min_samples_split=0.0012814223889440828,
        min_weight_fraction_leaf=0.0,
        n_estimators=50,
        n_jobs=-1,
        oob_score=False,
        random_state=None,
        verbose=0,
        warm_start=False
    )

    return algorithm

In [78]:
# from Microsoft ModelBuilder script.py
def generate_preprocessor_config_1():
    """Create the preprocessing step paired with ensemble model 1.

    Returns:
        An unfitted sklearn.preprocessing.MinMaxScaler with feature_range=(0, 1) and copy=True.
    """
    from sklearn.preprocessing import MinMaxScaler

    return MinMaxScaler(feature_range=(0, 1), copy=True)

In [79]:
def generate_preprocessor_config_2():
    """Create the preprocessing step paired with ensemble model 2.

    Returns:
        An unfitted sklearn.preprocessing.MinMaxScaler with feature_range=(0, 1) and copy=True.
    """
    from sklearn.preprocessing import MinMaxScaler

    return MinMaxScaler(feature_range=(0, 1), copy=True)

In [80]:
# from Microsoft ModelBuilder script.py
def generate_algorithm_config_2():
    """Create the elastic-net linear model used as ensemble model 2.

    Returns:
        An unfitted sklearn.linear_model.ElasticNet.
    """
    from sklearn.linear_model import ElasticNet

    hyperparameters = dict(
        alpha=0.001,
        copy_X=True,
        fit_intercept=True,
        l1_ratio=0.8436842105263158,
        max_iter=1000,
        # NOTE(review): `normalize` looks deprecated in recent scikit-learn
        # releases -- confirm against the installed version.
        normalize=False,
        positive=False,
        precompute=False,
        random_state=None,
        selection='cyclic',
        tol=0.0001,
        warm_start=False,
    )
    return ElasticNet(**hyperparameters)


# Module-level copy of the ElasticNet built inside generate_algorithm_config_2().
# It binds the global name `algorithm`.
# NOTE(review): no other cell visible in this notebook reads `algorithm`; this
# looks like a leftover experiment -- confirm before removing.
from sklearn.linear_model import ElasticNet    
algorithm = ElasticNet(
    alpha=0.001,
    copy_X=True,
    fit_intercept=True,
    l1_ratio=0.8436842105263158,
    max_iter=1000,
    normalize=False,
    positive=False,
    precompute=False,
    random_state=None,
    selection='cyclic',
    tol=0.0001,
    warm_start=False
)

In [81]:
# from Microsoft ModelBuilder script.py
def generate_algorithm_config():
    """Assemble the weighted voting ensemble of the three AutoML models.

    Each member is a two-step Pipeline: its preprocessor ('preproc') followed
    by its regressor ('model'). The AutoML-generated script combined the
    members with azureml's PreFittedSoftVotingRegressor; scikit-learn's
    VotingRegressor is used here instead so that the code runs without the
    Azure runtime.

    Returns:
        An unfitted sklearn.ensemble.VotingRegressor over 'model_0', 'model_1'
        and 'model_2' with the weights listed below.
    """
    from sklearn.ensemble import VotingRegressor
    from sklearn.pipeline import Pipeline

    pipeline_0 = Pipeline(steps=[('preproc', generate_preprocessor_config_0()), ('model', generate_algorithm_config_0())])
    pipeline_1 = Pipeline(steps=[('preproc', generate_preprocessor_config_1()), ('model', generate_algorithm_config_1())])
    pipeline_2 = Pipeline(steps=[('preproc', generate_preprocessor_config_2()), ('model', generate_algorithm_config_2())])
    algorithm = VotingRegressor(
        estimators=[
            ('model_0', pipeline_0),
            ('model_1', pipeline_1),
            ('model_2', pipeline_2),
        ],
        # One weight per estimator, in the same order as `estimators`.
        weights=[0.5, 0.42857142857142855, 0.07142857142857142]
    )

    return algorithm

In [82]:
# from Microsoft ModelBuilder script.py
def build_model_pipeline():
    """Chain the featurization step and the voting ensemble into one Pipeline.

    Returns:
        An unfitted sklearn.pipeline.Pipeline with the steps 'featurization'
        and 'ensemble'.
    """
    from sklearn.pipeline import Pipeline

    logger.info("Running build_model_pipeline")
    featurization_step = ('featurization', generate_data_transformation_config())
    ensemble_step = ('ensemble', generate_algorithm_config())
    return Pipeline(steps=[featurization_step, ensemble_step])

In [83]:
# from Microsoft ModelBuilder script.py
def train_model(X, y, sample_weights):
    """Build the full model pipeline and fit it.

    Args:
        X: training features, passed to Pipeline.fit.
        y: training targets, passed to Pipeline.fit.
        sample_weights: accepted for signature compatibility; not used by the fit.

    Returns:
        The value returned by Pipeline.fit(X, y).
    """
    return build_model_pipeline().fit(X, y)

In [84]:
# from Microsoft ModelBuilder script.py
def calculate_metrics(model, X, y, sample_weights, X_test, y_test, cv_splits=None):
    """Score a fitted model on the test data with azureml's regression scorer.

    Args:
        model: fitted estimator exposing predict().
        X: not used by this function.
        y: targets whose min, max and standard deviation are handed to the scorer.
        sample_weights: forwarded to score_regression.
        X_test: test features to predict on.
        y_test: true test targets.
        cv_splits: not used by this function.

    Returns:
        The result of score_regression for the single requested metric,
        'normalized_root_mean_squared_error'.
    """
    from azureml.training.tabular.preprocessing.binning import make_dataset_bins
    from azureml.training.tabular.score.scoring import score_regression

    y_pred = model.predict(X_test)
    # Summary statistics of y passed to the scorer.
    y_min, y_max, y_std = np.min(y), np.max(y), np.std(y)

    bin_info = make_dataset_bins(X_test.shape[0], y_test)
    requested_metrics = ['normalized_root_mean_squared_error']
    return score_regression(
        y_test, y_pred, requested_metrics, y_max, y_min, y_std, sample_weights, bin_info)

In [85]:
def generate_data_transformation_config():
    """Build the featurization mapper over the columns 'Column1' .. 'Column921'.

    The AutoML-generated script spelled out all 921 column names by hand, each
    wrapped in its own single-element list. The same structure is generated
    here instead, which keeps the cell readable.

    Returns:
        The mapper returned by get_mapper_ab1045 for those column groups.
    """
    column_count = 921
    # One single-element list per column: [['Column1'], ['Column2'], ...].
    column_group_1 = [['Column%d' % number] for number in range(1, column_count + 1)]

    mapper = get_mapper_ab1045(column_group_1)
    return mapper

In [86]:
def get_mapper_ab1045(column_names):
    """Build a DataFrameMapper that applies a mean SimpleImputer to each column group.

    Args:
        column_names: the column selectors handed to gen_features as `columns`.

    Returns:
        A sklearn_pandas DataFrameMapper created with input_df=True and sparse=True.
    """
    from sklearn.impute import SimpleImputer
    from sklearn_pandas.dataframe_mapper import DataFrameMapper
    from sklearn_pandas.features_generator import gen_features

    # Transformer specification: the class plus the keyword arguments to build it with.
    mean_imputer_spec = {
        'class': SimpleImputer,
        'add_indicator': False,
        'copy': True,
        'fill_value': None,
        'missing_values': np.nan,
        'strategy': 'mean',
        'verbose': 0,
    }
    feature_definitions = gen_features(columns=column_names, classes=[mean_imputer_spec])
    return DataFrameMapper(features=feature_definitions, input_df=True, sparse=True)

In [89]:

# Use the algorithm generated by Microsoft Azure AutoML in place of the
# hand-tuned XGBoost model: fit the voting ensemble on the training arrays
# and score it on the held-out test windows.
myalgol = generate_algorithm_config()
myalgol.fit(X_train, y_train)

predictions = myalgol.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("RMSE is:", np.sqrt(mse))

# jagNote: Some warnings are generated below but the code runs ok for now...

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


RMSE is: 4.761559497612451
