# Machine Learning

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from google.colab import files
import numpy as np
import plotly.express as px
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

Mounted at /content/drive


## No Energy Dataframe Production

In [2]:
df_emissions = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DataSci \
112/Final Project/Processed Data/All Ems.csv", index_col=0)
df_urban_pop = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DataSci \
112/Final Project/Processed Data/All Urban Pop.csv", index_col=0)
df_pop = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final \
Project/Processed Data/All Pop.csv", index_col=0)
df_gdp = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final \
Project/Processed Data/All GDP.csv", index_col=0)
df_el = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final \
Project/Processed Data/All El.csv", index_col=0)
df_fdi = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final \
Project/Processed Data/All FDI.csv", index_col=0)
df_fort = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final \
Project/Processed Data/All Fortune 500.csv", index_col=0)

In [3]:
s1 = df_emissions.merge(df_urban_pop, on=("name", "year"), how="inner")
s2 = s1.merge(df_pop, on=("name", "year"), how="inner")
s3 = s2.merge(df_gdp.drop("name", axis=1), on=("code", "year"), how="inner")
s4 = s3.merge(df_el[~df_el["electrification"].isna()].drop("name", axis=1), 
              on=("code", "year"), how="inner")
s5 = s4.merge(df_fdi.drop("name", axis=1), on=("code", "year"), how="inner")
s6 = s5.merge(df_fort.rename(columns={"Company":"Number of Fortune 500"}),
              left_on=("name", "year"), right_on=("Country", "Year"),
              how="left")#.drop(["Country", "Year"], axis=1)
df_ml = s6[["name", "code", "year", "population", "urban population percentage",
         "gdp", "electrification", "fdi", "Number of Fortune 500", 
         "emissions"]].rename(columns={
             "name":"Country",
             "code":"Code",
             "year":"Year",
             "emissions":"Emissions (Tons CO2)",
             "population":"Population",
             "urban population percentage":"Urban Percentage",
             "gdp":"GDP Per Capita (USD)",
             "electrification":"Electrification Percentage",
             "fdi":"FDI (USD)"
         })
df_ml["Population"] *= 1000
df_ml["FDI (USD)"] = df_ml["FDI (USD)"].astype(int)
df_ml["GDP Per Capita (USD)"] = df_ml["GDP Per Capita (USD)"].astype(int)
df_ml["Electrification Percentage"] = df_ml["Electrification Percentage"].round(1)
df_ml["Emissions (Tons CO2)"] = df_ml["Emissions (Tons CO2)"].astype(int)
df_ml["Number of Fortune 500"] = df_ml["Number of Fortune 500"]\
  .fillna(0).astype(int)
df_ml

Unnamed: 0,Country,Code,Year,Population,Urban Percentage,GDP Per Capita (USD),Electrification Percentage,FDI (USD),Number of Fortune 500,Emissions (Tons CO2)
0,Australia,AUS,1990,17041000,85.4,18249,100.0,8457776859,0,278154140
1,Australia,AUS,1991,17272000,85.4,18860,100.0,2612066526,0,279528500
2,Australia,AUS,1992,17486000,85.3,18624,100.0,4941906671,0,284525380
3,Australia,AUS,1993,17687000,85.2,17700,100.0,5312435141,0,288870560
4,Australia,AUS,1994,17883000,85.0,18129,100.0,4458484243,0,293696580
...,...,...,...,...,...,...,...,...,...,...
3006,Marshall Islands,MHL,2012,53000,74.5,3445,90.6,21373811,0,135568
3007,Marshall Islands,MHL,2013,53000,74.9,3611,91.2,32548149,0,139232
3008,Marshall Islands,MHL,2014,53000,75.4,3672,91.8,9047685,0,142896
3009,Marshall Islands,MHL,2017,53000,76.6,4507,94.3,5835497,0,146560


In [4]:
df_ml_processed = df_ml.drop(["Country", "Code", "Year"], axis=1)
df_ml_processed["Years Ago"] = df_ml["Year"] - df_ml["Year"].min()
df_ml_processed = df_ml_processed[["Years Ago", "Population", "Urban Percentage",
                                   "GDP Per Capita (USD)",
                                   "Electrification Percentage", "FDI (USD)",
                                   "Number of Fortune 500", 
                                   "Emissions (Tons CO2)"]]
df_ml_processed

Unnamed: 0,Years Ago,Population,Urban Percentage,GDP Per Capita (USD),Electrification Percentage,FDI (USD),Number of Fortune 500,Emissions (Tons CO2)
0,0,17041000,85.4,18249,100.0,8457776859,0,278154140
1,1,17272000,85.4,18860,100.0,2612066526,0,279528500
2,2,17486000,85.3,18624,100.0,4941906671,0,284525380
3,3,17687000,85.2,17700,100.0,5312435141,0,288870560
4,4,17883000,85.0,18129,100.0,4458484243,0,293696580
...,...,...,...,...,...,...,...,...
3006,22,53000,74.5,3445,90.6,21373811,0,135568
3007,23,53000,74.9,3611,91.2,32548149,0,139232
3008,24,53000,75.4,3672,91.8,9047685,0,142896
3009,27,53000,76.6,4507,94.3,5835497,0,146560


In [5]:
df_ml_processed["Emissions (Tons CO2)"].describe()

count    3.011000e+03
mean     1.583315e+08
std      7.195824e+08
min      7.734000e+03
25%      1.961334e+06
50%      1.094802e+07
75%      7.488019e+07
max      1.095621e+10
Name: Emissions (Tons CO2), dtype: float64

## Energy Dataframe Production

In [6]:
paths = [
"/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final Project/Processed \
Data/Country biofuel_share_energy by Year", 
"/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final Project/Processed \
Data/Country coal_share_energy by Year", 
"/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final Project/Processed \
Data/Country fossil_share_energy by Year", 
"/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final Project/Processed \
Data/Country gas_share_energy by Year",
"/content/drive/MyDrive/Colab Notebooks/DataSci 112/Final Project/Processed \
Data/Country oil_share_energy by Year"
]
metrics = ["Biofuel Share (%)", "Coal Share (%)", "Fossil Fuel Share (%)", 
           "Gas Share (%)", "Oil Share (%)"]

In [7]:
df_ml_pre = df_ml
for p in range(5):
  df = pd.read_csv(paths[p]).drop("Unnamed: 0", axis=1)
  data = []
  for i in range(242):
    for j in range(2, 123):
      if df.iloc[i, j] > 0:
        data.append((df.iloc[i, 0], df.iloc[i, 1], df.columns[j], 
                    df.iloc[i, j]))
  df_full = pd.DataFrame(data, columns=["code", "name", "year", metrics[p]])
  df_full["year"] = df_full["year"].str[-4:].astype(int)
  df_full["name"] = df_full["name"].str.strip()
  df_full = df_full.rename(columns={"code":"Code", "year":"Year"})
  df_ml_pre = df_ml_pre.merge(df_full.drop("name", axis=1),
                                on=("Code", "Year"), how="left")
df_ml_pre

Unnamed: 0,Country,Code,Year,Population,Urban Percentage,GDP Per Capita (USD),Electrification Percentage,FDI (USD),Number of Fortune 500,Emissions (Tons CO2),Biofuel Share (%),Coal Share (%),Fossil Fuel Share (%),Gas Share (%),Oil Share (%)
0,Australia,AUS,1990,17041000,85.4,18249,100.0,8457776859,0,278154140,,42.032,95.922,15.869,38.021
1,Australia,AUS,1991,17272000,85.4,18860,100.0,2612066526,0,279528500,,43.151,95.669,15.310,37.207
2,Australia,AUS,1992,17486000,85.3,18624,100.0,4941906671,0,284525380,,42.972,95.648,15.680,36.996
3,Australia,AUS,1993,17687000,85.2,17700,100.0,5312435141,0,288870560,,42.050,95.654,15.806,37.798
4,Australia,AUS,1994,17883000,85.0,18129,100.0,4458484243,0,293696580,,41.263,95.871,16.881,37.727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3006,Marshall Islands,MHL,2012,53000,74.5,3445,90.6,21373811,0,135568,,,,,
3007,Marshall Islands,MHL,2013,53000,74.9,3611,91.2,32548149,0,139232,,,,,
3008,Marshall Islands,MHL,2014,53000,75.4,3672,91.8,9047685,0,142896,,,,,
3009,Marshall Islands,MHL,2017,53000,76.6,4507,94.3,5835497,0,146560,,,,,


In [8]:
df_ml_full = df_ml_pre[df_ml_pre.iloc[:, -5:].sum(axis=1) != 0]
df_ml_full.iloc[:, -5:] = df_ml_full.iloc[:, -5:].fillna(0).round(1)
df_ml_full

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ml_full.iloc[:, -5:] = df_ml_full.iloc[:, -5:].fillna(0).round(1)


Unnamed: 0,Country,Code,Year,Population,Urban Percentage,GDP Per Capita (USD),Electrification Percentage,FDI (USD),Number of Fortune 500,Emissions (Tons CO2),Biofuel Share (%),Coal Share (%),Fossil Fuel Share (%),Gas Share (%),Oil Share (%)
0,Australia,AUS,1990,17041000,85.4,18249,100.0,8457776859,0,278154140,0.0,42.0,95.9,15.9,38.0
1,Australia,AUS,1991,17272000,85.4,18860,100.0,2612066526,0,279528500,0.0,43.2,95.7,15.3,37.2
2,Australia,AUS,1992,17486000,85.3,18624,100.0,4941906671,0,284525380,0.0,43.0,95.6,15.7,37.0
3,Australia,AUS,1993,17687000,85.2,17700,100.0,5312435141,0,288870560,0.0,42.0,95.7,15.8,37.8
4,Australia,AUS,1994,17883000,85.0,18129,100.0,4458484243,0,293696580,0.0,41.3,95.9,16.9,37.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2876,Oman,OMN,2014,3961000,80.1,23121,100.0,1286345903,0,65896620,0.0,0.1,100.0,67.1,32.8
2877,Oman,OMN,2016,4425000,82.5,17082,100.0,2265279583,0,65669388,0.0,0.2,100.0,68.0,31.8
2878,Oman,OMN,2017,4636000,83.6,17802,100.0,2988036410,0,68619700,0.0,0.2,100.0,62.6,37.2
2879,Oman,OMN,2018,4830000,84.5,19887,100.0,6455136540,0,72555760,0.0,0.2,100.0,60.5,39.3


In [9]:
df_ml_full_processed = df_ml_full.drop(["Country", "Code", "Year"], axis=1)
df_ml_full_processed["Years Ago"] = df_ml["Year"] - df_ml["Year"].min()
df_ml_full_processed = df_ml_full_processed[["Years Ago", "Population", 
                                   "Urban Percentage", "GDP Per Capita (USD)",
                                   "Electrification Percentage", "FDI (USD)",
                                   "Number of Fortune 500", "Biofuel Share (%)",
                                   "Coal Share (%)", "Oil Share (%)", 
                                   "Gas Share (%)", "Fossil Fuel Share (%)",
                                   "Emissions (Tons CO2)"]]
df_ml_full_processed

Unnamed: 0,Years Ago,Population,Urban Percentage,GDP Per Capita (USD),Electrification Percentage,FDI (USD),Number of Fortune 500,Biofuel Share (%),Coal Share (%),Oil Share (%),Gas Share (%),Fossil Fuel Share (%),Emissions (Tons CO2)
0,0,17041000,85.4,18249,100.0,8457776859,0,0.0,42.0,38.0,15.9,95.9,278154140
1,1,17272000,85.4,18860,100.0,2612066526,0,0.0,43.2,37.2,15.3,95.7,279528500
2,2,17486000,85.3,18624,100.0,4941906671,0,0.0,43.0,37.0,15.7,95.6,284525380
3,3,17687000,85.2,17700,100.0,5312435141,0,0.0,42.0,37.8,15.8,95.7,288870560
4,4,17883000,85.0,18129,100.0,4458484243,0,0.0,41.3,37.7,16.9,95.9,293696580
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2876,24,3961000,80.1,23121,100.0,1286345903,0,0.0,0.1,32.8,67.1,100.0,65896620
2877,26,4425000,82.5,17082,100.0,2265279583,0,0.0,0.2,31.8,68.0,100.0,65669388
2878,27,4636000,83.6,17802,100.0,2988036410,0,0.0,0.2,37.2,62.6,100.0,68619700
2879,28,4830000,84.5,19887,100.0,6455136540,0,0.0,0.2,39.3,60.5,100.0,72555760


In [10]:
df_ml_full_processed["Emissions (Tons CO2)"].describe()

count    1.266000e+03
mean     3.495792e+08
std      1.032548e+09
min      2.242604e+06
25%      4.737548e+07
50%      9.704744e+07
75%      3.567606e+08
max      1.074100e+10
Name: Emissions (Tons CO2), dtype: float64

## No Energy ML

In [11]:
no_energy_X = df_ml_processed.iloc[:, :-1]
no_energy_y = df_ml_processed.iloc[:, -1]
no_energy_y_log = np.log10(no_energy_y)

### K Neighbors

In [12]:
scores = []
for i in range(1, 201):
  pipeline = make_pipeline(
      StandardScaler(),
      KNeighborsRegressor(n_neighbors=i)
  )
  scores.append(-cross_val_score(
      pipeline,
      X=no_energy_X,
      y=no_energy_y,
      scoring="neg_root_mean_squared_error",
      cv=10
  ).mean())
px.scatter(x=[i for i in range(1, 201)], y=scores, 
           title="Test Error for K-Nearest Neighbors Regressor without Energy",
           labels={"x":"Neighbors", "y":"Test Error"})

Test Error is lowest for 37 Neighbors

In [13]:
scores = []
for i in range(1, 201):
  pipeline = make_pipeline(
      StandardScaler(),
      KNeighborsRegressor(n_neighbors=i)
  )
  scores.append(-cross_val_score(
      pipeline,
      X=no_energy_X,
      y=no_energy_y_log,
      scoring="neg_root_mean_squared_error",
      cv=10
  ).mean())
px.scatter(x=[i for i in range(1, 201)], y=scores, 
           title="Test Error for K-Nearest Neighbors Regressor without Energy",
           labels={"x":"Neighbors", "y":"Log Test Error"})

Test error is lowest for 21 neighbors

### Linear Regression

In [14]:
pipeline = make_pipeline(
      StandardScaler(),
      LinearRegression()
  )
-cross_val_score(
    pipeline,
    X=no_energy_X,
    y=no_energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

440013110.7558632

In [15]:
no_energy_y_log = np.log10(no_energy_y)
pipeline = make_pipeline(
      StandardScaler(),
      LinearRegression()
  )
-cross_val_score(
    pipeline,
    X=no_energy_X,
    y=no_energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.8464584499587218

### Simple Ensemble

In [16]:
KNeighbors = make_pipeline(
      StandardScaler(),
      KNeighborsRegressor(n_neighbors=37)
  )
Linear = make_pipeline(
      StandardScaler(),
      LinearRegression()
  )
Voter = VotingRegressor([("Neighbors", KNeighbors), ("Linear", Linear)])
-cross_val_score(
    Voter,
    X=no_energy_X,
    y=no_energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

325300436.47102344

In [17]:
-cross_val_score(
    Voter,
    X=no_energy_X,
    y=no_energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.7392679934843263

In [18]:
Stack = StackingRegressor([("Neighbors", KNeighbors), ("Linear", Linear)])
-cross_val_score(
    Stack,
    X=no_energy_X,
    y=no_energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

2068377339090239.5

In [19]:
Stack = StackingRegressor([("Neighbors", KNeighbors), ("Linear", Linear)])
-cross_val_score(
    Stack,
    X=no_energy_X,
    y=no_energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.7066144073813833

### XGBoost

In [20]:
pipeline = make_pipeline(
      StandardScaler(),
      XGBRegressor()
  )
-cross_val_score(
    pipeline,
    X=no_energy_X,
    y=no_energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

321789550.34002036

In [21]:
params = {"xgbregressor__eta":uniform(0.01, 0.2),
          "xgbregressor__n_estimators":randint(100, 500),
          "xgbregressor__max_depth":randint(3, 10),
          "xgbregressor__min_child_weight":uniform(3, 20),
          "xgbregressor__reg_alpha":uniform(0.01, 0.2),
          "xgbregressor__reg_lambda":uniform(0.01, 0.2)}

rscv = RandomizedSearchCV(
    pipeline,
    params,
    n_iter=100,
    scoring="neg_root_mean_squared_error",
    n_jobs=2,
    cv=10
)

rscv.fit(no_energy_X, no_energy_y)

-rscv.best_score_, rscv.best_params_

(281134739.71920955,
 {'xgbregressor__eta': 0.11563022128002773,
  'xgbregressor__max_depth': 6,
  'xgbregressor__min_child_weight': 22.73762979511926,
  'xgbregressor__n_estimators': 464,
  'xgbregressor__reg_alpha': 0.030937244429103943,
  'xgbregressor__reg_lambda': 0.04959458876316877})

In [22]:
pipeline = make_pipeline(
      StandardScaler(),
      XGBRegressor()
  )
-cross_val_score(
    pipeline,
    X=no_energy_X,
    y=no_energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.3252436278321002

In [23]:
params = {"xgbregressor__eta":uniform(0.01, 0.2),
          "xgbregressor__n_estimators":randint(100, 500),
          "xgbregressor__max_depth":randint(3, 10),
          "xgbregressor__min_child_weight":uniform(3, 20),
          "xgbregressor__reg_alpha":uniform(0.01, 0.2),
          "xgbregressor__reg_lambda":uniform(0.01, 0.2)}

rscv = RandomizedSearchCV(
    pipeline,
    params,
    n_iter=100,
    scoring="neg_root_mean_squared_error",
    n_jobs=2,
    cv=10
)

rscv.fit(no_energy_X, no_energy_y_log)

-rscv.best_score_, rscv.best_params_

(0.29811815990830337,
 {'xgbregressor__eta': 0.11747009122897904,
  'xgbregressor__max_depth': 3,
  'xgbregressor__min_child_weight': 20.698081873344005,
  'xgbregressor__n_estimators': 363,
  'xgbregressor__reg_alpha': 0.11830779609473506,
  'xgbregressor__reg_lambda': 0.17894506209081362})

## Energy ML

In [24]:
energy_X = df_ml_full_processed.iloc[:, :-1]
energy_y = df_ml_full_processed.iloc[:, -1]
energy_y_log = np.log10(energy_y)

### K Neighbors

In [25]:
scores = []
for i in range(1, 201):
  pipeline = make_pipeline(
      StandardScaler(),
      KNeighborsRegressor(n_neighbors=i)
  )
  scores.append(-cross_val_score(
      pipeline,
      X=energy_X,
      y=energy_y,
      scoring="neg_root_mean_squared_error",
      cv=10
  ).mean())
px.scatter(x=[i for i in range(1, 201)], y=scores, 
           title="Test Error for K-Nearest Neighbors Regressor with Energy",
           labels={"x":"Neighbors", "y":"Test Error"})

Test error is lowest for 53 neighbors

In [26]:
scores = []
for i in range(1, 201):
  pipeline = make_pipeline(
      StandardScaler(),
      KNeighborsRegressor(n_neighbors=i)
  )
  scores.append(-cross_val_score(
      pipeline,
      X=energy_X,
      y=energy_y_log,
      scoring="neg_root_mean_squared_error",
      cv=10
  ).mean())
px.scatter(x=[i for i in range(1, 201)], y=scores, 
           title="Test Error for K-Nearest Neighbors Regressor with Energy",
           labels={"x":"Neighbors", "y":"Log Test Error"})

Test error is lowest for 45 neighbors

### Linear Regression

In [27]:
pipeline = make_pipeline(
      StandardScaler(),
      LinearRegression()
  )
-cross_val_score(
    pipeline,
    X=energy_X,
    y=energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

786714386.2282263

In [28]:
no_energy_y_log = np.log10(no_energy_y)
pipeline = make_pipeline(
      StandardScaler(),
      LinearRegression()
  )
-cross_val_score(
    pipeline,
    X=energy_X,
    y=energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.4233911748629192

### Simple Ensemble

In [29]:
-cross_val_score(
    Voter,
    X=energy_X,
    y=energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

554765817.2341408

In [30]:
-cross_val_score(
    Stack,
    X=energy_X,
    y=energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

2074944649688378.0

In [31]:
-cross_val_score(
    Voter,
    X=energy_X,
    y=energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.37375262473283855

In [32]:
-cross_val_score(
    Stack,
    X=energy_X,
    y=energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.38475092006757355

### XGBoost

In [33]:
pipeline = make_pipeline(
      StandardScaler(),
      XGBRegressor()
  )
-cross_val_score(
    pipeline,
    X=energy_X,
    y=energy_y,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

637011817.307306

In [34]:
params = {"xgbregressor__eta":uniform(0.01, 0.2),
          "xgbregressor__n_estimators":randint(100, 500),
          "xgbregressor__max_depth":randint(3, 10),
          "xgbregressor__min_child_weight":uniform(3, 20),
          "xgbregressor__reg_alpha":uniform(0.01, 0.2),
          "xgbregressor__reg_lambda":uniform(0.01, 0.2)}

rscv = RandomizedSearchCV(
    pipeline,
    params,
    n_iter=100,
    scoring="neg_root_mean_squared_error",
    n_jobs=2,
    cv=10
)

rscv.fit(energy_X, energy_y)

-rscv.best_score_, rscv.best_params_

(529319164.9998269,
 {'xgbregressor__eta': 0.15400046627728453,
  'xgbregressor__max_depth': 5,
  'xgbregressor__min_child_weight': 22.261072499801944,
  'xgbregressor__n_estimators': 226,
  'xgbregressor__reg_alpha': 0.11829362552205923,
  'xgbregressor__reg_lambda': 0.04359796187841886})

In [35]:
pipeline = make_pipeline(
      StandardScaler(),
      XGBRegressor()
  )
-cross_val_score(
    pipeline,
    X=energy_X,
    y=energy_y_log,
    scoring="neg_root_mean_squared_error",
    cv=10
).mean()

0.26846817390691047

In [36]:
params = {"xgbregressor__eta":uniform(0.01, 0.2),
          "xgbregressor__n_estimators":randint(100, 500),
          "xgbregressor__max_depth":randint(3, 10),
          "xgbregressor__min_child_weight":uniform(3, 20),
          "xgbregressor__reg_alpha":uniform(0.01, 0.2),
          "xgbregressor__reg_lambda":uniform(0.01, 0.2)}

rscv = RandomizedSearchCV(
    pipeline,
    params,
    n_iter=100,
    scoring="neg_root_mean_squared_error",
    n_jobs=2,
    cv=10
)

rscv.fit(energy_X, energy_y_log)

-rscv.best_score_, rscv.best_params_

(0.2587638665493514,
 {'xgbregressor__eta': 0.1353103977798682,
  'xgbregressor__max_depth': 3,
  'xgbregressor__min_child_weight': 19.88381150895329,
  'xgbregressor__n_estimators': 285,
  'xgbregressor__reg_alpha': 0.12164817160295653,
  'xgbregressor__reg_lambda': 0.19695339589192953})

In [37]:
np.sqrt(((energy_y_log - energy_y_log.mean()) ** 2).mean())

0.5918422657788786