In [3]:
# Import libraries and dependencies
import numpy as np
import pandas as pd

In [4]:
# ------------------------------ Data Set Loading ------------------------------
# Read data set
df = pd.read_csv('testdata.csv')
df

Unnamed: 0,Year,GDP growth (annual %),% Urban,Total Energy (5 yr)
0,1980,7.83,19%,24.36
1,1981,5.11,20%,25.03
2,1982,9.02,21%,26.38
3,1983,10.77,22%,28.17
4,1984,15.19,22%,28.32
5,1985,13.43,23%,28.37
6,1986,8.95,24%,28.72
7,1987,11.66,24%,30.07
8,1988,11.22,25%,32.38
9,1989,4.21,26%,34.81


In [5]:
# ------------------------------- Data Cleaning --------------------------------

# Remove null values
df.dropna(inplace = True)

# Specify the features columns
X = df.drop(columns = [df.columns[-1]])

# Specify the target column
y = df.iloc[:,-1]

# Transform non-numerical columns into binary-type columns
X = pd.get_dummies(X)

In [6]:
# ----------------------------- Data Preprocessing -----------------------------

# Import train_test_split class
from sklearn.model_selection import train_test_split

# Divide data set into traning and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

In [7]:
# ------------------------------- Model Building -------------------------------

# Import machine learning model class
from sklearn.ensemble import RandomForestRegressor

# Instantiate machine learning model
rf = RandomForestRegressor()

# Fit the machine learning model with the training data
rf.fit(X_train, y_train)

# Make predictions using the testing data
y_pred = rf.predict(X_test)


In [15]:
# Make predictions using ALL years
y_pred = rf.predict(X)
y_pred

array([ 26.1628,  26.746 ,  26.4004,  27.3179,  28.3498,  28.5032,
        28.5539,  29.6854,  31.239 ,  31.9123,  38.4263,  38.529 ,
        38.3583,  38.8312,  40.0585,  42.0426,  42.4612,  45.9571,
        51.9243,  64.603 ,  70.2639,  74.7359,  90.8506,  96.2056,
       100.9206, 112.2777, 118.9361, 128.0432, 135.0377, 137.3223,
       135.7607, 137.8562, 142.2881, 145.2913, 148.6804])

In [17]:
# Add predictive variables to database
df["Predicted Energy"] = y_pred
df

Unnamed: 0,Year,GDP growth (annual %),% Urban,Total Energy (5 yr),Predicted Energy
0,1980,7.83,19%,24.36,26.1628
1,1981,5.11,20%,25.03,26.746
2,1982,9.02,21%,26.38,26.4004
3,1983,10.77,22%,28.17,27.3179
4,1984,15.19,22%,28.32,28.3498
5,1985,13.43,23%,28.37,28.5032
6,1986,8.95,24%,28.72,28.5539
7,1987,11.66,24%,30.07,29.6854
8,1988,11.22,25%,32.38,31.239
9,1989,4.21,26%,34.81,31.9123


In [8]:
# ------------------------------ Model Evaluation ------------------------------

# Get the coefficient of determination R2
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

# Get the Mean Squared Error
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_pred))

# Get the Root Mean Squared Error
print(np.sqrt(mean_squared_error(y_test, y_pred)))

0.956927848649268
12.25878679571454
3.501255031515776


In [19]:
# Bomin

import plotly.express as px
import hvplot.pandas

fig = px.line(df,x="Year", y=["Total Energy (5 yr)", "Predicted Energy"])
fig.show()

In [None]:
# ------------------------------ Visualization  ------------------------------
#  plot a 3D chart where axis x and y would correspond to feature 1 and feature 2,
# and the z axis the y dataset.


# Plotting the clusters with three features
import plotly.express as px
import hvplot.pandas

fig = px.scatter_3d(df, x="GDP growth (annual %)", y="% Urban", z="Total Energy", color="Total Energy", size=" Urban population ", width=800)
#fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [None]:
# export dataframe to csv
