In [1]:
# Import libraries and dependencies
import numpy as np
import pandas as pd

In [2]:
# ------------------------------ Data Set Loading ------------------------------
# Read data set
df = pd.read_csv('testdata.csv')
df.head()

Unnamed: 0,GDP growth (annual %),Urban population,Total Energy
0,7.83,189947471,19.1
1,5.11,199949784,18.97
2,9.02,210823843,19.94
3,10.77,220472140,21.28
4,15.19,230206255,23.05


In [10]:
# ------------------------------- Data Cleaning --------------------------------

# Remove null values
df.dropna(inplace = True)

# Specify the features columns
X = df.drop(columns = [df.columns[-1]])

# Specify the target column
y = df.iloc[:,-1]

# Transform non-numerical columns into binary-type columns
X = pd.get_dummies(X)

In [11]:
# ----------------------------- Data Preprocessing -----------------------------

# Import train_test_split class
from sklearn.model_selection import train_test_split

# Divide data set into traning and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

In [12]:
# ------------------------------- Model Building -------------------------------

# Import machine learning model class
from sklearn.ensemble import RandomForestRegressor

# Instantiate machine learning model
rf = RandomForestRegressor()

# Fit the machine learning model with the training data
rf.fit(X_train, y_train)

# Make predictions using the testing data
y_pred = rf.predict(X_test)


In [13]:
# ------------------------------ Model Evaluation ------------------------------

# Get the coefficient of determination R2
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

# Get the Mean Squared Error
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_pred))

# Get the Root Mean Squared Error
print(np.sqrt(mean_squared_error(y_test, y_pred)))

0.9985528989697123
3.187322612499949
1.7853074280078345


In [24]:
# ------------------------------ Visualization  ------------------------------
#  plot a 3D chart where axis x and y would correspond to feature 1 and feature 2,
# and the z axis the y dataset.


# Plotting the clusters with three features
import plotly.express as px
import hvplot.pandas

fig = px.scatter_3d(df, x="GDP growth (annual %)", y=" Urban population ", z="Total Energy", color="Total Energy", size=" Urban population ", width=800)
#fig.update_layout(legend=dict(x=0,y=1))
fig.show()