# Multiple Linear Regression Model

In [17]:
# Installations
# !pip install keras
# !pip install tensorflow
# !pip install sklearn --upgrade
# !pip install joblib

In [18]:
import warnings
warnings.simplefilter('ignore')

# Load dependencies
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set the same starting seed number for numpy.random so the results are reproducible
from numpy.random import seed
seed(42)

## Basic Data Cleaning

In [19]:
# Read and clean the data
# filepath = os.path.join('..','resources','WDI_csv','WDIData.csv')
filepath = os.path.join('..','resources','new_df.csv')
df = pd.read_csv(filepath)
df.head()

Unnamed: 0.1,Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Value,HDI
0,0,South Asia,SAS,"Population, total",SP.POP.TOTL,1990,1133495000.0,0.441
1,1,South Asia,SAS,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1990,58.1448,0.441
2,2,South Asia,SAS,GDP (current US$),NY.GDP.MKTP.CD,1990,407227000000.0,0.441
3,4,South Asia,SAS,"Agriculture, forestry, and fishing, value adde...",NV.AGR.TOTL.ZS,1990,26.9171,0.441
4,5,Sub-Saharan Africa,SSF,"Population, total",SP.POP.TOTL,1990,509451900.0,0.402


In [20]:
# Pivot table with indicators/features as column names
pivot_df = df.pivot_table(index=["Country Name","Year","HDI"], columns=["Indicator Name"], values=["Value"])

# Flatten pivot table
pivot_df.columns = pivot_df.columns.to_series().str.join(' ')
pivot_df.reset_index(inplace=True)

# Rename columns
pivot_df = pivot_df.rename(columns={"Value Agriculture, forestry, and fishing, value added (% of GDP)": "Agriculture (% GDP)",
                                    "Value Employers, total (% of total employment) (modeled ILO estimate)": "Employment (%)",
                                    "Value GDP (current US$)": "GDP (USD)",
                                    "Value Life expectancy at birth, total (years)": "Life expectancy",
                                    "Value Population, total": "Population"})

# Drop rows with null values
print(pivot_df.shape)
pivot_df = pivot_df.dropna()
print(pivot_df.shape)
pivot_df.drop(pivot_df[pivot_df.HDI == ".."].index, inplace=True)
print(pivot_df.shape)

pivot_df["HDI"] = pivot_df["HDI"].astype('float')

pivot_df.reset_index(inplace=True)
pivot_df.head()

(4988, 8)
(3881, 8)
(3652, 8)


Unnamed: 0,index,Country Name,Year,HDI,Agriculture (% GDP),Employment (%),GDP (USD),Life expectancy,Population
0,12,Afghanistan,2002,0.378,38.627892,0.521,4055177000.0,56.784,22600770.0
1,13,Afghanistan,2003,0.387,37.418855,0.481,4515563000.0,57.271,23680871.0
2,14,Afghanistan,2004,0.4,29.721067,0.481,5226775000.0,57.772,24726684.0
3,15,Afghanistan,2005,0.41,31.114855,0.489,6209140000.0,58.29,25654277.0
4,16,Afghanistan,2006,0.419,28.635969,0.487,6971287000.0,58.826,26433049.0


In [21]:
# Specify indicator(s)
indicators = ["Life expectancy",
              "Agriculture (% GDP)",
              "Population",
              "GDP (USD)",
              "Employment (%)"]

# Initialize features array
X = []

# For each row in the df
for row in range(len(pivot_df)):

    point = []

    # Append each indicator value to the data point
    for i in range(len(indicators)):
        point.append(pivot_df[indicators[i]][row])
        
    # Append the row to the features array
    X.append(point)

X

[[56.784, 38.62789186, 22600770.0, 4055176933.0, 0.521000028],
 [57.271, 37.41885544, 23680871.0, 4515563414.0, 0.481000006],
 [57.772, 29.72106714, 24726684.0, 5226775163.0, 0.481000006],
 [58.29, 31.11485491, 25654277.0, 6209140044.0, 0.488999993],
 [58.826, 28.63596858, 26433049.0, 6971286732.0, 0.486999989],
 [59.375, 30.10501136, 27100536.0, 9747886105.0, 0.492000014],
 [59.93, 24.89227001, 27722276.0, 10109218068.0, 0.488000005],
 [60.484, 29.29750105, 28394813.0, 12439087077.0, 0.497000009],
 [61.028, 26.21006854, 29185507.0, 15856574731.0, 0.49000001],
 [61.553, 23.74366399, 30117413.0, 17804280538.0, 0.493000001],
 [62.054, 24.39087363, 31161376.0, 20001615789.0, 0.499000013],
 [62.525, 22.81066274, 32269589.0, 20561054090.0, 0.4959999920000001],
 [62.966, 22.13704137, 33370794.0, 20484873230.0, 0.497999996],
 [63.377, 20.63432272, 34413603.0, 19907111419.0, 0.501999974],
 [63.76300000000001,
  21.08108621,
  35383128.0,
  19362642267.0,
  0.5059999820000001],
 [64.13, 20.4665

In [22]:
# Flatten the data into arrays
X = np.array(X)

y = np.array(pivot_df["HDI"])
y = y.reshape(-1, 1)

print(X.shape)
print(y.shape)

(3652, 5)
(3652, 1)


## Plot the Data

In [23]:
# fig, ax = plt.subplots()
# ax.scatter(X, y)
# plt.xlabel(indicator)
# plt.ylabel("HDI")
# plt.show()

## Data Preprocessing for ML

In [24]:
# Split into Test and Train data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [25]:
# Scale the data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [26]:
# Transform the training and test data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

## Creating the Model

### Multiple Linear Regression Model

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [28]:
# Create the model using LinearRegression
linear = LinearRegression()

# Train the model
linear.fit(X_train_scaled, y_train_scaled)

# Use our model to make predictions
predictions = linear.predict(X_test_scaled)

# Score the model
# training_score = model.score(X_train_scaled, y_train_scaled)
# testing_score = model.score(X_test_scaled, y_test_scaled)
r2 = linear.score(X_test_scaled, y_test_scaled)
mse = mean_squared_error(y_test_scaled, predictions)

# print(f"Training Score: {training_score}")
# print(f"Testing Score: {testing_score}")
print(f"R2 Score: {r2}")
print(f"Mean Squared Error (MSE): {mse}")
# print(f"Linear Coefficient: {linear.coef_[0][0]}")
# print(f"y-Axis Intercept: {linear.intercept_[0]}")

R2 Score: 0.8903809232672542
Mean Squared Error (MSE): 0.1059474383320401


### Predicting HDI value

In [29]:
feed_values = [57, 39, 22600770, 4055177000, 0.5]

X_scaled = X_scaler.transform([feed_values])
y_scaled = linear.predict(X_scaled)
predicted_y = y_scaler.inverse_transform(y_scaled)
predicted_HDI = predicted_y[0][0]

predicted_HDI

0.4051112902285737

### Plot the Model

In [30]:
# # Plot the model
# x_min = np.array([[X_train_scaled.min()]])
# x_max = np.array([[X_train_scaled.max()]])
# y_min = linear.predict(x_min)
# y_max = linear.predict(x_max)
# plt.scatter(X_train_scaled, y_train_scaled, c='blue')
# plt.plot([x_min[0], x_max[0]], [y_min[0], y_max[0]], c='red')
# plt.title("Linear Regression Model")
# plt.xlabel(f"{indicator} (Scaled)")
# plt.ylabel("HDI (Scaled)")
# plt.show()

### Plot the Residuals

In [31]:
# # Plot the residuals
# y_train_predict = linear.predict(X_train_scaled)
# y_test_predict = linear.predict(X_test_scaled)
# y_train_residual = linear.predict(X_train_scaled) - y_train_scaled
# y_test_residual = linear.predict(X_test_scaled) - y_test_scaled

# plt.scatter(y_train_predict, y_train_residual, c="blue", label="Training Data")
# plt.scatter(y_test_predict, y_test_residual, c="orange", label="Testing Data")
# plt.legend()
# plt.hlines(y=0, xmin=y_train_predict.min(), xmax=y_train_predict.max())
# plt.title("Residual Plot")
# plt.xlabel("Prediction")
# plt.ylabel("Residual")
# plt.show()