## Libraries

In [1]:
# importing necessary libraries
import pandas as pd  # data analysis
import numpy as np  # mathematic evaluations
#from sklearn.preprocessing import MinMaxScaler # for data scaling
#from sklearn.preprocessing import RobustScaler # for data scaling
from sklearn.preprocessing import StandardScaler # for data scaling
#from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.utils.class_weight import compute_sample_weight # for changing weight of the columns
from statsmodels.tsa.arima.model import ARIMA
#from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # data visualization

## Imports

In [2]:
df_merged = pd.read_csv('C:/Users/lluis/Desktop/Documents/IronHack/Final_Project/data/cleaned/df_merged.csv')
df_exchange_rate = pd.read_csv('C:/Users/lluis/Desktop/Documents/IronHack/Final_Project/data/cleaned/df_exchange_rate.csv')

### Transforming Data for ML

In [3]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 59 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_name      175 non-null    object 
 1   un_class_2014     150 non-null    object 
 2   imf_class_2023    172 non-null    object 
 3   g7                175 non-null    bool   
 4   eu_member         175 non-null    bool   
 5   fuel_exp_country  175 non-null    bool   
 6   wealth_rank       161 non-null    float64
 7   ISO2              174 non-null    object 
 8   ISO3              175 non-null    object 
 9   1980              126 non-null    float64
 10  1981              127 non-null    float64
 11  1982              127 non-null    float64
 12  1983              127 non-null    float64
 13  1984              127 non-null    float64
 14  1985              128 non-null    float64
 15  1986              129 non-null    float64
 16  1987              129 non-null    float64
 1

In [4]:
# First I am dropping the columns I will not use

df_merged_pred = df_merged.drop(['un_class_2014', 'g7', 'eu_member', 'fuel_exp_country', 'wealth_rank', 'imf_class_2023', 'ISO2', 'ISO3', '2025', '2026', '2027', '2028','2029'], axis=1)

### Dealing with null values

In [5]:
# We will estimate the NaN values based on other values in the same column (year)
df_merged_pred = df_merged_pred.interpolate(axis=0)

  df_merged_pred = df_merged_pred.interpolate(axis=0)


### Data Scaling

Selecting the countries for which I want to predict GDP per capita with ARIMA

In [6]:
# Create a new DataFrame with only rows where 'country_name' is 'Spain'
df_spain = df_merged_pred[df_merged_pred['country_name'] == 'Spain'].T
# Drop the first row
df_spain = df_spain.drop(df_spain.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'Switzerland'
df_switzerland = df_merged_pred[df_merged_pred['country_name'] == 'Switzerland'].T
# Drop the first row
df_switzerland = df_switzerland.drop(df_switzerland.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'United States'
df_usa = df_merged_pred[df_merged_pred['country_name'] == 'United States'].T
# Drop the first row
df_usa = df_usa.drop(df_usa.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'India'
df_india = df_merged_pred[df_merged_pred['country_name'] == 'India'].T
# Drop the first row
df_india = df_india.drop(df_india.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'Venezuela'
df_venezuela = df_merged_pred[df_merged_pred['country_name'] == 'Venezuela'].T
# Drop the first row
df_venezuela = df_venezuela.drop(df_venezuela.index[0])

In [7]:
# Create a StandardScaler instance for each of the countries
scaler1 = StandardScaler()
scaler2 = StandardScaler()
scaler3 = StandardScaler()
scaler4 = StandardScaler()
scaler5 = StandardScaler()

# Fit the scaler to the data and transform the data
df_spain_scaled = pd.DataFrame(scaler1.fit_transform(df_spain), columns=df_spain.columns, index=df_spain.index)
df_switzerland_scaled = pd.DataFrame(scaler2.fit_transform(df_switzerland), columns=df_switzerland.columns, index=df_switzerland.index)
df_usa_scaled = pd.DataFrame(scaler3.fit_transform(df_usa), columns=df_usa.columns, index=df_usa.index)
df_india_scaled = pd.DataFrame(scaler4.fit_transform(df_india), columns=df_india.columns, index=df_india.index)
df_venezuela_scaled = pd.DataFrame(scaler5.fit_transform(df_venezuela), columns=df_venezuela.columns, index=df_venezuela.index)

### ARIMA Prediction

ARIMA prediction for Spain

In [8]:
# We want to predict the next 5 columns of df_spain
y = df_spain_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,1)[12]             : AIC=-26.474, Time=32.44 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=-27.690, Time=0.03 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=-28.411, Time=13.27 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=-28.460, Time=46.65 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=-28.280, Time=4.59 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=-26.757, Time=46.84 sec
 ARIMA(0,1,1)(0,1,2)[12]             : AIC=-26.757, Time=39.01 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=-28.740, Time=17.16 sec
 ARIMA(0,1,1)(2,1,0)[12]             : AIC=-26.757, Time=28.62 sec
 ARIMA(0,1,1)(2,1,1)[12]             : AIC=-24.757, Time=50.75 sec
 ARIMA(0,1,0)(1,1,0)[12]             : AIC=-28.539, Time=9.90 sec
 ARIMA(1,1,1)(1,1,0)[12]             : AIC=-26.763, Time=55.43 sec
 ARIMA(0,1,2)(1,1,0)[12]             : AIC=-26.775, Time=23.30 sec
 ARIMA(1,1,2)(1,1,0)[12]             : AIC=-25.023, Time=80.91 sec
 ARIMA(0,1,1)(1,1,0)[1

In [9]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

  y_train, y_test = y[train_index], y[test_index]


MAE: 0.2501697625435116
MSE: 0.08214260209910619
RMSE: 0.2866053071719123
R2 Score: 0.4599458440052079


  y_train, y_test = y[train_index], y[test_index]


MAE: 0.2415005015967703
MSE: 0.11679653056011245
RMSE: 0.3417550739347003
R2 Score: 0.6146746418650428


Interpretation:

- MAE, MSE, and RMSE: These metrics appear to be in a reasonable range, but their adequacy depends on the context, including the scale of the data and the specific problem domain. In isolation, they don't tell you whether the model is good or bad; they need to be compared to the baseline errors (e.g., errors obtained from a simple model like mean prediction).

- R² Score: The negative R² score is concerning. It suggests that the model is not performing well and is worse than a trivial model that would predict the mean of the target variable for all observations. This indicates that there may be significant issues with the model, such as overfitting, underfitting, incorrect model assumptions, or issues with the data itself.

In [10]:
# Now to descale your predictions
next_values_descaled = scaler1.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

[[53196.52168144]
 [54130.05591264]
 [55297.28060515]
 [57196.2362547 ]
 [59151.81642286]]


We add the predicted data back to spain_df

In [11]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_spain.loc[year] = next_values_descaled[i]

ARIMA prediction for Switzerland

In [None]:
# We want to predict the next 5 columns of df_switzerland
y = df_switzerland_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,1)[12]             : AIC=-51.219, Time=70.34 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=-47.046, Time=0.03 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=-51.311, Time=21.94 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=-52.861, Time=28.44 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=-49.433, Time=3.61 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=-51.422, Time=41.45 sec
 ARIMA(0,1,1)(0,1,2)[12]             : AIC=-51.422, Time=78.89 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=-52.960, Time=33.98 sec


In [None]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

In [None]:
# Now to descale your predictions
next_values_descaled = scaler2.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

In [None]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_switzerland.loc[year] = next_values_descaled[i]

ARIMA prediction for USA

In [None]:
# We want to predict the next 5 columns of df_usa
y = df_usa_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

In [None]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

In [None]:
# Now to descale your predictions
next_values_descaled = scaler3.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

In [None]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_usa.loc[year] = next_values_descaled[i]

ARIMA prediction for India

In [None]:
# We want to predict the next 5 columns of df_india
y = df_india_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

In [None]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

In [None]:
# Now to descale your predictions
next_values_descaled = scaler4.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

In [None]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_india.loc[year] = next_values_descaled[i]

ARIMA prediction for Venezuela

In [None]:
# We want to predict the next 5 columns of df_venezuela
y = df_venezuela_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

In [None]:
# Split the data into training and test sets
y_train, y_test = train_test_split(y, test_size=0.2, shuffle=False)

# Fit the model on the training data
model_fit = model.fit(y_train)

# Make predictions on the test data
y_pred = model_fit.predict(n_periods=len(y_test))

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

In [None]:
# Now to descale your predictions
next_values_descaled = scaler4.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

In [None]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_venezuela.loc[year] = next_values_descaled[i]

### Predictions Comparison

First we create a pivot table with the forecast of ECB for the next years

In [None]:
# List of years from 1993 to 2029
years = list(map(str, range(1993, 2030)))

# List of countries
countries = ['Spain', 'United States', 'India', 'Switzerland', 'Venezuela']

# Filter the DataFrame to include only the countries in the list
df_filtered = df_merged[df_merged['country_name'].isin(countries)]

# Set 'country_name' as the index and select only the columns for the years
pivot_table_ecb = df_filtered.set_index('country_name')[years]

# Display the pivot table
print(pivot_table_ecb)

Plotting ECB Predictions

In [None]:
# Melt the DataFrame
df_melted = pd.melt(pivot_table_ecb.reset_index(), id_vars='country_name', var_name='Year', value_name='Value')

# Convert 'Year' to numeric
df_melted['Year'] = pd.to_numeric(df_melted['Year'])

# Create a line plot for the years before 2025
sns.lineplot(data=df_melted[df_melted['Year'] < 2025], x='Year', y='Value', hue='country_name')

# Create a line plot for the years from 2025 onwards with a different line style
sns.lineplot(data=df_melted[df_melted['Year'] >= 2025], x='Year', y='Value', hue='country_name', style=True, dashes=[(2,2)])

plt.show()

We create a pivot table with our predictions in the same format

In [None]:
# Transpose df_usa
df_spain_p = df_spain.T

# List of columns to drop
columns_to_drop = ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992']

# Drop the columns
df_spain_p = df_spain_p.drop(columns=columns_to_drop)

# Add a column named 'country_name' at the beginning and set its value to 'United States'
df_spain_p.insert(0, 'country_name', 'Spain')

# Set 'country_name' as the index
df_spain_p = df_spain_p.set_index('country_name')

In [None]:
# Transpose df_usa
df_usa_p = df_usa.T

# List of columns to drop
columns_to_drop = ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992']

# Drop the columns
df_usa_p = df_usa_p.drop(columns=columns_to_drop)

# Add a column named 'country_name' at the beginning and set its value to 'United States'
df_usa_p.insert(0, 'country_name', 'United States')

# Set 'country_name' as the index
df_usa_p = df_usa_p.set_index('country_name')

In [None]:
# Transpose df_switzerland
df_switzerland_p = df_switzerland.T

# List of columns to drop
columns_to_drop = ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992']

# Drop the columns
df_switzerland_p = df_switzerland_p.drop(columns=columns_to_drop)

# Add a column named 'country_name' at the beginning and set its value to 'United States'
df_switzerland_p.insert(0, 'country_name', 'Switzerland')

# Set 'country_name' as the index
df_switzerland_p = df_switzerland_p.set_index('country_name')

In [None]:
# Transpose df_india
df_india_p = df_india.T

# List of columns to drop
columns_to_drop = ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992']

# Drop the columns
df_india_p = df_india_p.drop(columns=columns_to_drop)

# Add a column named 'country_name' at the beginning and set its value to 'United States'
df_india_p.insert(0, 'country_name', 'India')

# Set 'country_name' as the index
df_india_p = df_india_p.set_index('country_name')

In [None]:
# Transpose df_venezuela
df_venezuela_p = df_venezuela.T

# List of columns to drop
columns_to_drop = ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992']

# Drop the columns
df_venezuela_p = df_venezuela_p.drop(columns=columns_to_drop)

# Add a column named 'country_name' at the beginning and set its value to 'United States'
df_venezuela_p.insert(0, 'country_name', 'Venezuela')

# Set 'country_name' as the index
df_venezuela_p = df_venezuela_p.set_index('country_name')

In [None]:
# Concatenate the dataframes
df_p = pd.concat([df_spain_p, df_usa_p, df_switzerland_p, df_india_p, df_venezuela_p])

In [None]:
df_p = df_p.drop(columns=['index'])

In [None]:
# Convert all columns to numerical
df_p = df_p.apply(pd.to_numeric, errors='coerce')

In [None]:
# Melt the DataFrame
df_melted = pd.melt(df_p.reset_index(), id_vars='country_name', var_name='Year', value_name='Value')

# Convert 'Year' to integer
df_melted['Year'] = df_melted['Year'].astype(int)

# Create a line plot for years before 2025
sns.lineplot(data=df_melted[df_melted['Year'] < 2025], x='Year', y='Value', hue='country_name')

# Create a dashed line plot for years from 2025 onwards
sns.lineplot(data=df_melted[df_melted['Year'] >= 2025], x='Year', y='Value', hue='country_name', style='country_name', dashes=True)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Melt the DataFrame
df_melted = pd.melt(df_p.reset_index(), id_vars='country_name', var_name='Year', value_name='Value')

# Convert 'Year' to integer
df_melted['Year'] = df_melted['Year'].astype(int)

# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(20, 10))

# Create a line plot for all years in the first subplot
sns.lineplot(data=df_melted, x='Year', y='Value', hue='country_name', ax=axs[0])
axs[0].set_title('All Years')

# Create a line plot for years before 2025 and a dashed line plot for years from 2025 onwards in the second subplot
sns.lineplot(data=df_melted[df_melted['Year'] < 2025], x='Year', y='Value', hue='country_name', ax=axs[1])
sns.lineplot(data=df_melted[df_melted['Year'] >= 2025], x='Year', y='Value', hue='country_name', style='country_name', dashes=True, ax=axs[1])
axs[1].set_title('Years < 2025 and Years >= 2025')

plt.show()