## Libraries

In [1]:
# importing necessary libraries
import pandas as pd  # data analysis
import numpy as np  # mathematic evaluations
#from sklearn.preprocessing import MinMaxScaler # for data scaling
#from sklearn.preprocessing import RobustScaler # for data scaling
from sklearn.preprocessing import StandardScaler # for data scaling
#from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.utils.class_weight import compute_sample_weight # for changing weight of the columns
from statsmodels.tsa.arima.model import ARIMA
#from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # data visualization

## Imports

In [2]:
df_merged = pd.read_csv('C:/Users/lluis/Desktop/Documents/IronHack/Final_Project/data/cleaned/df_merged.csv')
df_exchange_rate = pd.read_csv('C:/Users/lluis/Desktop/Documents/IronHack/Final_Project/data/cleaned/df_exchange_rate.csv')

### Transforming Data for ML

In [3]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 59 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_name      175 non-null    object 
 1   un_class_2014     150 non-null    object 
 2   imf_class_2023    172 non-null    object 
 3   g7                175 non-null    bool   
 4   eu_member         175 non-null    bool   
 5   fuel_exp_country  175 non-null    bool   
 6   wealth_rank       161 non-null    float64
 7   ISO2              174 non-null    object 
 8   ISO3              175 non-null    object 
 9   1980              126 non-null    float64
 10  1981              127 non-null    float64
 11  1982              127 non-null    float64
 12  1983              127 non-null    float64
 13  1984              127 non-null    float64
 14  1985              128 non-null    float64
 15  1986              129 non-null    float64
 16  1987              129 non-null    float64
 1

In [4]:
# First I am dropping the columns I will not use

df_merged_pred = df_merged.drop(['un_class_2014', 'g7', 'eu_member', 'fuel_exp_country', 'wealth_rank', 'imf_class_2023', 'ISO2', 'ISO3', '2025', '2026', '2027', '2028','2029'], axis=1)

### Dealing with null values

In [5]:
# We will estimate the NaN values based on other values in the same column (year)
df_merged_pred = df_merged_pred.interpolate(axis=0)

  df_merged_pred = df_merged_pred.interpolate(axis=0)


### Data Scaling

Selecting the countries for which I want to predict GDP per capita with ARIMA

In [6]:
# Create a new DataFrame with only rows where 'country_name' is 'Spain'
df_spain = df_merged_pred[df_merged_pred['country_name'] == 'Spain'].T
# Drop the first row
df_spain = df_spain.drop(df_spain.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'Switzerland'
df_switzerland = df_merged_pred[df_merged_pred['country_name'] == 'Switzerland'].T
# Drop the first row
df_switzerland = df_switzerland.drop(df_switzerland.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'United States'
df_usa = df_merged_pred[df_merged_pred['country_name'] == 'United States'].T
# Drop the first row
df_usa = df_usa.drop(df_usa.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'India'
df_india = df_merged_pred[df_merged_pred['country_name'] == 'India'].T
# Drop the first row
df_india = df_india.drop(df_india.index[0])

# Create a new DataFrame with only rows where 'country_name' is 'Venezuela'
df_venezuela = df_merged_pred[df_merged_pred['country_name'] == 'Venezuela'].T
# Drop the first row
df_venezuela = df_venezuela.drop(df_venezuela.index[0])

In [7]:
# Create a StandardScaler instance for each of the countries
scaler1 = StandardScaler()
scaler2 = StandardScaler()
scaler3 = StandardScaler()
scaler4 = StandardScaler()
scaler5 = StandardScaler()

# Fit the scaler to the data and transform the data
df_spain_scaled = pd.DataFrame(scaler1.fit_transform(df_spain), columns=df_spain.columns, index=df_spain.index)
df_switzerland_scaled = pd.DataFrame(scaler2.fit_transform(df_switzerland), columns=df_switzerland.columns, index=df_switzerland.index)
df_usa_scaled = pd.DataFrame(scaler3.fit_transform(df_usa), columns=df_usa.columns, index=df_usa.index)
df_india_scaled = pd.DataFrame(scaler4.fit_transform(df_india), columns=df_india.columns, index=df_india.index)
df_venezuela_scaled = pd.DataFrame(scaler5.fit_transform(df_venezuela), columns=df_venezuela.columns, index=df_venezuela.index)

### ARIMA Prediction

ARIMA prediction for Spain

In [8]:
# We want to predict the next 5 columns of df_spain
y = df_spain_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,1)[12]             : AIC=-26.474, Time=32.44 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=-27.690, Time=0.03 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=-28.411, Time=13.27 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=-28.460, Time=46.65 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=-28.280, Time=4.59 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=-26.757, Time=46.84 sec
 ARIMA(0,1,1)(0,1,2)[12]             : AIC=-26.757, Time=39.01 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=-28.740, Time=17.16 sec
 ARIMA(0,1,1)(2,1,0)[12]             : AIC=-26.757, Time=28.62 sec
 ARIMA(0,1,1)(2,1,1)[12]             : AIC=-24.757, Time=50.75 sec
 ARIMA(0,1,0)(1,1,0)[12]             : AIC=-28.539, Time=9.90 sec
 ARIMA(1,1,1)(1,1,0)[12]             : AIC=-26.763, Time=55.43 sec
 ARIMA(0,1,2)(1,1,0)[12]             : AIC=-26.775, Time=23.30 sec
 ARIMA(1,1,2)(1,1,0)[12]             : AIC=-25.023, Time=80.91 sec
 ARIMA(0,1,1)(1,1,0)[1

In [9]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

  y_train, y_test = y[train_index], y[test_index]


MAE: 0.2501697625435116
MSE: 0.08214260209910619
RMSE: 0.2866053071719123
R2 Score: 0.4599458440052079


  y_train, y_test = y[train_index], y[test_index]


MAE: 0.2415005015967703
MSE: 0.11679653056011245
RMSE: 0.3417550739347003
R2 Score: 0.6146746418650428


Interpretation:

- MAE, MSE, and RMSE: These metrics appear to be in a reasonable range, but their adequacy depends on the context, including the scale of the data and the specific problem domain. In isolation, they don't tell you whether the model is good or bad; they need to be compared to the baseline errors (e.g., errors obtained from a simple model like mean prediction).

- R² Score: The negative R² score is concerning. It suggests that the model is not performing well and is worse than a trivial model that would predict the mean of the target variable for all observations. This indicates that there may be significant issues with the model, such as overfitting, underfitting, incorrect model assumptions, or issues with the data itself.

In [10]:
# Now to descale your predictions
next_values_descaled = scaler1.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

[[53196.52168144]
 [54130.05591264]
 [55297.28060515]
 [57196.2362547 ]
 [59151.81642286]]


We add the predicted data back to spain_df

In [11]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_spain.loc[year] = next_values_descaled[i]

ARIMA prediction for Switzerland

In [12]:
# We want to predict the next 5 columns of df_switzerland
y = df_switzerland_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,1)[12]             : AIC=-51.219, Time=70.34 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=-47.046, Time=0.03 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=-51.311, Time=21.94 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=-52.861, Time=28.44 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=-49.433, Time=3.61 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=-51.422, Time=41.45 sec
 ARIMA(0,1,1)(0,1,2)[12]             : AIC=-51.422, Time=78.89 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=-52.960, Time=33.98 sec
 ARIMA(0,1,1)(2,1,0)[12]             : AIC=-51.422, Time=93.23 sec
 ARIMA(0,1,1)(2,1,1)[12]             : AIC=-49.422, Time=75.75 sec
 ARIMA(0,1,0)(1,1,0)[12]             : AIC=-49.283, Time=17.23 sec
 ARIMA(1,1,1)(1,1,0)[12]             : AIC=-51.178, Time=47.58 sec
 ARIMA(0,1,2)(1,1,0)[12]             : AIC=-51.155, Time=33.88 sec
 ARIMA(1,1,2)(1,1,0)[12]             : AIC=-48.963, Time=41.81 sec
 ARIMA(0,1,1)(1,1,0)[

In [13]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

  y_train, y_test = y[train_index], y[test_index]


MAE: 0.37673565858625946
MSE: 0.2379596589380115
RMSE: 0.4878110893963067
R2 Score: -0.9758791942434588


  y_train, y_test = y[train_index], y[test_index]


MAE: 0.4483598516353471
MSE: 0.3040156237346455
RMSE: 0.5513761182121016
R2 Score: -0.01659624713560448


In [14]:
# Now to descale your predictions
next_values_descaled = scaler2.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

[[ 94795.83632304]
 [ 97224.0129576 ]
 [ 99202.29603772]
 [101929.28139771]
 [104751.79454437]]


In [15]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_switzerland.loc[year] = next_values_descaled[i]

ARIMA prediction for USA

In [16]:
# We want to predict the next 5 columns of df_usa
y = df_usa_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,1)[12]             : AIC=-55.830, Time=40.61 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=-53.085, Time=0.11 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=-58.160, Time=25.83 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=-56.424, Time=15.30 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=-57.150, Time=0.09 sec
 ARIMA(1,1,0)(2,1,0)[12]             : AIC=-56.162, Time=18.45 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=-56.162, Time=25.43 sec
 ARIMA(1,1,0)(0,1,1)[12]             : AIC=-57.583, Time=33.08 sec
 ARIMA(1,1,0)(2,1,1)[12]             : AIC=-54.162, Time=34.03 sec
 ARIMA(0,1,0)(1,1,0)[12]             : AIC=-53.852, Time=9.75 sec
 ARIMA(2,1,0)(1,1,0)[12]             : AIC=-56.299, Time=28.61 sec
 ARIMA(1,1,1)(1,1,0)[12]             : AIC=-56.409, Time=46.20 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=-56.996, Time=33.17 sec
 ARIMA(2,1,1)(1,1,0)[12]             : AIC=-54.976, Time=31.91 sec
 ARIMA(1,1,0)(1,1,0)[1

In [17]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

  y_train, y_test = y[train_index], y[test_index]


MAE: 0.05666672531587677
MSE: 0.006739057822099983
RMSE: 0.08209176464238044
R2 Score: 0.9415816604141831


  y_train, y_test = y[train_index], y[test_index]


MAE: 0.7454628178851508
MSE: 0.8077991399144799
RMSE: 0.8987764682692132
R2 Score: -1.4151135188584858


In [18]:
# Now to descale your predictions
next_values_descaled = scaler3.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

[[87808.5946478 ]
 [90172.4850082 ]
 [92666.44403122]
 [95200.11477901]
 [98297.37107237]]


In [19]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_usa.loc[year] = next_values_descaled[i]

ARIMA prediction for India

In [20]:
# We want to predict the next 5 columns of df_india
y = df_india_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,1)[12]             : AIC=-59.530, Time=37.66 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=-52.221, Time=0.03 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=-61.355, Time=21.05 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=-59.521, Time=14.14 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=-61.339, Time=0.10 sec
 ARIMA(1,1,0)(2,1,0)[12]             : AIC=-59.555, Time=40.82 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=-59.555, Time=65.75 sec
 ARIMA(1,1,0)(0,1,1)[12]             : AIC=-61.490, Time=8.02 sec
 ARIMA(1,1,0)(0,1,2)[12]             : AIC=-59.555, Time=37.88 sec
 ARIMA(1,1,0)(1,1,2)[12]             : AIC=-57.555, Time=27.50 sec
 ARIMA(0,1,0)(0,1,1)[12]             : AIC=-54.836, Time=14.57 sec
 ARIMA(2,1,0)(0,1,1)[12]             : AIC=-59.535, Time=46.92 sec
 ARIMA(2,1,1)(0,1,1)[12]             : AIC=-57.561, Time=61.73 sec
 ARIMA(1,1,0)(0,1,1)[12] intercept   : AIC=-62.886, Time=23.46 sec
 ARIMA(1,1,0)(0,1,0)[1

In [21]:
# Define the TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)

# Loop over the splits
for train_index, test_index in tscv.split(y):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the model on the training data
    model_fit = model.fit(y_train)

    # Make predictions on the test data
    y_pred = model_fit.predict(n_periods=len(y_test))

    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

  y_train, y_test = y[train_index], y[test_index]


MAE: 0.18637325224948045
MSE: 0.05951050450931248
RMSE: 0.2439477495475465
R2 Score: 0.2926508030744899
MAE: 0.21539397478296352
MSE: 0.09783202330601586
RMSE: 0.3127811108523273
R2 Score: 0.7854876396681987


  y_train, y_test = y[train_index], y[test_index]


In [22]:
# Now to descale your predictions
next_values_descaled = scaler4.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

[[10558.16087407]
 [10910.37140777]
 [11293.92940325]
 [11809.83085343]
 [12290.49704411]]


In [23]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_india.loc[year] = next_values_descaled[i]

ARIMA prediction for Venezuela

In [24]:
# We want to predict the next 5 columns of df_venezuela
y = df_venezuela_scaled.iloc[:, -1]

# Define the auto_arima model
model = auto_arima(y, start_p=1, start_q=1, max_p=6, max_q=6, m=12,
                   start_P=0, seasonal=True, d=1, D=1, trace=True,
                   error_action='ignore', suppress_warnings=True, stepwise=True)

# Fit the model
model_fit = model.fit(y)

# Print the best model parameters
print(model_fit.summary())

# Predict the next 5 values
next_values = model_fit.predict(n_periods=5)
print(f"The predicted next 5 values are {next_values}")

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,1,1)[12]             : AIC=inf, Time=39.76 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=50.322, Time=0.03 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=32.762, Time=11.65 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=inf, Time=20.51 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=41.826, Time=0.04 sec
 ARIMA(1,1,0)(2,1,0)[12]             : AIC=31.048, Time=20.98 sec
 ARIMA(1,1,0)(2,1,1)[12]             : AIC=33.040, Time=19.44 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=inf, Time=29.04 sec
 ARIMA(0,1,0)(2,1,0)[12]             : AIC=44.151, Time=15.14 sec
 ARIMA(2,1,0)(2,1,0)[12]             : AIC=32.362, Time=27.58 sec
 ARIMA(1,1,1)(2,1,0)[12]             : AIC=32.345, Time=33.92 sec
 ARIMA(0,1,1)(2,1,0)[12]             : AIC=33.566, Time=49.28 sec
 ARIMA(2,1,1)(2,1,0)[12]             : AIC=34.325, Time=57.06 sec
 ARIMA(1,1,0)(2,1,0)[12] intercept   : AIC=32.074, Time=26.28 sec

Best model:  ARIMA(1,1,0)(2,1,0)[12]       

In [25]:
# Split the data into training and test sets
y_train, y_test = train_test_split(y, test_size=0.2, shuffle=False)

# Fit the model on the training data
model_fit = model.fit(y_train)

# Make predictions on the test data
y_pred = model_fit.predict(n_periods=len(y_test))

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

MAE: 3.3609205076826125
MSE: 12.493998098606177
RMSE: 3.5346850069852302
R2 Score: -20.750781048137185


In [26]:
# Now to descale your predictions
next_values_descaled = scaler4.inverse_transform(next_values.values.reshape(-1, 1))

print(next_values_descaled)

[[ 598.94175368]
 [ 584.99073887]
 [ 948.99020504]
 [2566.0649292 ]
 [3524.49542004]]


In [27]:
# We add our predictions to the dataframe
years = [2025, 2026, 2027, 2028, 2029]
for i, year in enumerate(years):
    df_venezuela.loc[year] = next_values_descaled[i]

### Predictions Comparison

First we create a new dataframe with the our forecast for the next years

In [38]:
# Transpose the DataFrames
df_spain_pr = df_spain.T
df_switzerland_pr = df_switzerland.T
df_usa_pr = df_usa.T
df_india_pr = df_india.T
df_venezuela_pr = df_venezuela.T

# Reset the index and drop it
df_spain_pr.reset_index(drop=True, inplace=True)
df_switzerland_pr.reset_index(drop=True, inplace=True)
df_usa_pr.reset_index(drop=True, inplace=True)
df_india_pr.reset_index(drop=True, inplace=True)
df_venezuela_pr.reset_index(drop=True, inplace=True)

# Add a new column at the beginning with the country name
df_spain_pr.insert(0, 'country_name', 'Spain')
df_switzerland_pr.insert(0, 'country_name', 'Switzerland')
df_usa_pr.insert(0, 'country_name', 'United States')
df_india_pr.insert(0, 'country_name', 'India')
df_venezuela_pr.insert(0, 'country_name', 'Venezuela')

In [44]:
# List of DataFrames
dfs = [df_spain_pr, df_switzerland_pr, df_usa_pr, df_india_pr, df_venezuela_pr]

# Merge the DataFrames
df_pr = pd.concat(dfs)

# Convert all columns except the first one to numeric
df_pr[df_pr.columns[1:]] = df_pr[df_pr.columns[1:]].apply(pd.to_numeric)

# List of columns to drop
columns_to_drop = ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992']

# Drop the columns
df_pr.drop(columns_to_drop, axis=1, inplace=True)

# Reset the index and drop it
df_pr.reset_index(drop=True, inplace=True)

In [51]:
df_pr

Unnamed: 0,country_name,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029
0,Spain,16620.937,17325.517,18374.841,19120.069,20148.386,21211.034,22416.165,23880.729,25246.885,...,38044.067,42356.615,47669.667,50436.18,52012.448,53196.521681,54130.055913,55297.280605,57196.236255,59151.816423
1,Switzerland,33172.325,34021.135,34681.015,35258.617,36572.274,37988.682,39001.417,41292.689,42703.596,...,72255.288,79058.938,86195.048,89243.299,91931.752,94795.836323,97224.012958,99202.296038,101929.281398,104751.794544
2,United States,26364.192,27674.021,28671.48,29946.973,31440.087,32833.666,34496.241,36312.782,37101.453,...,64367.435,70995.794,77191.871,81632.253,85372.686,87808.594648,90172.485008,92666.444031,95200.114779,98297.371072
3,India,1367.822,1460.249,1572.155,1688.531,1753.237,1847.385,2001.886,2087.485,2197.354,...,6507.86,7406.466,8424.352,9339.349,10122.951,10558.160874,10910.371408,11293.929403,11809.830853,12290.497044
4,Venezuela,11427.619,11146.894,11575.821,11516.704,12210.037,12140.77,11350.44,11640.053,12103.331,...,5729.831,6132.464,7265.562,7942.862,8485.876,598.941754,584.990739,948.990205,2566.064929,3524.49542


Now we create a pivot table with the forecast of ECB for the next years

In [47]:
# List of countries to keep
countries_to_keep = ['Spain', 'Switzerland', 'United States', 'India', 'Venezuela']

# Filter the DataFrame
df_ecb_pred = df_merged_pred[df_merged_pred['country_name'].isin(countries_to_keep)]

# Reset the index and drop it
df_ecb_pred.reset_index(drop=True, inplace=True)

# List of columns to drop
columns_to_drop = ['1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992']

# Drop the columns
df_ecb_pred.drop(columns_to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ecb_pred.drop(columns_to_drop, axis=1, inplace=True)


In [48]:
df_ecb_pred

Unnamed: 0,country_name,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Switzerland,33172.325,34021.135,34681.015,35258.617,36572.274,37988.682,39001.417,41292.689,42703.596,...,65613.767,67690.604,68924.081,71966.628,73492.548,72255.288,79058.938,86195.048,89243.299,91931.752
1,United States,26364.192,27674.021,28671.48,29946.973,31440.087,32833.666,34496.241,36312.782,37101.453,...,57006.926,58179.697,60292.978,63165.278,65504.783,64367.435,70995.794,77191.871,81632.253,85372.686
2,Spain,16620.937,17325.517,18374.841,19120.069,20148.386,21211.034,22416.165,23880.729,25246.885,...,34954.853,37329.293,39647.785,41308.037,42492.026,38044.067,42356.615,47669.667,50436.18,52012.448
3,India,1367.822,1460.249,1572.155,1688.531,1753.237,1847.385,2001.886,2087.485,2197.354,...,5412.335,5778.27,6112.066,6583.59,6882.332,6507.86,7406.466,8424.352,9339.349,10122.951
4,Venezuela,11427.619,11146.894,11575.821,11516.704,12210.037,12140.77,11350.44,11640.053,12103.331,...,16970.969,14169.523,12271.362,10622.181,8117.036,5729.831,6132.464,7265.562,7942.862,8485.876


Export both prediction dfs for plotting

In [55]:
# Export df_pr to a CSV file
df_pr.to_csv(r'https://raw.githubusercontent.com/lgib88/World-Economic-Classification/main/data/cleaned/df_pr.csv', index=False)

# Export df_ecb_pred to a CSV file
df_ecb_pred.to_csv(r'https://raw.githubusercontent.com/lgib88/World-Economic-Classification/main/data/cleaned/df_ecb_pred.csv', index=False)