## Imports & Load Data

In [145]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, Booster
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [146]:
data_folder = "../../../DS-Notebooks-Summer-2022/data/"
inflation_df = pd.read_csv(data_folder + "inflation interest unemployment.csv")
gdp_df = pd.read_csv(data_folder + "gdp_long.csv", index_col = 0)
growth_df = pd.read_csv(data_folder + "gdp_growth_long.csv", index_col = 0)

## Merge Data

In [147]:
# merge datasets
df = inflation_df.merge(
                gdp_df, left_on = ['country', 'year'], right_on = ['Country Name', 'year'], how = 'left').merge(
                growth_df, left_on = ['country', 'year'], right_on = ['Country Name', 'year'], how = 'left')

# drop unwanted columns
df.drop(columns = [
        'iso3c', 
        'iso2c', 
        'Country Name_x', 
        'Country Name_y'
    ], inplace = True
)

df["Unemployment"] = df["Unemployment, total (% of total labor force) (modeled ILO estimate)"].fillna(df["Unemployment, total (% of total labor force) (national estimate)"])

# preview final dataframe
df.head()

Unnamed: 0,country,year,"Inflation, consumer prices (annual %)","Inflation, GDP deflator (annual %)",Real interest rate (%),Deposit interest rate (%),Lending interest rate (%),"Unemployment, total (% of total labor force) (national estimate)","Unemployment, total (% of total labor force) (modeled ILO estimate)",adminregion,incomeLevel,GDP,GDP Growth,Unemployment
0,Afghanistan,1970,,,,,,,,South Asia,Low income,1748887000.0,,
1,Afghanistan,1971,,,,,,,,South Asia,Low income,1831109000.0,,
2,Afghanistan,1972,,,,,,,,South Asia,Low income,1595555000.0,,
3,Afghanistan,1973,,,,,,,,South Asia,Low income,1733333000.0,,
4,Afghanistan,1974,,,,,,,,South Asia,Low income,2155555000.0,,


In [148]:
required_cols = [
    "Inflation, consumer prices (annual %)",
    "Lending interest rate (%)",
    "Unemployment",
    "GDP Growth"
]

for country in df['country'].unique():

    country_df = df[df['country'] == country]

    columns_to_drop = []
    for column in country_df.columns:

        # calculate percent nan
        percent_nan = country_df[column].isna().sum() / len(country_df)

        # drop column if too many nans
        if percent_nan > 0.2 and column not in columns_to_drop:
            columns_to_drop.append(column)

    country_nona_df = country_df.drop(columns_to_drop, axis = 1)

    is_complete = True
    for col in required_cols:
        if (country_nona_df is None) or (col not in country_nona_df.columns):
            is_complete = False

    if is_complete:
        print(country, ' '.join(country_nona_df.columns))

Australia country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) incomeLevel GDP GDP Growth Unemployment


Egypt, Arab Rep. country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Deposit interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) adminregion incomeLevel GDP GDP Growth Unemployment


Jamaica country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Deposit interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) adminregion incomeLevel GDP GDP Growth Unemployment


Philippines country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Deposit interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) adminregion incomeLevel GDP GDP Growth Unemployment
Singapore country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Deposit interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) incomeLevel GDP GDP Growth Unemployment
Thailand country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Deposit interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) adminregion incomeLevel GDP GDP Growth Unemployment
Trinidad and Tobago country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Lending interest r

United Kingdom country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) incomeLevel GDP GDP Growth Unemployment
United States country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) incomeLevel GDP GDP Growth Unemployment
Uruguay country year Inflation, consumer prices (annual %) Inflation, GDP deflator (annual %) Real interest rate (%) Deposit interest rate (%) Lending interest rate (%) Unemployment, total (% of total labor force) (national estimate) incomeLevel GDP GDP Growth Unemployment


## ML Prep



In [149]:
# filter dataframe for specified country and columns
country = "United Kingdom"
country_df = df[df['country'] == country][["Inflation, consumer prices (annual %)", "Lending interest rate (%)", "Unemployment", "GDP Growth"]]

country_df.dropna(inplace = True)

print(len(country_df))

44


In [150]:
# define features
X = country_df.drop(['Inflation, consumer prices (annual %)'], axis = 1)

# define target
y = country_df['Inflation, consumer prices (annual %)']

# train-test split
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2, random_state=42)

## Training the Model

In [151]:
 # train model
model = XGBRegressor()
model.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [152]:
# get predictions
X_pred = model.predict(X_test)

# calculate metrics
r2 = r2_score(y_test, X_pred)
mean_error = mean_squared_error(y_test, X_pred, squared=False)

# print report
print('R-squared = ', r2)
print('Mean Error = ', mean_error)

R-squared =  0.2580873455209908
Mean Error =  4.1477535966746055


In [153]:
# save model
model.save_model(f'{country.lower()}.json')

In [154]:
print(X.columns)

Index(['Lending interest rate (%)', 'Unemployment', 'GDP Growth'], dtype='object')
