In [59]:
import re  # Import the regular expressions module for pattern matching and text processing
import matplotlib.pyplot as plt  # For plotting data
import seaborn as sns  # For enhanced data visualizations

# Import libraries for machine learning models and evaluation
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # For scaling numerical data and encoding categorical data
from sklearn.linear_model import LinearRegression, ElasticNet  # For linear Regression
from sklearn.tree import DecisionTreeRegressor  # For Decision Tree Regression
from sklearn.ensemble import RandomForestRegressor  # For Random Forest Regression
from sklearn.svm import SVR  # For Support Vector Regression 
#import xgboost as xgb # For XGBoost Regression
from sklearn.model_selection import cross_validate  # To perform cross-validation
from sklearn.metrics import mean_squared_error, r2_score, make_scorer  # For model evaluation metrics
from sklearn.model_selection import GridSearchCV   # For hyperparameter tuning


In [61]:
import pandas as pd
import numpy as np

In [63]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv('African_crises_dataset.csv')
df.head()

Unnamed: 0,country_number,country_code,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.05168,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_number                   1059 non-null   int64  
 1   country_code                     1059 non-null   object 
 2   country                          1059 non-null   object 
 3   year                             1059 non-null   int64  
 4   systemic_crisis                  1059 non-null   int64  
 5   exch_usd                         1059 non-null   float64
 6   domestic_debt_in_default         1059 non-null   int64  
 7   sovereign_external_debt_default  1059 non-null   int64  
 8   gdp_weighted_default             1059 non-null   float64
 9   inflation_annual_cpi             1059 non-null   float64
 10  independence                     1059 non-null   int64  
 11  currency_crises                  1059 non-null   int64  
 12  inflation_crises    

In [13]:
df.describe()

Unnamed: 0,country_number,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises
count,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0
mean,35.613787,1967.767705,0.077432,43.140831,0.03966,0.152975,0.006402,20848.89,0.776204,0.1322,0.129367
std,23.692402,33.530632,0.267401,111.47538,0.195251,0.360133,0.043572,675727.4,0.416984,0.349847,0.335765
min,1.0,1860.0,0.0,0.0,0.0,0.0,0.0,-28.50214,0.0,0.0,0.0
25%,15.0,1951.0,0.0,0.19535,0.0,0.0,0.0,2.086162,1.0,0.0,0.0
50%,38.0,1973.0,0.0,0.8684,0.0,0.0,0.0,5.76233,1.0,0.0,0.0
75%,56.0,1994.0,0.0,8.46275,0.0,0.0,0.0,11.64405,1.0,0.0,0.0
max,70.0,2014.0,1.0,744.306139,1.0,1.0,0.4,21989700.0,1.0,2.0,1.0


In [15]:
 df.describe(include = 'object')

Unnamed: 0,country_code,country,banking_crisis
count,1059,1059,1059
unique,13,13,2
top,EGY,Egypt,no_crisis
freq,155,155,965


In [17]:
print(df['systemic_crisis'].value_counts())

systemic_crisis
0    977
1     82
Name: count, dtype: int64


In [None]:
# Create the profiling report
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

# Display the report in a Jupyter notebook (optional)
profile.to_notebook_iframe()

# Or save the report to an HTML file
profile.to_file("profiling_report.html")

In [19]:
df.isnull().sum()

country_number                     0
country_code                       0
country                            0
year                               0
systemic_crisis                    0
exch_usd                           0
domestic_debt_in_default           0
sovereign_external_debt_default    0
gdp_weighted_default               0
inflation_annual_cpi               0
independence                       0
currency_crises                    0
inflation_crises                   0
banking_crisis                     0
dtype: int64

In [21]:
print(df.columns.tolist())

['country_number', 'country_code', 'country', 'year', 'systemic_crisis', 'exch_usd', 'domestic_debt_in_default', 'sovereign_external_debt_default', 'gdp_weighted_default', 'inflation_annual_cpi', 'independence', 'currency_crises', 'inflation_crises', 'banking_crisis']


In [23]:
#trying to log transform exch_usd and inflation annual because of the extreme outliers
# Adding a small constant to avoid log(0) issues
df['log_exch_usd'] = np.log1p(df['exch_usd'])  # log1p is log(1 + x)
df['log_inflation_annual_cpi'] = np.log1p(df['inflation_annual_cpi'])

In [25]:
# Encoding 'banking_crisis' column
categorical_cols = ['banking_crisis']

# Apply one-hot encoding using the variable (no quotes!)
df = pd.get_dummies(df, columns=categorical_cols)

# Display all columns
pd.set_option("display.max_columns", None)


In [27]:
# Drop irrelevant columns and separate features (X) and target (y)
X = df.drop(columns=["country_code", "country_number", "country", "year", "systemic_crisis", "exch_usd", "gdp_weighted_default",
                     "inflation_annual_cpi"])  # Features
y = df["systemic_crisis"]  # Target variable

In [29]:
# Split X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

The target variable 'systemic_crisis' is very imbalanced which makes it difficult to log it because it is classification. Instead, it is easier to use smoting for my train data so that my oversampled data is balanced. Also use scaling and use class_weight = balanced in my model to further help in balancing the minority classs. I will also consider models like xgb boost and random forest as they are as they are robust to imbalanced dataset.smoting is for only training data, that is x train and y train while scaling is for only feature variable x cant be used on y target variable.

In [None]:
#replacing the missing values in my training data x with median

In [31]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')  # Use 'median' or 'most_frequent' as needed
X_train['log_inflation_annual_cpi'] = imputer.fit_transform(X_train[['log_inflation_annual_cpi']])


In [43]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [45]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train_smote)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

In [47]:
print("NaN values in X_train_scaled:", np.isnan(X_train_scaled).sum())
print("NaN values in X_test_scaled:", np.isnan(X_test_scaled).sum())


NaN values in X_train_scaled: 0
NaN values in X_test_scaled: 21


In [49]:
imputer = SimpleImputer(strategy='median')

X_train_scaled = imputer.fit_transform(X_train_scaled)
X_test_scaled = imputer.transform(X_test_scaled)


In [53]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [55]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(random_state=40, max_iter=1000)

# Train the model
logistic_model.fit(X_train_scaled, y_train_smote)

# Predict on the test set and evaluate
y_pred = logistic_model.predict(X_test_scaled)
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy Score: 0.9858490566037735
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       195
           1       0.89      0.94      0.91        17

    accuracy                           0.99       212
   macro avg       0.94      0.97      0.95       212
weighted avg       0.99      0.99      0.99       212

Confusion Matrix:
 [[193   2]
 [  1  16]]


In [65]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_10_15_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-macosx_10_15_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0


In [67]:
from xgboost import XGBClassifier

# Initialize the XGBoost model
xgboost_model = XGBClassifier(random_state=40)

# Train the model
xgboost_model.fit(X_train_smote, y_train_smote)

# Predict on the test set and evaluate
y_pred = xgboost_model.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy Score: 0.9858490566037735
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       195
           1       0.94      0.88      0.91        17

    accuracy                           0.99       212
   macro avg       0.96      0.94      0.95       212
weighted avg       0.99      0.99      0.99       212

Confusion Matrix:
 [[194   1]
 [  2  15]]


In [69]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the XGBoost model
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 150],          # Number of trees
    'learning_rate': [0.01, 0.1, 0.2],       # Step size shrinkage
    'max_depth': [3, 5, 7],                  # Maximum depth of a tree
    'min_child_weight': [1, 3, 5],           # Minimum sum of weights of all child nodes
    'subsample': [0.8, 1.0],                 # Fraction of samples used for fitting
    'colsample_bytree': [0.8, 1.0],          # Fraction of features used per tree
    'gamma': [0, 0.1, 0.2],                  # Minimum loss reduction for splitting
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1_macro',                      # Optimize for F1-Score (macro)
    cv=5,                                    # 5-fold cross-validation
    verbose=1,
    n_jobs=-1                                # Use all CPU cores
)

# Fit GridSearchCV
grid_search.fit(X_train_smote, y_train_smote)

# Get the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)


Fitting 5 folds for each of 972 candidates, totalling 4860 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 150, 'subsample': 0.8}
Best F1 Score: 0.9884899225330199


In [71]:
# Train the final model with the best parameters
best_params = {
    'colsample_bytree': 1.0,
    'gamma': 0.2,
    'learning_rate': 0.2,
    'max_depth': 3,
    'min_child_weight': 1,
    'n_estimators': 50,
    'subsample': 0.8
}

final_model = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
final_model.fit(X_train_smote, y_train_smote)


In [73]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predictions on the test set
y_test_pred = final_model.predict(X_test)

# Evaluate performance
print("Accuracy Score:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Accuracy Score: 0.9905660377358491

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       195
           1       1.00      0.88      0.94        17

    accuracy                           0.99       212
   macro avg       0.99      0.94      0.97       212
weighted avg       0.99      0.99      0.99       212


Confusion Matrix:
 [[195   0]
 [  2  15]]
