In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 1) Reload clean (important)
df = pd.read_csv("c:/Users/layel/Downloads/African_crises_dataset.csv")   # <-- put your real file name here
df.columns = df.columns.str.strip()

# 2) Basic missing value handling (safe)
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

cat_cols = df.select_dtypes(include="object").columns
for c in cat_cols:
    df[c] = df[c].fillna(df[c].mode()[0])

# 3) Target + features (before encoding check)
y = df["systemic_crisis"]
print("FULL target distribution:\n", y.value_counts())

# 4) Encode categoricals (keep target untouched)
X = df.drop(columns=["systemic_crisis"])
X = pd.get_dummies(X, drop_first=True)

# 5) Stratified split (so both classes appear in train)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

print("\nTRAIN distribution:\n", y_train.value_counts())
print("\nTEST distribution:\n", y_test.value_counts())

# 6) Train (balanced helps when crises are rare)
model = LogisticRegression(max_iter=2000, class_weight="balanced")
model.fit(X_train, y_train)

print("\n✅ Model trained successfully!")


FULL target distribution:
 systemic_crisis
0    977
1     82
Name: count, dtype: int64

TRAIN distribution:
 systemic_crisis
0    879
1     74
Name: count, dtype: int64

TEST distribution:
 systemic_crisis
0    98
1     8
Name: count, dtype: int64

✅ Model trained successfully!


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
df.head()
df.shape
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_number                   1059 non-null   int64  
 1   country_code                     1059 non-null   object 
 2   country                          1059 non-null   object 
 3   year                             1059 non-null   int64  
 4   systemic_crisis                  1059 non-null   int64  
 5   exch_usd                         1059 non-null   float64
 6   domestic_debt_in_default         1059 non-null   int64  
 7   sovereign_external_debt_default  1059 non-null   int64  
 8   gdp_weighted_default             1059 non-null   float64
 9   inflation_annual_cpi             1059 non-null   float64
 10  independence                     1059 non-null   int64  
 11  currency_crises                  1059 non-null   int64  
 12  inflation_crises    

Unnamed: 0,country_number,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises
count,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0
mean,35.613787,1967.767705,0.077432,43.140831,0.03966,0.152975,0.006402,20848.89,0.776204,0.1322,0.129367
std,23.692402,33.530632,0.267401,111.47538,0.195251,0.360133,0.043572,675727.4,0.416984,0.349847,0.335765
min,1.0,1860.0,0.0,0.0,0.0,0.0,0.0,-28.50214,0.0,0.0,0.0
25%,15.0,1951.0,0.0,0.19535,0.0,0.0,0.0,2.086162,1.0,0.0,0.0
50%,38.0,1973.0,0.0,0.8684,0.0,0.0,0.0,5.76233,1.0,0.0,0.0
75%,56.0,1994.0,0.0,8.46275,0.0,0.0,0.0,11.64405,1.0,0.0,0.0
max,70.0,2014.0,1.0,744.306139,1.0,1.0,0.4,21989700.0,1.0,2.0,1.0


In [4]:
print("Duplicates:", df.duplicated().sum())
df = df.drop_duplicates()


Duplicates: 0


In [5]:
pd.get_dummies(X, drop_first=True)


Unnamed: 0,country_number,year,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,...,country_Ivory Coast,country_Kenya,country_Mauritius,country_Morocco,country_Nigeria,country_South Africa,country_Tunisia,country_Zambia,country_Zimbabwe,banking_crisis_no_crisis
0,1,1870,0.052264,0,0,0.0,3.441456,0,0,0,...,False,False,False,False,False,False,False,False,False,False
1,1,1871,0.052798,0,0,0.0,14.149140,0,0,0,...,False,False,False,False,False,False,False,False,False,True
2,1,1872,0.052274,0,0,0.0,-3.718593,0,0,0,...,False,False,False,False,False,False,False,False,False,True
3,1,1873,0.051680,0,0,0.0,11.203897,0,0,0,...,False,False,False,False,False,False,False,False,False,True
4,1,1874,0.051308,0,0,0.0,-3.848561,0,0,0,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054,70,2009,354.800000,1,1,0.0,-7.670000,1,1,0,...,False,False,False,False,False,False,False,False,True,False
1055,70,2010,378.200000,1,1,0.0,3.217000,1,0,0,...,False,False,False,False,False,False,False,False,True,True
1056,70,2011,361.900000,1,1,0.0,4.920000,1,0,0,...,False,False,False,False,False,False,False,False,True,True
1057,70,2012,361.900000,1,1,0.0,3.720000,1,0,0,...,False,False,False,False,False,False,False,False,True,True


In [10]:
df["systemic_crisis"]


0       1
1       0
2       0
3       0
4       0
       ..
1054    1
1055    0
1056    0
1057    0
1058    0
Name: systemic_crisis, Length: 1059, dtype: int64

In [11]:
df["systemic_crisis"].value_counts()

systemic_crisis
0    977
1     82
Name: count, dtype: int64

In [15]:
y = df["systemic_crisis"]
X = df.drop(columns=["systemic_crisis"])


In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9811320754716981
ROC-AUC: 0.9961734693877551

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        98
           1       0.80      1.00      0.89         8

    accuracy                           0.98       106
   macro avg       0.90      0.99      0.94       106
weighted avg       0.98      0.98      0.98       106


Confusion Matrix:
 [[96  2]
 [ 0  8]]


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced"
)

rf.fit(X_train, y_train)

# Predictions
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


ROC-AUC: 0.9974489795918369

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99        98
           1       0.88      0.88      0.88         8

    accuracy                           0.98       106
   macro avg       0.93      0.93      0.93       106
weighted avg       0.98      0.98      0.98       106


Confusion Matrix:
 [[97  1]
 [ 1  7]]


In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

rf_smote = RandomForestClassifier(
    n_estimators=300,
    random_state=42
)

rf_smote.fit(X_train_sm, y_train_sm)

y_pred_sm = rf_smote.predict(X_test)
y_proba_sm = rf_smote.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba_sm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_sm))


ROC-AUC: 0.9987244897959183

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99        98
           1       0.88      0.88      0.88         8

    accuracy                           0.98       106
   macro avg       0.93      0.93      0.93       106
weighted avg       0.98      0.98      0.98       106



In [21]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Probabilities from your logistic regression
y_proba = model.predict_proba(X_test)[:, 1]

thresholds = [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50]

rows = []
for t in thresholds:
    y_pred_t = (y_proba >= t).astype(int)
    rows.append({
        "threshold": t,
        "precision_crisis(1)": precision_score(y_test, y_pred_t, pos_label=1, zero_division=0),
        "recall_crisis(1)": recall_score(y_test, y_pred_t, pos_label=1, zero_division=0),
        "f1_crisis(1)": f1_score(y_test, y_pred_t, pos_label=1, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba)
    })

pd.DataFrame(rows)


Unnamed: 0,threshold,precision_crisis(1),recall_crisis(1),f1_crisis(1),roc_auc
0,0.2,0.533333,1.0,0.695652,0.996173
1,0.25,0.571429,1.0,0.727273,0.996173
2,0.3,0.571429,1.0,0.727273,0.996173
3,0.35,0.571429,1.0,0.727273,0.996173
4,0.4,0.571429,1.0,0.727273,0.996173
5,0.45,0.666667,1.0,0.8,0.996173
6,0.5,0.8,1.0,0.888889,0.996173


In [22]:
from sklearn.metrics import classification_report, confusion_matrix

best_t = 0.35
y_pred_best = (y_proba >= best_t).astype(int)

print("Chosen threshold:", best_t)
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


Chosen threshold: 0.35

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97        98
           1       0.57      1.00      0.73         8

    accuracy                           0.94       106
   macro avg       0.79      0.97      0.85       106
weighted avg       0.97      0.94      0.95       106


Confusion Matrix:
 [[92  6]
 [ 0  8]]


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report

rf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    class_weight="balanced_subsample"
)

rf.fit(X_train, y_train)

y_proba_rf = rf.predict_proba(X_test)[:, 1]

# try same threshold
t = 0.35
y_pred_rf = (y_proba_rf >= t).astype(int)

print("RF ROC-AUC:", roc_auc_score(y_test, y_proba_rf))
print("\nRF Classification Report:\n", classification_report(y_test, y_pred_rf))


RF ROC-AUC: 0.9987244897959183

RF Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        98
           1       0.89      1.00      0.94         8

    accuracy                           0.99       106
   macro avg       0.94      0.99      0.97       106
weighted avg       0.99      0.99      0.99       106



In [24]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# final threshold
FINAL_THRESHOLD = 0.35

y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= FINAL_THRESHOLD).astype(int)

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


ROC-AUC: 0.9961734693877551

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97        98
           1       0.57      1.00      0.73         8

    accuracy                           0.94       106
   macro avg       0.79      0.97      0.85       106
weighted avg       0.97      0.94      0.95       106


Confusion Matrix:
 [[92  6]
 [ 0  8]]


Data Cleaning

Missing numerical values were filled with the median and categorical values with the mode. Duplicate rows were checked and removed. Outliers were not removed because extreme economic values can be meaningful.

Feature Engineering

Categorical variables were converted into numerical ones using one-hot encoding. The target variable chosen was systemic_crisis.

Modeling

Logistic Regression was used since the problem is a binary classification task. Class imbalance was handled using class weights and probability threshold tuning.

Evaluation

The model was evaluated using accuracy, precision, recall, F1-score, confusion matrix, and ROC-AUC. Recall for the crisis class was prioritized to avoid missing crises.

Improvements

The model could be improved by using cross-validation, tuning hyperparameters, trying other models like Random Forest, or applying oversampling techniques such as SMOTE.