In [1]:
# Importing libraries
import pandas as pd                  # Pandas
import numpy as np                   # Numpy


In [17]:
from matplotlib import pyplot as plt # Matplotlib

# Package to implement Decision Tree Model
import sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Package for data partitioning
from sklearn.model_selection import train_test_split

# Package to visualize Decision Tree
from sklearn import tree

# Import packages to implement K-fold CV
from sklearn.model_selection import KFold # For creating folds

# Import Package to implement GridSearch CV (Hyperparameter Tuning Method 1)
from sklearn.model_selection import GridSearchCV

# Importing package for Randomized Search CV (Hyperparameter Tuning Method 2)
from sklearn.model_selection import RandomizedSearchCV

# Package for model evaluation (R-squared)
from sklearn.metrics import r2_score



In [5]:
# Read the EM-DAT export (Excel workbook) into a DataFrame
emdat_path = 'public_emdat_2023-10-26.xlsx'
emdat_df = pd.read_excel(emdat_path)

# Preview the first five rows
emdat_df.head(5)


Unnamed: 0,DisNo.,Historic,Classification Key,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,External IDs,Event Name,ISO,...,Reconstruction Costs ('000 US$),"Reconstruction Costs, Adjusted ('000 US$)",Insured Damage ('000 US$),"Insured Damage, Adjusted ('000 US$)",Total Damage ('000 US$),"Total Damage, Adjusted ('000 US$)",CPI,Admin Units,Entry Date,Last Update
0,1999-9388-DJI,No,nat-cli-dro-dro,Natural,Climatological,Drought,Drought,,,DJI,...,,,,,,,60.503579,"[{""adm1_code"":1093,""adm1_name"":""Ali Sabieh""},{...",2006-03-01,2023-09-25
1,1999-9388-SDN,No,nat-cli-dro-dro,Natural,Climatological,Drought,Drought,,,SDN,...,,,,,,,58.840648,"[{""adm1_code"":2757,""adm1_name"":""Northern Darfu...",2006-03-08,2023-09-25
2,1999-9388-SOM,No,nat-cli-dro-dro,Natural,Climatological,Drought,Drought,,,SOM,...,,,,,,,58.840648,"[{""adm1_code"":2691,""adm1_name"":""Bay""},{""adm1_c...",2006-03-08,2023-09-25
3,2000-0001-AGO,No,tec-tra-roa-roa,Technological,Transport,Road,Road,,,AGO,...,,,,,,,58.840648,,2004-10-27,2023-09-25
4,2000-0002-AGO,No,nat-hyd-flo-riv,Natural,Hydrological,Flood,Riverine flood,,,AGO,...,,,,,10000.0,16995.0,58.840648,"[{""adm2_code"":4214,""adm2_name"":""Baia Farta""},{...",2005-02-03,2023-09-25


# Project Overview

First, subset by 'Disaster Type'

Predict Total Affected for earthquake, drought, flood, and fire events, using these candidate predictors:
- ISO (country)
    - if earthquake or volcano, can use lat/long
- Magnitude (must first subset by Disaster Type)
- Disaster Subtype
- OFDA Response
- Appeal
- Declaration
- AID Contribution ('000 US$)
- (length of time for fire, drought, flood)


## Subset Earthquakes for Analysis Exploration

In [29]:
# Columns used for modelling: six predictors plus the target ('Total Affected')
model_columns = ['ISO', 'Magnitude', 'Disaster Subtype', 'OFDA Response',
                 'Appeal', 'Declaration', 'Total Affected']

# Keep only earthquake rows and the modelling columns, then drop every row
# with a missing value in any of them. Selecting with a single .loc call and
# using the non-inplace dropna() yields a new frame, so nothing derived from
# a slice of `emdat_df` is mutated in place.
df = (
    emdat_df
    .loc[emdat_df['Disaster Type'] == "Earthquake", model_columns]
    .dropna()
)

# (rows, columns) remaining after removing incomplete rows
df.shape

(628, 6)

## Hyperparameter Tuning: Decision Tree

In [31]:
# Separate the target column from the predictors
y = df['Total Affected']
X = df.drop('Total Affected', axis = 1)

# One-hot encode the categorical predictors
cat_var = ['ISO', 'Disaster Subtype', 'OFDA Response', 'Appeal', 'Declaration']
X_encoded = pd.get_dummies(data = X, columns = cat_var)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split = train_test_split(X_encoded, y, test_size = 0.3, random_state = 1)
train_X, test_X, train_y, test_y = split


In [32]:
# Base estimator whose hyperparameters will be searched
regressor = DecisionTreeRegressor(random_state = 42)

# 3-fold cross-validation; rows are shuffled with a fixed seed before splitting
folds = KFold(n_splits = 3, shuffle = True, random_state = 100)

# Initial candidate values for each hyperparameter (4 x 4 x 4 = 64 combinations)
# NOTE(review): the larger min_samples_leaf values may exceed what a training
# fold can support (a split needs at least 2 * min_samples_leaf rows) - confirm
# against the actual training-set size.
hyper_params = dict(
    max_depth = [2, 5, 10, 20],
    min_samples_split = [2, 5, 10, 50],
    min_samples_leaf = [20, 80, 100, 150],
)

# Configure the exhaustive grid search; nothing is fitted in this cell
model_cv = GridSearchCV(
    regressor,           # algorithm
    hyper_params,        # ranges for hyperparameters
    cv = folds,          # cross-validation folds defined above
    scoring = 'r2',      # evaluation metric ('r2' for regression)
    n_jobs = -1,         # -1 uses all available CPUs; pick a fixed number of
                         # cores instead if other apps are running in the background
    verbose = 1,         # how much progress information to print
)

In [33]:
# Run the grid search on the training split
model_cv.fit(train_X, train_y)

# Report the best cross-validated score, then the parameter set that achieved it
for label, value in (
    ('Initial score: ', model_cv.best_score_),
    ('Initial parameters: ', model_cv.best_params_),
):
    print(label, value)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
Initial score:  -0.04525677003409393
Initial parameters:  {'max_depth': 2, 'min_samples_leaf': 150, 'min_samples_split': 2}


## Baseline Decision Tree (No Hyperparameter Tuning)

In [None]:
# Baseline decision tree: only the random seed is set, then it is fitted
# on the training split (fit returns the estimator itself)
clf = DecisionTreeRegressor(random_state = 0).fit(train_X, train_y)

# Display the fitted estimator
clf

In [21]:
# Predict on the held-out test split with the model currently bound to `clf`
y_pred = clf.predict(test_X)

# R-squared on the test split; the value is shown as the cell output
r2_score(y_true = test_y, y_pred = y_pred)

-0.796268459183153

## Basic RF model

In [36]:
# Baseline random forest: only the random seed is set, then it is fitted
# on the training split (fit returns the estimator itself).
# Note: this rebinds `clf`, replacing whatever model it referred to before.
clf = RandomForestRegressor(random_state = 0).fit(train_X, train_y)

# Display the fitted estimator
clf

In [37]:
# Predict on the held-out test split with the model currently bound to `clf`
y_pred = clf.predict(test_X)

# R-squared on the test split; the value is shown as the cell output
r2_score(y_true = test_y, y_pred = y_pred)

-1.1449212115854666