In [1]:
import imp
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
df = pd.read_csv('dataset/dataset.csv')
df

Unnamed: 0,Year,Seq,Glide,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,...,No Affected,No Homeless,Total Affected,Insured Damages ('000 US$),Total Damages ('000 US$),CPI,Adm Level,Admin1 Code,Admin2 Code,Geo Locations
0,1900,9002,,Natural,Climatological,Drought,Drought,,,Cabo Verde,...,,,,,,3.221647,,,,
1,1900,9001,,Natural,Climatological,Drought,Drought,,,India,...,,,,,,3.221647,,,,
2,1902,12,,Natural,Geophysical,Earthquake,Ground movement,,,Guatemala,...,,,,,25000.0,3.350513,,,,
3,1902,3,,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Guatemala,...,,,,,,3.350513,,,,
4,1902,10,,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Guatemala,...,,,,,,3.350513,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16121,2021,449,FL-2021-000110,Natural,Hydrological,Flood,,,,Yemen,...,67980.0,,67980.0,,,,,,,
16122,2021,75,,Natural,Hydrological,Flood,,,,South Africa,...,400.0,,400.0,,75000.0,,1;2,2707;77311;77312;77313;77315,77364;77367,"Free State, KwaZulu-Natal, Limpopo, Mpumalanga..."
16123,2021,599,EP-2021-000138,Natural,Biological,Epidemic,Viral disease,,Meningitis,Congo (the Democratic Republic of the),...,,,301.0,,,,,,,
16124,2021,20,,Natural,Hydrological,Flood,,,,Serbia,...,22.0,,22.0,,,,1,25374;25378;25379;25380;25383;25386;25397,,"Jablanicki, Kosovski, Kosovsko-mitrovatski, Ko..."


In [3]:
data = df[['Year', 'Disaster Subgroup', 'Disaster Type', 'Country', 'Dis Mag Value', 'Dis Mag Scale']]
data

Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Country,Dis Mag Value,Dis Mag Scale
0,1900,Climatological,Drought,Cabo Verde,,Km2
1,1900,Climatological,Drought,India,,Km2
2,1902,Geophysical,Earthquake,Guatemala,8.0,Richter
3,1902,Geophysical,Volcanic activity,Guatemala,,
4,1902,Geophysical,Volcanic activity,Guatemala,,
...,...,...,...,...,...,...
16121,2021,Hydrological,Flood,Yemen,,Km2
16122,2021,Hydrological,Flood,South Africa,,Km2
16123,2021,Biological,Epidemic,Congo (the Democratic Republic of the),,Vaccinated
16124,2021,Hydrological,Flood,Serbia,,Km2


In [4]:
data = data.dropna()
data

Unnamed: 0,Year,Disaster Subgroup,Disaster Type,Country,Dis Mag Value,Dis Mag Scale
2,1902,Geophysical,Earthquake,Guatemala,8.0,Richter
9,1905,Geophysical,Earthquake,India,8.0,Richter
10,1906,Geophysical,Earthquake,Chile,8.0,Richter
11,1906,Geophysical,Earthquake,Colombia,9.0,Richter
15,1907,Geophysical,Earthquake,China,7.0,Richter
...,...,...,...,...,...,...
16092,2021,Meteorological,Storm,United States of America (the),120.0,Kph
16093,2021,Meteorological,Storm,United States of America (the),240.0,Kph
16094,2021,Meteorological,Storm,Viet Nam,90.0,Kph
16096,2021,Meteorological,Storm,Zimbabwe,160.0,Kph


In [5]:
data['Disaster Type'].value_counts()

Flood                   1779
Earthquake              1455
Storm                   1123
Extreme temperature      278
Wildfire                 187
Drought                   51
Epidemic                  50
Name: Disaster Type, dtype: int64

In [6]:
data = data[['Disaster Subgroup','Disaster Type','Country','Dis Mag Value','Dis Mag Scale']]
data

Unnamed: 0,Disaster Subgroup,Disaster Type,Country,Dis Mag Value,Dis Mag Scale
2,Geophysical,Earthquake,Guatemala,8.0,Richter
9,Geophysical,Earthquake,India,8.0,Richter
10,Geophysical,Earthquake,Chile,8.0,Richter
11,Geophysical,Earthquake,Colombia,9.0,Richter
15,Geophysical,Earthquake,China,7.0,Richter
...,...,...,...,...,...
16092,Meteorological,Storm,United States of America (the),120.0,Kph
16093,Meteorological,Storm,United States of America (the),240.0,Kph
16094,Meteorological,Storm,Viet Nam,90.0,Kph
16096,Meteorological,Storm,Zimbabwe,160.0,Kph


In [7]:
data.isnull().sum()

Disaster Subgroup    0
Disaster Type        0
Country              0
Dis Mag Value        0
Dis Mag Scale        0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4923 entries, 2 to 16105
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Disaster Subgroup  4923 non-null   object 
 1   Disaster Type      4923 non-null   object 
 2   Country            4923 non-null   object 
 3   Dis Mag Value      4923 non-null   float64
 4   Dis Mag Scale      4923 non-null   object 
dtypes: float64(1), object(4)
memory usage: 230.8+ KB


In [9]:
data['Country'].value_counts(), data['Dis Mag Scale'].value_counts()

(China                             403
 United States of America (the)    324
 Indonesia                         217
 India                             210
 Philippines (the)                 205
                                  ... 
 Côte d’Ivoire                       1
 Yemen P Dem Rep                     1
 Turkmenistan                        1
 Anguilla                            1
 Guinea-Bissau                       1
 Name: Country, Length: 204, dtype: int64,
 Km2           2017
 Richter       1455
 Kph           1123
 °C             278
 Vaccinated      50
 Name: Dis Mag Scale, dtype: int64)

In [10]:
max(data['Dis Mag Value'])

13025874.0

In [11]:
data['Disaster Subgroup'] = LabelEncoder().fit_transform(data['Disaster Subgroup'])
data['Disaster Type'] = LabelEncoder().fit_transform(data['Disaster Type'])
data['Country'] = LabelEncoder().fit_transform(data['Country'])
data['Dis Mag Scale'] = LabelEncoder().fit_transform(data['Dis Mag Scale'])
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Disaster Subgroup'] = LabelEncoder().fit_transform(data['Disaster Subgroup'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Disaster Type'] = LabelEncoder().fit_transform(data['Disaster Type'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Country'] = LabelEncoder().fit_transfor

Unnamed: 0,Disaster Subgroup,Disaster Type,Country,Dis Mag Value,Dis Mag Scale
2,2,1,71,8.0,2
9,2,1,80,8.0,2
10,2,1,37,8.0,2
11,2,1,39,9.0,2
15,2,1,38,7.0,2
...,...,...,...,...,...
16092,4,5,190,120.0,1
16093,4,5,190,240.0,1
16094,4,5,195,90.0,1
16096,4,5,203,160.0,1


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4923 entries, 2 to 16105
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Disaster Subgroup  4923 non-null   int32  
 1   Disaster Type      4923 non-null   int32  
 2   Country            4923 non-null   int32  
 3   Dis Mag Value      4923 non-null   float64
 4   Dis Mag Scale      4923 non-null   int32  
dtypes: float64(1), int32(4)
memory usage: 153.8 KB


In [13]:
X = data[['Disaster Subgroup','Country','Dis Mag Value','Dis Mag Scale']]
y = data['Disaster Type']

In [14]:
X

Unnamed: 0,Disaster Subgroup,Country,Dis Mag Value,Dis Mag Scale
2,2,71,8.0,2
9,2,80,8.0,2
10,2,37,8.0,2
11,2,39,9.0,2
15,2,38,7.0,2
...,...,...,...,...
16092,4,190,120.0,1
16093,4,190,240.0,1
16094,4,195,90.0,1
16096,4,203,160.0,1


In [15]:
y

2        1
9        1
10       1
11       1
15       1
        ..
16092    5
16093    5
16094    5
16096    5
16105    3
Name: Disaster Type, Length: 4923, dtype: int32

In [16]:
X = MinMaxScaler().fit_transform(X)
X

array([[5.00000000e-01, 3.49753695e-01, 4.99004639e-06, 5.00000000e-01],
       [5.00000000e-01, 3.94088670e-01, 4.99004639e-06, 5.00000000e-01],
       [5.00000000e-01, 1.82266010e-01, 4.99004639e-06, 5.00000000e-01],
       ...,
       [1.00000000e+00, 9.60591133e-01, 1.12851818e-05, 2.50000000e-01],
       [1.00000000e+00, 1.00000000e+00, 1.66590780e-05, 2.50000000e-01],
       [1.00000000e+00, 9.35960591e-01, 7.98407423e-06, 1.00000000e+00]])

In [17]:
np.savez('data.npz', x=X, y=y)
print('File saved.')

File saved.


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3938, 4), (3938,), (985, 4), (985,))

In [28]:
y_train.value_counts()

4    1454
1    1151
5     880
3     220
6     149
0      42
2      42
Name: Disaster Type, dtype: int64

In [29]:
y_test.value_counts()

4    325
1    304
5    243
3     58
6     38
0      9
2      8
Name: Disaster Type, dtype: int64

In [19]:
model = GaussianNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))

Accuracy:  0.9908629441624366
Confusion Matrix:
 [[  0   0   0   0   0   0   9]
 [  0 304   0   0   0   0   0]
 [  0   0   8   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0   0   0   0   0 243   0]
 [  0   0   0   0   0   0  38]]


In [20]:
model = LogisticRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))

Accuracy:  0.9888324873096447
Confusion Matrix:
 [[  0   0   0   0   0   0   9]
 [  0 304   0   0   0   0   0]
 [  0   2   6   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0   0   0   0   0 243   0]
 [  0   0   0   0   0   0  38]]


In [21]:
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))

Accuracy:  0.9908629441624366
Confusion Matrix:
 [[  0   0   0   0   0   0   9]
 [  0 304   0   0   0   0   0]
 [  0   0   8   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0   0   0   0   0 243   0]
 [  0   0   0   0   0   0  38]]


In [22]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))

Accuracy:  0.9888324873096447
Confusion Matrix:
 [[  3   0   0   0   0   0   6]
 [  0 304   0   0   0   0   0]
 [  0   0   8   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0   0   0   0   0 243   0]
 [  5   0   0   0   0   0  33]]


In [23]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))
print('Classification Report:\n', classification_report(y_test, pred))

Accuracy:  0.9888324873096447
Confusion Matrix:
 [[  2   0   0   0   0   0   7]
 [  0 304   0   0   0   0   0]
 [  0   0   8   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0   0   0   0   0 243   0]
 [  4   0   0   0   0   0  34]]
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.22      0.27         9
           1       1.00      1.00      1.00       304
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        58
           4       1.00      1.00      1.00       325
           5       1.00      1.00      1.00       243
           6       0.83      0.89      0.86        38

    accuracy                           0.99       985
   macro avg       0.88      0.87      0.88       985
weighted avg       0.99      0.99      0.99       985



In [24]:
model = BaggingClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))
print('Classification Report:\n', classification_report(y_test, pred))

Accuracy:  0.9888324873096447
Confusion Matrix:
 [[  3   0   0   0   0   0   6]
 [  0 304   0   0   0   0   0]
 [  0   0   8   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0   0   0   0   0 243   0]
 [  5   0   0   0   0   0  33]]
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.33      0.35         9
           1       1.00      1.00      1.00       304
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        58
           4       1.00      1.00      1.00       325
           5       1.00      1.00      1.00       243
           6       0.85      0.87      0.86        38

    accuracy                           0.99       985
   macro avg       0.89      0.89      0.89       985
weighted avg       0.99      0.99      0.99       985



In [25]:
model = AdaBoostClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))
print('Classification Report:\n', classification_report(y_test, pred))

Accuracy:  0.7055837563451777
Confusion Matrix:
 [[  0   0   0   0   9   0   0]
 [  0 304   0   0   0   0   0]
 [  0   0   8   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0 243   0   0   0   0   0]
 [  0   0   0   0  38   0   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.56      1.00      0.71       304
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        58
           4       0.87      1.00      0.93       325
           5       0.00      0.00      0.00       243
           6       0.00      0.00      0.00        38

    accuracy                           0.71       985
   macro avg       0.49      0.57      0.52       985
weighted avg       0.53      0.71      0.60       985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
model = ExtraTreesClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))
print('Classification Report:\n', classification_report(y_test, pred))

Accuracy:  0.9878172588832488
Confusion Matrix:
 [[  3   0   0   0   0   0   6]
 [  0 304   0   0   0   0   0]
 [  0   0   8   0   0   0   0]
 [  0   0   0  58   0   0   0]
 [  0   0   0   0 325   0   0]
 [  0   0   0   0   0 243   0]
 [  6   0   0   0   0   0  32]]
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.33      0.33         9
           1       1.00      1.00      1.00       304
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        58
           4       1.00      1.00      1.00       325
           5       1.00      1.00      1.00       243
           6       0.84      0.84      0.84        38

    accuracy                           0.99       985
   macro avg       0.88      0.88      0.88       985
weighted avg       0.99      0.99      0.99       985



In [30]:
data = np.load('data.npz')
X = data['x']
y = data['y']
X.shape, y.shape

((4923, 4), (4923,))