In [62]:
import pandas as pd

In [None]:
df = pd.read_csv("data/volcano-events.csv")


In [64]:
df.isnull().sum()

Year                                    0
Month                                 131
Day                                   191
Name                                    0
Location                                0
Country                                 0
Latitude                                0
Longitude                               0
Elevation (m)                           0
Type                                    0
VEI                                   175
Agent                                 362
Deaths                                436
Death Description                     308
Missing                               865
Missing Description                   863
Injuries                              777
Injuries Description                  753
Damage ($Mil)                         855
Damage Description                    631
Houses Destroyed                      832
Houses Destroyed Description          754
Total Deaths                          414
Total Death Description           

In [65]:
df.nunique()

Year                                  364
Month                                  12
Day                                    31
Name                                  272
Location                               78
Country                                50
Latitude                              273
Longitude                             274
Elevation (m)                         262
Type                                   20
VEI                                     8
Agent                                  74
Deaths                                124
Death Description                       4
Missing                                11
Missing Description                     4
Injuries                               46
Injuries Description                    4
Damage ($Mil)                          21
Damage Description                      4
Houses Destroyed                       36
Houses Destroyed Description            4
Total Deaths                          136
Total Death Description           

### Location

In [66]:
location_missing = df.isnull()["Location"].astype(int)

In [67]:
location_counts = df['Location'].value_counts()
location_counts

Location
Java                 104
Italy                 60
Honshu-Japan          52
Kyushu-Japan          45
Luzon-Philippines     39
                    ... 
Galapagos              1
Antarctica             1
Luzon-N of             1
Philippines-C          1
US-Wyoming             1
Name: count, Length: 78, dtype: int64

In [68]:
common_locations = location_counts[location_counts > 20].index
df['Location_processed'] = df['Location'].where(df['Location'].isin(common_locations), 'other')
location_dummies = df['Location_processed'].str.get_dummies()
location_dummies

Unnamed: 0,Ecuador,Honshu-Japan,Iceland-S,Italy,Izu Is-Japan,Java,Kyushu-Japan,Lesser Sunda Is,Luzon-Philippines,New Guinea-NE of,Sangihe Is-Indonesia,other
0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
871,0,0,0,0,0,1,0,0,0,0,0,0
872,0,0,0,0,0,0,0,0,0,0,0,1
873,0,0,0,0,0,0,0,0,0,0,0,1
874,0,0,0,0,0,0,0,0,0,0,0,1


### Type

In [69]:
type_missing = df.isnull()["Type"].astype(int)

In [70]:
type_value_counts = df['Type'].value_counts()
type_value_counts

Type
Stratovolcano          561
Complex volcano         92
Caldera                 69
Shield volcano          69
Submarine volcano       22
Subglacial volcano      15
Pyroclastic shield      12
Lava dome                9
Fissure vent             4
Tuff cone                4
Pyroclastic cone         3
Volcanic field           3
Crater rows              2
Lava cone                2
Compound volcano         2
Cinder cone              2
Maar                     2
Pumice cone              1
Mud volcano              1
Submarine volcanoes      1
Name: count, dtype: int64

In [71]:
common_types = type_value_counts[type_value_counts > 10].index
df['Type_processed'] = df['Type'].where(df['Type'].isin(common_types), 'other')
df["Type_processed"]
type_dummies = df["Type_processed"].str.get_dummies()
type_dummies

Unnamed: 0,Caldera,Complex volcano,Pyroclastic shield,Shield volcano,Stratovolcano,Subglacial volcano,Submarine volcano,other
0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
871,0,0,0,0,1,0,0,0
872,0,0,0,0,1,0,0,0
873,0,0,0,0,0,0,0,1
874,0,1,0,0,0,0,0,0


### Agent

In [72]:
agent_missing = df.isnull()["Agent"].astype(int)

In [73]:
df['Agent'].value_counts()

Agent
T         112
P          62
M          54
G          42
m          34
         ... 
T,G,M       1
S,T,G       1
A,P,T       1
G,W, S      1
L,I,S       1
Name: count, Length: 74, dtype: int64

In [74]:
agent_dummies = df["Agent"].str.get_dummies(sep=",")
agent_dummies = agent_dummies.drop(columns=["?"], errors="ignore")
agent_dummies

Unnamed: 0,S,m,A,E,F,G,I,L,M,P,S.1,T,W,m.1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,0,0,0,0,0,0,0,0,1,0,0,1,0,0
872,0,0,0,0,0,0,0,0,0,0,0,0,1,0
873,0,0,0,0,0,1,0,0,0,0,0,1,0,0
874,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [75]:
agent_dummies.columns

Index([' S', ' m', 'A', 'E', 'F', 'G', 'I', 'L', 'M', 'P', 'S', 'T', 'W', 'm'], dtype='object')

### Standardization

In [76]:
columns_to_drop = ["Name", "Location", "Location_processed", "Country", "Agent", "Type", "Type_processed"]
df_dropped = df.drop(columns=columns_to_drop)
df_dropped

Unnamed: 0,Year,Month,Day,Latitude,Longitude,Elevation (m),VEI,Deaths,Death Description,Missing,...,Total Deaths,Total Death Description,Total Missing,Total Missing Description,Total Injuries,Total Injuries Description,Total Damage ($Mil),Total Damage Description,Total Houses Destroyed,Total Houses Destroyed Description
0,-4360,,,-30.210,-178.475,238,6.0,,,,...,,,,,,,,,,
1,-4350,,,30.793,130.305,704,7.0,,3.0,,...,,3.0,,,,,,3.0,,3.0
2,-4050,,,11.985,-86.165,594,6.0,,,,...,,,,,,,,,,
3,-4000,,,-5.576,150.516,724,6.0,,1.0,,...,,1.0,,,,,,1.0,,
4,-3580,,,14.002,120.993,311,6.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,2022,12.0,4.0,-8.108,112.922,3657,4.0,,,,...,,,,,,,,1.0,,
872,2023,1.0,31.0,-16.680,168.370,833,,,,,...,,,,,,,,,,
873,2023,7.0,7.0,-6.137,155.196,1855,2.0,,,,...,,,,,,,,1.0,,1.0
874,2023,12.0,3.0,-0.380,100.474,2885,,23.0,1.0,0.0,...,23.0,1.0,0.0,,12.0,1.0,,,,


In [77]:
df_missing = df_dropped.isnull().astype(int).add_suffix('_missing')

In [78]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [79]:
features = df_dropped.drop(columns=['VEI'])
target = df_dropped['VEI']

scaled_features = scaler.fit_transform(features)
df_scaled = pd.DataFrame(scaled_features, columns=features.columns, index=df_dropped.index)
df_scaled['VEI'] = target

In [80]:
import joblib
joblib.dump(scaler, "./meta/scaler.pkl")

['./meta/scaler.pkl']

In [81]:
df_scaled = df_scaled.fillna(0.0)
df_scaled

Unnamed: 0,Year,Month,Day,Latitude,Longitude,Elevation (m),Deaths,Death Description,Missing,Missing Description,...,Total Death Description,Total Missing,Total Missing Description,Total Injuries,Total Injuries Description,Total Damage ($Mil),Total Damage Description,Total Houses Destroyed,Total Houses Destroyed Description,VEI
0,-8.492242,0.000000,0.000000,-1.761861,-2.287181,-1.367074,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,6.0
1,-8.478298,0.000000,0.000000,0.611068,0.766186,-0.996280,0.000000,1.492581,0.000000,0.0,...,1.383782,0.000000,0.0,0.000000,0.000000,0.0,1.244217,0.000000,0.749059,7.0
2,-8.059990,0.000000,0.000000,-0.120536,-1.374375,-1.083807,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,6.0
3,-7.990272,0.000000,0.000000,-0.803634,0.966042,-0.980366,0.000000,-0.577714,0.000000,0.0,...,-0.611282,0.000000,0.0,0.000000,0.000000,0.0,-0.803718,0.000000,0.000000,6.0
4,-7.404640,0.000000,0.000000,-0.042078,0.674104,-1.308988,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,0.406578,1.659707,-1.241673,-0.902125,0.594294,1.353407,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,-0.803718,0.000000,0.000000,4.0
872,0.407972,-1.625631,1.731879,-1.235564,1.142591,-0.893636,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
873,0.407972,0.166372,-0.911278,-0.825456,1.012320,-0.080435,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,-0.803718,0.000000,-1.142633,2.0
874,0.407972,1.659707,-1.351805,-0.601517,0.471202,0.739131,-0.173071,-0.577714,-0.554848,0.0,...,-0.611282,-0.556589,0.0,-0.144802,-0.583272,0.0,0.000000,0.000000,0.000000,0.0


In [82]:
import json
with open("./meta/column_names.json", "w") as f:
    json.dump(df_scaled.columns.to_list(), f)

In [83]:
df_missing["Location_missing"] = location_missing
df_missing["Type_missing"] = type_missing
df_missing["Agent_missing"] = agent_missing

In [84]:
final_df = pd.concat([df_scaled, location_dummies, type_dummies, agent_dummies, df_missing], axis=1)
final_df

Unnamed: 0,Year,Month,Day,Latitude,Longitude,Elevation (m),Deaths,Death Description,Missing,Missing Description,...,Total Missing Description_missing,Total Injuries_missing,Total Injuries Description_missing,Total Damage ($Mil)_missing,Total Damage Description_missing,Total Houses Destroyed_missing,Total Houses Destroyed Description_missing,Location_missing,Type_missing,Agent_missing
0,-8.492242,0.000000,0.000000,-1.761861,-2.287181,-1.367074,0.000000,0.000000,0.000000,0.0,...,1,1,1,1,1,1,1,0,0,1
1,-8.478298,0.000000,0.000000,0.611068,0.766186,-0.996280,0.000000,1.492581,0.000000,0.0,...,1,1,1,1,0,1,0,0,0,0
2,-8.059990,0.000000,0.000000,-0.120536,-1.374375,-1.083807,0.000000,0.000000,0.000000,0.0,...,1,1,1,1,1,1,1,0,0,1
3,-7.990272,0.000000,0.000000,-0.803634,0.966042,-0.980366,0.000000,-0.577714,0.000000,0.0,...,1,1,1,1,0,1,1,0,0,0
4,-7.404640,0.000000,0.000000,-0.042078,0.674104,-1.308988,0.000000,0.000000,0.000000,0.0,...,1,1,1,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,0.406578,1.659707,-1.241673,-0.902125,0.594294,1.353407,0.000000,0.000000,0.000000,0.0,...,1,1,1,1,0,1,1,0,0,0
872,0.407972,-1.625631,1.731879,-1.235564,1.142591,-0.893636,0.000000,0.000000,0.000000,0.0,...,1,1,1,1,1,1,1,0,0,0
873,0.407972,0.166372,-0.911278,-0.825456,1.012320,-0.080435,0.000000,0.000000,0.000000,0.0,...,1,1,1,1,0,1,0,0,0,0
874,0.407972,1.659707,-1.351805,-0.601517,0.471202,0.739131,-0.173071,-0.577714,-0.554848,0.0,...,1,0,0,1,1,1,1,0,0,0


In [85]:
usable_examples = final_df[final_df["VEI_missing"]==0]
unusable_examples = final_df[final_df["VEI_missing"]==1]
usable_examples = usable_examples.drop(columns=['VEI_missing'])
usable_examples

Unnamed: 0,Year,Month,Day,Latitude,Longitude,Elevation (m),Deaths,Death Description,Missing,Missing Description,...,Total Missing Description_missing,Total Injuries_missing,Total Injuries Description_missing,Total Damage ($Mil)_missing,Total Damage Description_missing,Total Houses Destroyed_missing,Total Houses Destroyed Description_missing,Location_missing,Type_missing,Agent_missing
0,-8.492242,0.000000,0.000000,-1.761861,-2.287181,-1.367074,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
1,-8.478298,0.000000,0.000000,0.611068,0.766186,-0.996280,0.000000,1.492581,0.0,0.0,...,1,1,1,1,0,1,0,0,0,0
2,-8.059990,0.000000,0.000000,-0.120536,-1.374375,-1.083807,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
3,-7.990272,0.000000,0.000000,-0.803634,0.966042,-0.980366,0.000000,-0.577714,0.0,0.0,...,1,1,1,1,0,1,1,0,0,0
4,-7.404640,0.000000,0.000000,-0.042078,0.674104,-1.308988,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,0.406578,-0.132296,0.740695,0.153232,-1.497556,2.734734,-0.182165,-0.577714,0.0,0.0,...,1,1,1,1,1,1,1,0,0,0
869,0.406578,1.062373,-0.691015,0.922101,-0.371900,-0.821227,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,0
870,0.406578,1.659707,-1.241673,0.922101,-0.371900,-0.821227,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,0
871,0.406578,1.659707,-1.241673,-0.902125,0.594294,1.353407,0.000000,0.000000,0.0,0.0,...,1,1,1,1,0,1,1,0,0,0


In [86]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(usable_examples, test_size=0.2)
X_train

Unnamed: 0,Year,Month,Day,Latitude,Longitude,Elevation (m),Deaths,Death Description,Missing,Missing Description,...,Total Missing Description_missing,Total Injuries_missing,Total Injuries Description_missing,Total Damage ($Mil)_missing,Total Damage Description_missing,Total Houses Destroyed_missing,Total Houses Destroyed Description_missing,Location_missing,Type_missing,Agent_missing
113,-0.170688,0.166372,0.300169,-0.555617,0.736767,-0.191832,0.000000,0.000000,0.0,0.0,...,1,1,1,1,0,1,1,0,0,1
237,0.120733,0.000000,0.000000,-0.902553,0.605508,1.037516,0.000000,-0.577714,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
359,0.228099,1.062373,0.000000,-1.218837,1.140119,-0.494992,-0.178445,-0.577714,0.0,0.0,...,1,1,1,1,1,1,1,0,0,0
25,-2.482541,0.000000,0.000000,-0.110539,-1.376125,-1.144279,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
356,0.225310,0.465039,1.621747,-1.354982,-2.253511,-1.146667,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,-0.158139,0.000000,0.000000,-0.023757,-1.420999,1.437751,0.000000,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
326,0.200212,0.166372,0.410300,-0.042078,0.674104,-1.308988,0.000000,-0.577714,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1
93,-0.226463,0.000000,0.000000,-0.941879,1.058044,-1.170537,0.231213,2.527729,0.0,0.0,...,1,1,1,1,1,1,1,0,0,0
169,-0.017309,-1.326964,-1.572068,0.909692,-0.803166,0.314230,-0.181752,-0.577714,0.0,0.0,...,1,1,1,1,1,1,1,0,0,1


In [87]:
X_train_array = X_train.drop(columns=['VEI']).to_numpy()
y_train_array = X_train['VEI'].to_numpy()

X_test_array = X_test.drop(columns=['VEI']).to_numpy()
y_test_array = X_test['VEI'].to_numpy()


In [90]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_array, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.float32)  # or torch.long if classification

X_test_tensor = torch.tensor(X_test_array, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_array, dtype=torch.float32)

# Create TensorDataset objects
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Optional: create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Save datasets to disk
torch.save(train_dataset, './data/train_dataset.pt')
torch.save(test_dataset, './data/test_dataset.pt')