In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The preprocessing function accepts a pandas dataframe with the "Year", "Month" and "Day" features for when a natural disaster
# occurred, the "Name" of the place where it occurred, and that place's "Latitude" and "Longitude" coordinates. The function
# returns a pandas dataframe with the date and coordinate features for every place and every day between the first and last
# year of the original dataframe (inclusive). It also has an added feature "target", which is 1 if a natural disaster occurred
# at that place on that day, and 0 otherwise.

def preprocessing_dataframe(disaster_df):
    """Expand a table of dated disasters into one labelled row per place per day.

    Parameters
    ----------
    disaster_df : pandas.DataFrame
        One row per recorded disaster, with the columns "Year", "Month",
        "Day", "Name", "Latitude" and "Longitude".

    Returns
    -------
    pandas.DataFrame
        Columns "Year", "Latitude", "Longitude", "Month", "Day" and "target".
        There is one row for every place (unique "Name") and every calendar
        day from 1 January of the earliest year to 31 December of the latest
        year found in ``disaster_df``. Rows are ordered by year, then place
        (in order of first appearance), then month, then day. "target" is 1
        when ``disaster_df`` records a disaster for that place on that date,
        and 0 otherwise.

    Notes
    -----
    * Each place is given the coordinates of its first row in ``disaster_df``.
    * Missing values are never matched: a row with a missing "Year", "Month"
      or "Day" cannot produce a positive target, and rows with a missing
      "Name" do not create a place.
    * An input with no usable year (for example an empty frame) yields an
      empty frame with the output columns.
    * The output has (number of places) x (number of days) rows, so it grows
      quickly; restricting the input to a few decades keeps it manageable.
    """
    print('Preprocessing ... ')
    print(' ')
    output_columns = ['Year', 'Latitude', 'Longitude', 'Month', 'Day', 'target']

    known_years = disaster_df['Year'].dropna()
    if known_years.empty:
        # Nothing to expand: return an empty frame that still has the expected columns
        print('Done!')
        return pd.DataFrame(columns=output_columns)

    # int() lets a float-typed "Year" column (e.g. 2000.0) drive range() below
    year_start = int(known_years.min())  # Earliest year in the dataframe
    year_end = int(known_years.max())  # Latest year in the dataframe

    # Coordinates of every place, taken from the first row in which the place appears
    first_rows = disaster_df.dropna(subset=['Name']).drop_duplicates(subset='Name')
    place_coords = {
        name: (lat, lng)
        for name, lat, lng in zip(first_rows['Name'], first_rows['Latitude'], first_rows['Longitude'])
    }

    # Every recorded (place, year, month, day) combination, hashed once so that each generated day
    # is labelled with a constant-time lookup instead of re-scanning the dataframe.
    # Float dates such as 6.0 hash and compare equal to the integers generated below.
    recorded_events = set(
        zip(disaster_df['Name'], disaster_df['Year'], disaster_df['Month'], disaster_df['Day'])
    )

    preprocessed_dict = {column: [] for column in output_columns}
    for year in range(year_start, year_end + 1):
        # Gregorian leap-year rule: divisible by 4, except centuries not divisible by 400
        is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
        month_days = [31, 29 if is_leap else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

        for place, (lat, lng) in place_coords.items():
            for month_number, days in enumerate(month_days, start=1):
                for day in range(1, days + 1):
                    preprocessed_dict['Year'].append(year)
                    preprocessed_dict['Latitude'].append(lat)
                    preprocessed_dict['Longitude'].append(lng)
                    preprocessed_dict['Month'].append(month_number)
                    preprocessed_dict['Day'].append(day)
                    # Target is 1 only if this exact place/date is present in the dataframe
                    is_recorded = (place, year, month_number, day) in recorded_events
                    preprocessed_dict['target'].append(1 if is_recorded else 0)

    preprocessed_df = pd.DataFrame(preprocessed_dict)  # Transforming to a dataframe
    print('Done!')
    return preprocessed_df

In [3]:
# Load the tab-separated data file. quoting=3 is csv.QUOTE_NONE, so quote characters are read as literal text.
volc = pd.read_csv(
    'data/txt/volerup.txt',
    sep='\t',
    quoting=3,
    encoding='utf-8',
)

In [4]:
# Keep only the date, place-name and coordinate columns
volc = volc.loc[
    :,
    ['Year', 'Month', 'Day', 'Name', 'Latitude', 'Longitude'],
]

In [5]:
volc.isnull().sum()

Year           0
Month        127
Day          188
Name           0
Latitude       0
Longitude      0
dtype: int64

In [6]:
# Drop every row that has at least one missing value (only "Month" and "Day" have any, per the counts above)
volc = volc.dropna(axis='index', how='any')

In [7]:
volc.duplicated().any()

False

In [8]:
# Restrict the data to events from the year 2000 onwards and preview the result
volc_2000 = volc.loc[volc['Year'] >= 2000]
volc_2000.head()

Unnamed: 0,Year,Month,Day,Name,Latitude,Longitude
718,2000,6.0,27.0,Miyake-jima,34.08,139.53
719,2000,7.0,27.0,Semeru,-8.108,112.92
720,2000,8.0,23.0,Arenal,10.463,-84.703
721,2000,11.0,3.0,Kilauea,19.425,-155.292
723,2001,5.0,14.0,Etna,37.734,15.004


In [9]:
# Renumber the filtered rows from 0, rebinding the name rather than mutating the frame in place
volc_2000 = volc_2000.reset_index(drop=True)

In [10]:
volc_2000.shape

(109, 6)

In [11]:
# Expand the filtered events into one labelled row per place per day (target = 1 on days with a recorded event)
preprocessed_volc = preprocessing_dataframe(volc_2000)

Preprocessing ... 
 
Done!


In [12]:
preprocessed_volc.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
0,2000,34.08,139.53,1,1,0
1,2000,34.08,139.53,1,2,0
2,2000,34.08,139.53,1,3,0
3,2000,34.08,139.53,1,4,0
4,2000,34.08,139.53,1,5,0


In [13]:
preprocessed_volc.shape

(460260, 6)

In [14]:
len(preprocessed_volc[preprocessed_volc['target'] == 1])

109

In [15]:
len(preprocessed_volc[preprocessed_volc['target'] == 0])

460151

In [16]:
# Split the expanded frame by label: rows without an event (majority) and rows with one (minority)
majority = preprocessed_volc.loc[preprocessed_volc['target'] == 0]
minority = preprocessed_volc.loc[preprocessed_volc['target'] == 1]

In [17]:
majority.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
0,2000,34.08,139.53,1,1,0
1,2000,34.08,139.53,1,2,0
2,2000,34.08,139.53,1,3,0
3,2000,34.08,139.53,1,4,0
4,2000,34.08,139.53,1,5,0


In [18]:
minority.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
178,2000,34.08,139.53,6,27,1
574,2000,-8.108,112.92,7,27,1
967,2000,10.463,-84.703,8,23,1
1405,2000,19.425,-155.292,11,3,1
23553,2001,37.734,15.004,5,14,1


In [19]:
from sklearn.utils import resample

In [20]:
# Upsample the minority class (sampling with replacement) until it has as many rows as the majority class.
# The size is derived from the data instead of being hard-coded, so it stays correct if the input changes.
min_upsamp = resample(minority, replace=True, n_samples=len(majority), random_state=1)

In [21]:
min_upsamp.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
221952,2010,2.78,125.48,8,6,1
458213,2020,-8.058,114.242,5,29,1
47451,2002,-0.078,-77.656,11,3,1
344911,2015,-41.326,-72.614,4,22,1
46413,2002,38.789,15.213,12,30,1


In [22]:
min_upsamp.shape

(460151, 6)

In [23]:
# Stack the majority rows on top of the upsampled minority rows to form one balanced frame
upsampled_data = pd.concat(
    [majority, min_upsamp],
    axis=0,
)

In [24]:
upsampled_data.shape

(920302, 6)

In [25]:
# Shuffle the dataframe. The fixed seed keeps the row order, and with it the train/test split and
# every reported metric, reproducible from run to run.
upsampled_data = upsampled_data.sample(frac=1, random_state=0).reset_index(drop=True)

In [26]:
# Features are every column except the label (Year, Latitude, Longitude, Month, Day); the label is "target"
x = upsampled_data.drop(columns='target')
y = upsampled_data['target']

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 25% of the rows for testing, with a fixed seed for the split.
# NOTE(review): upsampled_data already contains many resampled copies of each minority row, so identical
# positive rows end up in both the training and the test set; scores measured on x_test are optimistic.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 0)

In [28]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply them to both splits
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

### Using Naive Bayes

In [29]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes classifier on the scaled training data
classifier= GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [30]:
y_pred = classifier.predict(x_test)

In [31]:
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)

In [32]:
cm

array([[60949, 54263],
       [39867, 74997]], dtype=int64)

In [33]:
# accuracy_score expects (y_true, y_pred), the same argument order used for confusion_matrix above
print(accuracy_score(y_test, y_pred))

0.5908743197899824


In [34]:
check = classification_report(y_test, y_pred)
print(check)

              precision    recall  f1-score   support

           0       0.60      0.53      0.56    115212
           1       0.58      0.65      0.61    114864

    accuracy                           0.59    230076
   macro avg       0.59      0.59      0.59    230076
weighted avg       0.59      0.59      0.59    230076



In [35]:
from sklearn.metrics import roc_auc_score

In [36]:
# Predict class probabilities (one column per class)
y_pred_2 = classifier.predict_proba(x_test)

# Keep only the positive class: column 1 of the probability matrix, taken with a single array slice
y_pred_2 = y_pred_2[:, 1]

In [37]:
print(roc_auc_score(y_test, y_pred_2))

0.6114285589491868


### Using Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier
# Fit a small random forest (10 trees, entropy split criterion, fixed seed) on the same scaled training data
classifier1 = RandomForestClassifier(n_estimators = 10, criterion= 'entropy' , random_state =0)
classifier1.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [39]:
y_pred1 = classifier1.predict(x_test)

In [40]:
cm2 = confusion_matrix(y_test, y_pred1)

In [41]:
cm2

array([[115208,      4],
       [     0, 114864]], dtype=int64)

In [42]:
# accuracy_score expects (y_true, y_pred), the same argument order used for confusion_matrix above
print(accuracy_score(y_test, y_pred1))

0.9999826144404458


In [43]:
# Predict class probabilities (one column per class)
y_pred_3 = classifier1.predict_proba(x_test)

# Keep only the positive class: column 1 of the probability matrix, taken with a single array slice
y_pred_3 = y_pred_3[:, 1]

In [44]:
print(roc_auc_score(y_test, y_pred_3))

1.0


In [45]:
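# Looks good, but the near-perfect score is almost certainly not real skill. The minority class was upsampled with
# replacement before the train/test split, so copies of the same positive rows sit in both the training and the test
# set and the forest can simply memorise them (a data leak on top of the usual overfitting of tree-based models).
# Splitting first and upsampling only the training set would give an honest estimate.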

In [46]:
# Reference on handling imbalanced classes: https://elitedatascience.com/imbalanced-classes