In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The preprocessing function accepts a pandas dataframe with the "Year", "Month" and "Day" features for when a natural disaster
# occured as well as the "Longitude" and "latitude" coordinates. The function returns a pandas dataframe with the mentioned 
# features but for all days within the starting and ending year of the original dataframe. It also has an added feature 
# "target" which is a 1 if a natural disaster occured on that day, and a 0 otherwise.

def preprocessing_dataframe(disaster_df):
    preprocessed_dict = {'Year': [], 'Latitude':[], 'Longitude': [], 'Month': [], 'Day': [], 'target': []} # Starting with a dictionary to hold all values, but will later change to a pandas dataframe
    # Creating a dictionary that stores the latitude and longitude values for each specific place in the dataframe
    print('Preprocessing ... ')
    print(' ')
    place_coords = {}
    for place in disaster_df['Name'].unique():
        lat = disaster_df[disaster_df['Name'] == place]['Latitude'].unique()[0]
        lng = disaster_df[disaster_df['Name'] == place]['Longitude'].unique()[0]
        place_coords[place] = (lat, lng)
        
    # All places with their respective coordinates are now stored in the "place_coords" dictionary
    
    year_start = disaster_df['Year'].unique().min() # Getting the earliest year in the dataframe
    year_end = disaster_df['Year'].unique().max() # Getting the last year in the dataframe
    
    # Now, I'll iterate through all the years in order to assign the targets
    for year in range(year_start, year_end+1):  
        year_df = disaster_df[disaster_df['Year'] == year] # Dataframe for disasters happening in year "year" 
        
        # I'll have to account for all the days of the months in the year, which are usually 30 and 31 except February
        # Assigning the number of days for a specific year in the month of February is dependent on if the year is a leap year 
        # or not, where the number of days will be 29 or 28 respectively.
        
        month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] # List containing number of days for each month of the year accordingly, i.e index 0 or January with 31 days. This is the list of days assuming it is not a leap year
        if year%4 == 0:
            if year%100 != 0:
                month_days[1] = 29
            else:
                if year%400 == 0:
                    month_days[1] = 29
        
        # Now, the "month_days" list's index "1" will remain 28 if it is not a leap year, and be changed to 29 if it is indeed
        # a leap year
        
        # Would also need to iterate through all the places in the dataframe
        for place in place_coords:
            place_df = year_df[year_df['Name'] == place] # DataFrame for observations of only the place "place" 
            month_number = 1 # This is supposed to be January
            
            #Similar, iterating through all months...
            for days in month_days:
                month_df = place_df[place_df['Month'] == month_number] # DataFrame containing observations of only the month "month"
                
                # Iterating through all the days in the month...
                for day in range(1, days+1):
                    preprocessed_dict['Year'].append(year)
                    preprocessed_dict['Latitude'].append(place_coords[place][0])
                    preprocessed_dict['Longitude'].append(place_coords[place][1])
                    preprocessed_dict['Month'].append(month_number)
                    preprocessed_dict['Day'].append(day)
                    # And finally, if the particular date is present in the dataframe, the target is set to 1, and 0 otherwise
                    if place in year_df['Name'].unique() and month_number in place_df['Month'].unique() and day in month_df['Day'].unique():
                        preprocessed_dict['target'].append(1)
                    else:
                        preprocessed_dict['target'].append(0)
                month_number += 1
                
    preprocessed_df = pd.DataFrame(preprocessed_dict) # Transforming to a dataframe
    
    # Things to note: The function doesn't consider nan values, so if there is a nan value in any of the date features it will
    # set the target to 0. Also, the preprocessed dataframe can be very large without care, so maybe sticking to 40, 50 years
    # at most will be desirable. Also helps that for latter years, there's a lot less nan values. But could also edit it to
    # perform a task if there is are nan values present.
    print('Done!')
    return preprocessed_df 

In [3]:
volc = pd.read_csv('data/txt/volerup.txt',delimiter = '\t', quoting = 3, encoding='utf-8')

In [4]:
volc  = volc[['Year','Month','Day','Name','Latitude','Longitude']]

In [5]:
volc.head()

Unnamed: 0,Year,Month,Day,Name,Latitude,Longitude
0,-4360,,,Macauley Island,-30.2,-178.47
1,-4350,,,Kikai,30.78,130.28
2,-4050,,,Masaya,11.984,-86.161
3,-4000,,,Pago,-5.58,150.52
4,-3580,,,Taal,14.002,120.993


In [6]:
volc.shape

(835, 6)

In [7]:
volc.isnull().sum()

Year           0
Month        127
Day          188
Name           0
Latitude       0
Longitude      0
dtype: int64

In [8]:
volc = volc.dropna()

In [9]:
volc.duplicated().any()

False

In [108]:
volc_2000= volc[volc['Year'] >= 1980]
volc_2000.head()

Unnamed: 0,Year,Month,Day,Name,Latitude,Longitude
608,1980,5.0,18.0,St. Helens,46.2,-122.18
609,1980,8.0,17.0,Hekla,63.98,-19.7
610,1980,12.0,25.0,Hood,45.374,-121.694
611,1981,3.0,17.0,Etna,37.734,15.004
612,1981,3.0,29.0,Semeru,-8.108,112.92


In [109]:
volc_2000.reset_index(drop=True, inplace=True)

In [110]:
volc_2000.head()

Unnamed: 0,Year,Month,Day,Name,Latitude,Longitude
0,1980,5.0,18.0,St. Helens,46.2,-122.18
1,1980,8.0,17.0,Hekla,63.98,-19.7
2,1980,12.0,25.0,Hood,45.374,-121.694
3,1981,3.0,17.0,Etna,37.734,15.004
4,1981,3.0,29.0,Semeru,-8.108,112.92


In [113]:
volc_2000.shape

(214, 6)

In [114]:
preprocessed_volc = preprocessing_dataframe(volc_2000)

Preprocessing ... 
 
Done!


In [115]:
preprocessed_volc.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
0,1980,46.2,-122.18,1,1,0
1,1980,46.2,-122.18,1,2,0
2,1980,46.2,-122.18,1,3,0
3,1980,46.2,-122.18,1,4,0
4,1980,46.2,-122.18,1,5,0


In [116]:
preprocessed_volc.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
0,1980,46.2,-122.18,1,1,0
1,1980,46.2,-122.18,1,2,0
2,1980,46.2,-122.18,1,3,0
3,1980,46.2,-122.18,1,4,0
4,1980,46.2,-122.18,1,5,0


In [117]:
preprocessed_volc.shape

(1512576, 6)

In [118]:
normalized_volc = pd.DataFrame({}) # Dataframe to store normalized x, y, z coordinates

In [119]:
# Changing lat lon values to x, y, z coordinates
def find_x(lat, lon):
    return np.cos(np.deg2rad(lat))*np.cos(np.deg2rad(lon)) # x = cos(lat)*cos(lon)

def find_y(lat, lon):
    return np.cos(np.deg2rad(lat))*np.sin(np.deg2rad(lon)) # y = cos(lat)*sin(lon)

def find_z(lat):
    return np.sin(np.deg2rad(lat)) # z = sin(lat)

In [120]:
normalized_volc['Year'] = preprocessed_volc['Year']
normalized_volc['x'] = preprocessed_volc.apply(lambda x: find_x(x['Latitude'], x['Longitude']), axis=1)
normalized_volc['y'] = preprocessed_volc.apply(lambda x: find_x(x['Latitude'], x['Longitude']), axis=1)
normalized_volc['z'] = preprocessed_volc['Latitude'].apply(find_z)
normalized_volc['Month'] = preprocessed_volc['Month']
normalized_volc['Day'] = preprocessed_volc['Day']
normalized_volc['target'] = preprocessed_volc['target']

In [121]:
normalized_volc.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
0,1980,-0.368622,-0.368622,0.72176,1,1,0
1,1980,-0.368622,-0.368622,0.72176,1,2,0
2,1980,-0.368622,-0.368622,0.72176,1,3,0
3,1980,-0.368622,-0.368622,0.72176,1,4,0
4,1980,-0.368622,-0.368622,0.72176,1,5,0


In [122]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [183]:
# Taking out test data before upsampling
train_data = normalized_volc[normalized_volc['Year'] < 2019] # Training the model from 1980-2018
test_data = normalized_volc[normalized_volc['Year'] >= 2019] # Testing on 2019-2020

In [184]:
train_data.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
0,1980,-0.368622,-0.368622,0.72176,1,1,0
1,1980,-0.368622,-0.368622,0.72176,1,2,0
2,1980,-0.368622,-0.368622,0.72176,1,3,0
3,1980,-0.368622,-0.368622,0.72176,1,4,0
4,1980,-0.368622,-0.368622,0.72176,1,5,0


In [185]:
train_data.shape

(1438745, 7)

In [186]:
len(train_data[train_data['target'] == 1])

205

In [187]:
num_zero_targets = len(train_data[train_data['target'] == 0])
num_zero_targets

1438540

In [188]:
majority = train_data[train_data['target'] == 0]
minority = train_data[train_data['target'] == 1]

In [189]:
majority.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
0,1980,-0.368622,-0.368622,0.72176,1,1,0
1,1980,-0.368622,-0.368622,0.72176,1,2,0
2,1980,-0.368622,-0.368622,0.72176,1,3,0
3,1980,-0.368622,-0.368622,0.72176,1,4,0
4,1980,-0.368622,-0.368622,0.72176,1,5,0


In [190]:
minority.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
138,1980,-0.368622,-0.368622,0.72176,5,18,1
595,1980,0.413009,0.413009,0.898641,8,17,1
1091,1980,-0.369069,-0.369069,0.711707,12,25,1
38136,1981,0.763898,0.763898,0.611996,3,17,1
38513,1981,-0.385553,-0.385553,-0.141039,3,29,1


In [191]:
from sklearn.utils import resample

In [192]:
# Upsample minority class
min_upsamp = resample(minority, replace=True, n_samples=num_zero_targets, random_state=1) 

In [193]:
min_upsamp.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
342180,1989,-0.555339,-0.555339,0.528883,8,26,1
1098444,2009,-0.438043,-0.438043,0.870184,3,23,1
526152,1994,-0.55207,-0.55207,0.542881,5,29,1
1060865,2008,0.218736,0.218736,-0.679864,5,2,1
1436999,2018,-0.406538,-0.406538,-0.140175,3,21,1


In [194]:
min_upsamp.shape

(1438540, 7)

In [195]:
upsampled_data = pd.concat([majority, min_upsamp])

In [196]:
upsampled_data.shape

(2877080, 7)

In [197]:
upsampled_data = upsampled_data.sample(frac=1).reset_index(drop=True) # to shuffle the dataframe

In [198]:
upsampled_data.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
0,2009,-0.285884,-0.285884,-0.117017,4,2,0
1,2011,0.00627,0.00627,0.239432,10,30,0
2,2007,-0.285884,-0.285884,-0.117017,7,7,1
3,2012,-0.528876,-0.528876,0.827375,12,12,1
4,2011,-0.521251,-0.521251,-0.152261,4,12,0


In [199]:
# There's 6 features now
x_train = upsampled_data.iloc[:, 0:6]
y_train = upsampled_data.iloc[:, 6]

In [200]:
x_test = test_data.drop(columns='target')
y_test = test_data['target']

In [201]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
scaled_x_train = sc.fit_transform(x_train)
scaled_x_test =  sc.transform(x_test)

### Using Naive Bayes

In [202]:
from sklearn.naive_bayes import GaussianNB
classifier= GaussianNB()
classifier.fit(scaled_x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [203]:
y_pred = classifier.predict(scaled_x_test)

In [204]:
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)

In [205]:
cm

array([[15147, 58675],
       [    1,     8]], dtype=int64)

In [206]:
print(accuracy_score(y_pred, y_test))

0.20526608064363208


In [207]:
y_pred.shape

(73831,)

In [208]:
y_test.shape

(73831,)

In [209]:
y_test.sum()

9

In [210]:
y_pred.sum()

58683

In [211]:
check = classification_report(y_test, y_pred)
print(check)

              precision    recall  f1-score   support

           0       1.00      0.21      0.34     73822
           1       0.00      0.89      0.00         9

    accuracy                           0.21     73831
   macro avg       0.50      0.55      0.17     73831
weighted avg       1.00      0.21      0.34     73831



In [212]:
from sklearn.metrics import roc_auc_score

In [213]:
# Predict class probabilities
y_pred_2 = classifier.predict_proba(x_test)
 
# Keep only the positive class
y_pred_2 = [y[1] for y in y_pred_2]

In [214]:
print(roc_auc_score(y_test, y_pred_2))

0.5


### Using Random Forest

In [215]:
from sklearn.ensemble import RandomForestClassifier
classifier1 = RandomForestClassifier(n_estimators=10, criterion='entropy' , random_state=0)
classifier1.fit(scaled_x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [216]:
y_pred1 = classifier1.predict(scaled_x_test)

In [217]:
cm2 = confusion_matrix(y_test, y_pred1)

In [218]:
cm2

array([[73794,    28],
       [    9,     0]], dtype=int64)

In [219]:
print(accuracy_score(y_pred1, y_test))

0.9994988554943045


In [220]:
y_pred1.sum()

28

In [221]:
y_test.sum()

9

In [222]:
t = []
for i in range(len(y_pred1)):
    if y_pred1[i] == 1:
        t.append(x_test.iloc[i])
t # Dates and places that the model predicted disasters will happen

[Year     2019.000000
 x           0.763898
 y           0.763898
 z           0.611996
 Month      12.000000
 Day        26.000000
 Name: 1440199, dtype: float64,
 Year     2019.000000
 x          -0.539846
 y          -0.539846
 z           0.229319
 Month       1.000000
 Day        13.000000
 Name: 1440947, dtype: float64,
 Year     2019.000000
 x          -0.856739
 y          -0.856739
 z           0.332573
 Month       2.000000
 Day         1.000000
 Name: 1455931, dtype: float64,
 Year     2019.000000
 x          -0.856739
 y          -0.856739
 z           0.332573
 Month       4.000000
 Day        28.000000
 Name: 1456017, dtype: float64,
 Year     2019.000000
 x          -0.856739
 y          -0.856739
 z           0.332573
 Month       7.000000
 Day        16.000000
 Name: 1456096, dtype: float64,
 Year     2019.000000
 x          -0.264436
 y          -0.264436
 z          -0.106299
 Month      12.000000
 Day        22.000000
 Name: 1456620, dtype: float64,
 Year     2019.0

In [223]:
v = []
p = y_test.to_list()
for i in range(len(p)):
    if p[i] == 1:
        v.append(x_test.iloc[i])
v # Dates and places that disasters actually happened. Not much of a difference

[Year     2019.000000
 x           0.752144
 y           0.752144
 z           0.626454
 Month       7.000000
 Day         3.000000
 Name: 1445498, dtype: float64,
 Year     2019.000000
 x           0.752144
 y           0.752144
 z           0.626454
 Month       8.000000
 Day        28.000000
 Name: 1445554, dtype: float64,
 Year     2019.000000
 x          -0.817664
 y          -0.817664
 z          -0.071497
 Month       1.000000
 Day         7.000000
 Name: 1461016, dtype: float64,
 Year     2019.000000
 x          -0.817664
 y          -0.817664
 z          -0.071497
 Month       6.000000
 Day        28.000000
 Name: 1461188, dtype: float64,
 Year     2019.000000
 x          -0.145722
 y          -0.145722
 z           0.055299
 Month       5.000000
 Day         7.000000
 Name: 1468436, dtype: float64,
 Year     2019.000000
 x          -0.873992
 y          -0.873992
 z          -0.088025
 Month       6.000000
 Day        26.000000
 Name: 1474691, dtype: float64,
 Year     2019.0

In [224]:
y_train.sum()

1438540

In [225]:
# Predict class probabilities
y_pred_3 = classifier1.predict_proba(x_test)
 
# Keep only the positive class
y_pred_3 = [y[1] for y in y_pred_3]

In [226]:
print(roc_auc_score(y_test, y_pred_3))

0.5


In [45]:
# Looks good, but knowing tree based algorithms it probably overfitted.

In [46]:
#https://elitedatascience.com/imbalanced-classes