In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The preprocessing function accepts a pandas dataframe with the "Year", "Month" and "Day" features for when a natural disaster
# occured as well as the "Longitude" and "latitude" coordinates. The function returns a pandas dataframe with the mentioned 
# features but for all days within the starting and ending year of the original dataframe. It also has an added feature 
# "target" which is a 1 if a natural disaster occured on that day, and a 0 otherwise.

def preprocessing_dataframe(disaster_df):
    preprocessed_dict = {'Year': [], 'Latitude':[], 'Longitude': [], 'Month': [], 'Day': [], 'target': []} # Starting with a dictionary to hold all values, but will later change to a pandas dataframe
    # Creating a dictionary that stores the latitude and longitude values for each specific place in the dataframe
    print('Preprocessing ... ')
    print(' ')
    place_coords = {}
    for place in disaster_df['Name'].unique():
        lat = disaster_df[disaster_df['Name'] == place]['Latitude'].unique()[0]
        lng = disaster_df[disaster_df['Name'] == place]['Longitude'].unique()[0]
        place_coords[place] = (lat, lng)
        
    # All places with their respective coordinates are now stored in the "place_coords" dictionary
    
    year_start = disaster_df['Year'].unique().min() # Getting the earliest year in the dataframe
    year_end = disaster_df['Year'].unique().max() # Getting the last year in the dataframe
    
    # Now, I'll iterate through all the years in order to assign the targets
    for year in range(year_start, year_end+1):  
        year_df = disaster_df[disaster_df['Year'] == year] # Dataframe for disasters happening in year "year" 
        
        # I'll have to account for all the days of the months in the year, which are usually 30 and 31 except February
        # Assigning the number of days for a specific year in the month of February is dependent on if the year is a leap year 
        # or not, where the number of days will be 29 or 28 respectively.
        
        month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] # List containing number of days for each month of the year accordingly, i.e index 0 or January with 31 days. This is the list of days assuming it is not a leap year
        if year%4 == 0:
            if year%100 != 0:
                month_days[1] = 29
            else:
                if year%400 == 0:
                    month_days[1] = 29
        
        # Now, the "month_days" list's index "1" will remain 28 if it is not a leap year, and be changed to 29 if it is indeed
        # a leap year
        
        # Would also need to iterate through all the places in the dataframe
        for place in place_coords:
            place_df = year_df[year_df['Name'] == place] # DataFrame for observations of only the place "place" 
            month_number = 1 # This is supposed to be January
            
            #Similar, iterating through all months...
            for days in month_days:
                month_df = place_df[place_df['Month'] == month_number] # DataFrame containing observations of only the month "month"
                
                # Iterating through all the days in the month...
                for day in range(1, days+1):
                    preprocessed_dict['Year'].append(year)
                    preprocessed_dict['Latitude'].append(place_coords[place][0])
                    preprocessed_dict['Longitude'].append(place_coords[place][1])
                    preprocessed_dict['Month'].append(month_number)
                    preprocessed_dict['Day'].append(day)
                    # And finally, if the particular date is present in the dataframe, the target is set to 1, and 0 otherwise
                    if place in year_df['Name'].unique() and month_number in place_df['Month'].unique() and day in month_df['Day'].unique():
                        preprocessed_dict['target'].append(1)
                    else:
                        preprocessed_dict['target'].append(0)
                month_number += 1
                
    preprocessed_df = pd.DataFrame(preprocessed_dict) # Transforming to a dataframe
    
    # Things to note: The function doesn't consider nan values, so if there is a nan value in any of the date features it will
    # set the target to 0. Also, the preprocessed dataframe can be very large without care, so maybe sticking to 40, 50 years
    # at most will be desirable. Also helps that for latter years, there's a lot less nan values. But could also edit it to
    # perform a task if there is are nan values present.
    print('Done!')
    return preprocessed_df 

In [3]:
volc = pd.read_csv('data/txt/volerup.txt',delimiter = '\t', quoting = 3, encoding='utf-8')

In [4]:
volc  = volc[['Year','Month','Day','Name','Latitude','Longitude']]

In [5]:
volc.head()

Unnamed: 0,Year,Month,Day,Name,Latitude,Longitude
0,-4360,,,Macauley Island,-30.2,-178.47
1,-4350,,,Kikai,30.78,130.28
2,-4050,,,Masaya,11.984,-86.161
3,-4000,,,Pago,-5.58,150.52
4,-3580,,,Taal,14.002,120.993


In [6]:
volc.shape

(835, 6)

In [7]:
volc.isnull().sum()

Year           0
Month        127
Day          188
Name           0
Latitude       0
Longitude      0
dtype: int64

In [8]:
volc = volc.dropna()

In [9]:
volc.duplicated().any()

False

In [10]:
volc_2000= volc[volc['Year'] >= 2000]
volc_2000.head()

Unnamed: 0,Year,Month,Day,Name,Latitude,Longitude
718,2000,6.0,27.0,Miyake-jima,34.08,139.53
719,2000,7.0,27.0,Semeru,-8.108,112.92
720,2000,8.0,23.0,Arenal,10.463,-84.703
721,2000,11.0,3.0,Kilauea,19.425,-155.292
723,2001,5.0,14.0,Etna,37.734,15.004


In [11]:
volc_2000.reset_index(drop=True, inplace=True)

In [12]:
volc_2000.head()

Unnamed: 0,Year,Month,Day,Name,Latitude,Longitude
0,2000,6.0,27.0,Miyake-jima,34.08,139.53
1,2000,7.0,27.0,Semeru,-8.108,112.92
2,2000,8.0,23.0,Arenal,10.463,-84.703
3,2000,11.0,3.0,Kilauea,19.425,-155.292
4,2001,5.0,14.0,Etna,37.734,15.004


In [13]:
volc_2000.shape

(109, 6)

In [14]:
preprocessed_volc = preprocessing_dataframe(volc_2000)

Preprocessing ... 
 
Done!


In [15]:
preprocessed_volc.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
0,2000,34.08,139.53,1,1,0
1,2000,34.08,139.53,1,2,0
2,2000,34.08,139.53,1,3,0
3,2000,34.08,139.53,1,4,0
4,2000,34.08,139.53,1,5,0


In [16]:
preprocessed_volc.shape

(460260, 6)

In [17]:
preprocessed_volc.head()

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
0,2000,34.08,139.53,1,1,0
1,2000,34.08,139.53,1,2,0
2,2000,34.08,139.53,1,3,0
3,2000,34.08,139.53,1,4,0
4,2000,34.08,139.53,1,5,0


In [20]:
normalized_volc = pd.DataFrame({}) # Dataframe to store normalized x, y, z coordinates

In [21]:
# Changing lat lon values to x, y, z coordinates
def find_x(lat, lon):
    return np.cos(np.deg2rad(lat))*np.cos(np.deg2rad(lon)) # x = cos(lat)*cos(lon)

def find_y(lat, lon):
    return np.cos(np.deg2rad(lat))*np.sin(np.deg2rad(lon)) # y = cos(lat)*sin(lon)

def find_z(lat):
    return np.sin(np.deg2rad(lat)) # z = sin(lat)

In [22]:
normalized_volc['Year'] = preprocessed_volc['Year']
normalized_volc['x'] = preprocessed_volc.apply(lambda x: find_x(x['Latitude'], x['Longitude']), axis=1)
normalized_volc['y'] = preprocessed_volc.apply(lambda x: find_x(x['Latitude'], x['Longitude']), axis=1)
normalized_volc['z'] = preprocessed_volc['Latitude'].apply(find_z)
normalized_volc['Month'] = preprocessed_volc['Month']
normalized_volc['Day'] = preprocessed_volc['Day']
normalized_volc['target'] = preprocessed_volc['target']

In [23]:
normalized_volc.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
0,2000,-0.630092,-0.630092,0.56035,1,1,0
1,2000,-0.630092,-0.630092,0.56035,1,2,0
2,2000,-0.630092,-0.630092,0.56035,1,3,0
3,2000,-0.630092,-0.630092,0.56035,1,4,0
4,2000,-0.630092,-0.630092,0.56035,1,5,0


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [25]:
# Taking out test data before upsampling
x_train, x_test, y_train, y_test = train_test_split(normalized_volc.drop(columns='target'), 
                                                    normalized_volc['target'], 
                                                    test_size = 0.25, 
                                                    random_state = 0)

In [26]:
x_train['target'] = y_train

In [27]:
x_train.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
260770,2011,-0.601583,-0.601583,0.596505,12,12,0
242223,2011,-0.856739,-0.856739,0.332573,2,18,0
49759,2002,-0.871751,-0.871751,-0.488926,3,1,0
14898,2000,-0.785252,-0.785252,-0.617311,9,15,0
329970,2015,-0.856739,-0.856739,0.332573,5,16,0


In [28]:
x_train.shape

(345195, 7)

In [29]:
len(x_train[x_train['target'] == 1])

77

In [30]:
num_zero_targets = len(x_train[x_train['target'] == 0])
num_zero_targets

345118

In [31]:
majority = x_train[x_train['target'] == 0]
minority = x_train[x_train['target'] == 1]

In [32]:
majority.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
260770,2011,-0.601583,-0.601583,0.596505,12,12,0
242223,2011,-0.856739,-0.856739,0.332573,2,18,0
49759,2002,-0.871751,-0.871751,-0.488926,3,1,0
14898,2000,-0.785252,-0.785252,-0.617311,9,15,0
329970,2015,-0.856739,-0.856739,0.332573,5,16,0


In [33]:
minority.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
395791,2018,-0.856739,-0.856739,0.332573,7,16,1
374776,2017,-0.346238,-0.346238,-0.131253,12,18,1
321848,2014,-0.375956,-0.375956,-0.137963,2,13,1
23553,2001,0.763898,0.763898,0.611996,5,14,1
437381,2019,-0.873992,-0.873992,-0.088025,6,26,1


In [34]:
from sklearn.utils import resample

In [35]:
# Upsample minority class
min_upsamp = resample(minority, replace=True, n_samples=num_zero_targets, random_state=1) 

In [36]:
min_upsamp.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
47451,2002,0.21378,0.21378,-0.001361,11,3,1
137882,2006,-0.546067,-0.546067,0.221038,6,7,1
395626,2018,-0.856739,-0.856739,0.332573,2,1,1
247338,2011,-0.546067,-0.546067,0.221038,2,23,1
390666,2017,-0.338021,-0.338021,-0.125333,7,1,1


In [37]:
min_upsamp.shape

(345118, 7)

In [38]:
upsampled_data = pd.concat([majority, min_upsamp])

In [39]:
upsampled_data.shape

(690236, 7)

In [40]:
upsampled_data = upsampled_data.sample(frac=1).reset_index(drop=True) # to shuffle the dataframe

In [41]:
upsampled_data.head()

Unnamed: 0,Year,x,y,z,Month,Day,target
0,2008,0.733764,0.733764,0.653777,10,5,0
1,2011,0.419841,0.419841,0.895944,6,27,0
2,2009,-0.010161,-0.010161,0.248369,5,1,0
3,2006,-0.539846,-0.539846,0.229319,11,30,1
4,2007,-0.406538,-0.406538,-0.140175,5,13,0


In [42]:
# There's 6 features now
x_train = upsampled_data.iloc[:, 0:6]
y_train = upsampled_data.iloc[:, 6]

In [43]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
scaled_x_train = sc.fit_transform(x_train)
scaled_x_test =  sc.transform(x_test)

### Using Naive Bayes

In [45]:
from sklearn.naive_bayes import GaussianNB
classifier= GaussianNB()
classifier.fit(scaled_x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [46]:
y_pred = classifier.predict(scaled_x_test)

In [47]:
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)

In [48]:
cm

array([[63508, 51525],
       [   16,    16]], dtype=int64)

In [49]:
print(accuracy_score(y_pred, y_test))

0.5520705688089341


In [50]:
y_pred.shape

(115065,)

In [51]:
y_test.shape

(115065,)

In [52]:
y_test.sum()

32

In [53]:
y_pred.sum()

51541

In [54]:
check = classification_report(y_test, y_pred)
print(check)

              precision    recall  f1-score   support

           0       1.00      0.55      0.71    115033
           1       0.00      0.50      0.00        32

    accuracy                           0.55    115065
   macro avg       0.50      0.53      0.36    115065
weighted avg       1.00      0.55      0.71    115065



In [55]:
from sklearn.metrics import roc_auc_score

In [56]:
# Predict class probabilities
y_pred_2 = classifier.predict_proba(x_test)
 
# Keep only the positive class
y_pred_2 = [y[1] for y in y_pred_2]

In [57]:
print(roc_auc_score(y_test, y_pred_2))

0.5


### Using Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier
classifier1 = RandomForestClassifier(n_estimators=10, criterion='entropy' , random_state=0)
classifier1.fit(scaled_x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [59]:
y_pred1 = classifier1.predict(scaled_x_test)

In [60]:
cm2 = confusion_matrix(y_test, y_pred1)

In [61]:
cm2

array([[115029,      4],
       [    32,      0]], dtype=int64)

In [62]:
print(accuracy_score(y_pred1, y_test))

0.9996871333594055


In [63]:
y_pred1.sum()

4

In [64]:
y_test.sum()

32

In [66]:
t = []
for i in range(len(y_pred1)):
    if y_pred1[i] == 1:
        t.append(x_test.iloc[i])
t # Dates and places that the model predicted disasters will happen

[Year     2018.000000
 x          -0.792180
 y          -0.792180
 z          -0.609038
 Month      12.000000
 Day         9.000000
 Name: 416012, dtype: float64,
 Year     2017.000000
 x           0.733764
 y           0.733764
 z           0.653777
 Month      10.000000
 Day        12.000000
 Name: 391134, dtype: float64,
 Year     2018.000000
 x          -0.856739
 y          -0.856739
 z           0.332573
 Month       3.000000
 Day         1.000000
 Name: 395654, dtype: float64,
 Year     2011.000000
 x          -0.606276
 y          -0.606276
 z           0.013962
 Month      12.000000
 Day        28.000000
 Name: 254216, dtype: float64]

In [67]:
v = []
p = y_test.to_list()
for i in range(len(p)):
    if p[i] == 1:
        v.append(x_test.iloc[i])
v # Dates and places that disasters actually happened. Not much of a difference

[Year     2017.000000
 x          -0.145722
 y          -0.145722
 z           0.055299
 Month       4.000000
 Day        13.000000
 Name: 383287, dtype: float64,
 Year     2019.000000
 x          -0.145722
 y          -0.145722
 z           0.055299
 Month       5.000000
 Day         7.000000
 Name: 427111, dtype: float64,
 Year     2000.000000
 x          -0.385553
 y          -0.385553
 z          -0.141039
 Month       7.000000
 Day        27.000000
 Name: 574, dtype: float64,
 Year     2013.000000
 x          -0.555191
 y          -0.555191
 z           0.523689
 Month       9.000000
 Day         4.000000
 Name: 299421, dtype: float64,
 Year     2018.000000
 x          -0.601583
 y          -0.601583
 z           0.596505
 Month       1.000000
 Day        23.000000
 Name: 413867, dtype: float64,
 Year     2013.000000
 x          -0.346238
 y          -0.346238
 z          -0.131253
 Month       2.000000
 Day        12.000000
 Name: 286807, dtype: float64,
 Year     2001.000000
 x 

In [68]:
y_train.sum()

345118

In [69]:
# Predict class probabilities
y_pred_3 = classifier1.predict_proba(x_test)
 
# Keep only the positive class
y_pred_3 = [y[1] for y in y_pred_3]

In [71]:
print(roc_auc_score(y_test, y_pred_3))

0.5


In [45]:
# Looks good, but knowing tree based algorithms it probably overfitted.

In [46]:
#https://elitedatascience.com/imbalanced-classes