In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

### PREPROCESSING

In [None]:
# The preprocessing function accepts a pandas dataframe with the "Year", "Month" and "Day" features for when a natural disaster
# occurred, together with the "Name" of the place and its "Latitude" and "Longitude" coordinates. The function returns a pandas
# dataframe with the date and coordinate features for every day between the starting and ending year of the original dataframe,
# for every place. It also has an added feature "target" which is 1 if a natural disaster occurred at that place on that day,
# and 0 otherwise.

def preprocessing_dataframe(disaster_df):
    """Expand a table of disaster events into one labelled row per place per day.

    Parameters
    ----------
    disaster_df : pandas.DataFrame
        One row per recorded disaster, with the columns "Name", "Year",
        "Month", "Day", "Latitude" and "Longitude".

    Returns
    -------
    pandas.DataFrame
        Columns "Year", "Latitude", "Longitude", "Month", "Day" and "target".
        There is one row for every calendar day of every year between the
        earliest and the latest "Year" in ``disaster_df`` (inclusive), repeated
        for every distinct place. "target" is 1 when ``disaster_df`` holds a
        row for that place on that date and 0 otherwise. Rows are ordered by
        year, then place (in order of first appearance), then month, then day.

    Notes
    -----
    * Each place uses the coordinates of its first row in ``disaster_df``.
    * Rows with a missing "Name" are ignored. Rows with a missing "Year",
      "Month" or "Day" never produce a positive target.
    * The result has (number of places) x (number of days in the year span)
      rows, so it grows quickly; restrict the year span before calling this
      on a large table.
    * An input without any usable year or place yields an empty dataframe
      with the columns listed above.
    """
    import calendar  # local import so the function does not depend on a file-level import

    print('Preprocessing ... ')
    print(' ')

    preprocessed_dict = {'Year': [], 'Latitude': [], 'Longitude': [], 'Month': [], 'Day': [], 'target': []}

    # Coordinates of every place, taken from the first row in which the place appears
    named = disaster_df.dropna(subset=['Name'])
    first_rows = named.drop_duplicates(subset='Name')
    place_coords = {
        name: (lat, lng)
        for name, lat, lng in zip(first_rows['Name'], first_rows['Latitude'], first_rows['Longitude'])
    }

    # Every (place, year, month, day) on which a disaster was recorded. Building this set once
    # replaces a dataframe filter plus several .unique() scans for each generated day.
    dated = named.dropna(subset=['Year', 'Month', 'Day'])
    events = set(zip(dated['Name'], dated['Year'], dated['Month'], dated['Day']))

    # Inclusive span of years to generate; int() also accepts a float-typed "Year" column
    years = disaster_df['Year'].dropna()
    if years.empty:
        year_span = range(0)
    else:
        year_span = range(int(years.min()), int(years.max()) + 1)

    for year in year_span:
        # Days per month for this year; only February depends on the leap-year rule
        february = 29 if calendar.isleap(year) else 28
        month_days = [31, february, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

        for place, (lat, lng) in place_coords.items():
            for month_number, days in enumerate(month_days, start=1):
                for day in range(1, days + 1):
                    preprocessed_dict['Year'].append(year)
                    preprocessed_dict['Latitude'].append(lat)
                    preprocessed_dict['Longitude'].append(lng)
                    preprocessed_dict['Month'].append(month_number)
                    preprocessed_dict['Day'].append(day)
                    # 1 when this exact date is present for this place, 0 otherwise
                    preprocessed_dict['target'].append(
                        1 if (place, year, month_number, day) in events else 0
                    )

    preprocessed_df = pd.DataFrame(preprocessed_dict)  # Transforming to a dataframe
    print('Done!')
    return preprocessed_df

In [None]:
# Load the tab-separated tsunami run-up text file
tsu = pd.read_csv(
    '/kaggle/input/tsunami/tsrunup.txt',
    sep='\t',
    quoting=3,  # 3 is csv.QUOTE_NONE: quote characters are read as ordinary text
    encoding='latin-1',
)

In [None]:
# checking the first 5
tsu.head()

In [None]:
# checking for null values
tsu.isnull().sum()

In [None]:
# Keep only the features used for classification
tsu = tsu[[
    'DAY', 'MONTH', 'YEAR',
    'LOCATION_NAME', 'COUNTRY',
    'LATITUDE', 'LONGITUDE',
]]

In [None]:
tsu.head()

In [None]:
# Records from 2000 to the present have the fewest NaN values, so only those years are kept.
# Rather than imputing, the rows that still contain NaN cells are removed afterwards.
# .copy() makes `checking` an independent frame instead of a filtered slice of `tsu`, so the
# in-place operations applied to it later do not trigger pandas' SettingWithCopyWarning.
checking = tsu[tsu['YEAR'] >= 2000].copy()
checking.head()

In [None]:
# shape of new dataframe
checking.shape

In [None]:
# Renumber the rows from 0 after filtering; drop=True discards the old labels
# instead of inserting them as a new column
checking.reset_index(drop=True, inplace=True)

In [None]:
# checking for null cells
checking.isnull().sum()

In [None]:
# Remove every row that has a null value in at least one column
checking = checking.dropna(axis=0, how='any')

In [None]:
# checking for duplicates
checking.duplicated().any()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
checking.drop_duplicates(keep='first', inplace=True)

In [None]:
checking.shape

In [None]:
# Renumber the rows from 0 after removing null and duplicated rows; drop=True discards
# the old labels instead of inserting them as a new column
checking.reset_index(drop=True, inplace=True)

In [None]:
# Distinct location names, in order of first appearance
uniq = pd.unique(checking['LOCATION_NAME'])

In [None]:
# Number of recorded tsunamis per location, most frequent first
see = checking['LOCATION_NAME'].value_counts()
see.head(50)  # Shows the 50 most frequent locations; the full series also holds places where a tsunami occurred only once.

In [None]:
# Analyzing the data to see what the model would train on and whether it is relevant:
# print the rows of every place where 5 or fewer tsunamis were recorded, as candidates for removal.
location_counts = checking['LOCATION_NAME'].value_counts()  # counted once instead of once per place
for uni in uniq:
    if location_counts[uni] <= 5:
        chi = checking[checking['LOCATION_NAME'] == uni]
        print(chi)

In [None]:
# Removing places with fewer than 4 recorded occurrences since 2000.
# NOTE(review): this step was previously described as "5 or less occurrences", but the cut-off
# applied has always been `< 4`; the threshold is kept unchanged here - confirm which was intended.
# All rare rows are selected with one vectorized lookup and dropped in a single call,
# which also no longer requires the index to be exactly 0..len-1.
rare_rows = checking.index[checking['LOCATION_NAME'].map(see) < 4]
checking.drop(rare_rows, axis=0, inplace=True)

In [None]:
checking.shape

In [None]:
# Renumber the rows from 0 after removing the rare locations; drop=True discards
# the old labels instead of inserting them as a new column
checking.reset_index(drop=True, inplace=True)

In [None]:
# Rename the columns to the names that preprocessing_dataframe reads
# ("Name", "Year", "Month", "Day", "Latitude", "Longitude")
checking.rename(
    columns={
        'DAY': 'Day',
        'MONTH': 'Month',
        'YEAR': 'Year',
        'LOCATION_NAME': 'Name',
        'COUNTRY': 'Country',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude',
    },
    inplace=True,
)

In [None]:
checking.head()

In [None]:
#preprocessed_tsu = preprocessing_dataframe(checking)

In [None]:
#preprocessed_tsu.head()

In [None]:
#preprocessed_tsu.to_csv('/kaggle/input/tsunami-clean/tsunami_classification_clean.csv')

In [None]:
# Load the preprocessed dataset from disk instead of regenerating it
# (presumably the saved output of the commented-out preprocessing_dataframe(checking) run - TODO confirm)
preprocessed_tsu = pd.read_csv('/kaggle/input/tsunami-clean/tsunami_classification_clean.csv')

In [None]:
preprocessed_tsu.head()

In [None]:
# Drop the 'Unnamed: 0' column (presumably the index that was written into the CSV when it was saved)
preprocessed_tsu.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
preprocessed_tsu.shape

In [None]:
preprocessed_tsu.head()

In [None]:
len(preprocessed_tsu[preprocessed_tsu['target'] == 1])

In [None]:
len(preprocessed_tsu[preprocessed_tsu['target'] == 0])

In [None]:
# Rows labelled 1 (a disaster was recorded on that day)
minority = preprocessed_tsu.loc[preprocessed_tsu['target'] == 1]

In [None]:
minority.head()

In [None]:
# Rows labelled 0 (no disaster was recorded on that day)
majority = preprocessed_tsu.loc[preprocessed_tsu['target'] == 0]

In [None]:
majority .head()

In [None]:
from sklearn.utils import resample
# Downsample the majority class (target == 0) to 1900 rows.
# replace=False draws without replacement, so no majority row appears twice in the sample;
# duplicated rows could otherwise end up in both the training and the test split.
maj_downsamp = resample(majority, replace=False, n_samples=1900, random_state=1)

In [None]:
# Stack all minority rows on top of the downsampled majority rows
downsampled_data = pd.concat([minority,maj_downsamp])

In [None]:
downsampled_data.head()

In [None]:
# Shuffle the rows so the two classes are interleaved. random_state makes the shuffle
# reproducible, like the seeded resampling, train/test split and models in this notebook.
downsampled_data = downsampled_data.sample(frac=1, random_state=1).reset_index(drop=True)

In [None]:
downsampled_data.head()

In [None]:
downsampled_data.shape

In [None]:
# Select the features and the label by column name instead of by position, so the split
# stays correct even if the column order of the CSV changes.
x = downsampled_data[['Year', 'Latitude', 'Longitude', 'Month', 'Day']]
y = downsampled_data['target']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 25% of the rows for testing; random_state fixes the split so it is reproducible
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 0)

In [None]:
from sklearn.naive_bayes import GaussianNB
# First model: Gaussian Naive Bayes fitted on the training split
classifier= GaussianNB()
classifier.fit(x_train, y_train)

In [None]:
# Predicted class labels of the Naive Bayes model for the test split
y_pred = classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Confusion matrix of the Naive Bayes model (rows: true classes, columns: predicted classes)
cm = confusion_matrix(y_test, y_pred)

In [None]:
cm

In [None]:
# Accuracy of the Naive Bayes model; arguments follow the documented (y_true, y_pred) order,
# matching the confusion_matrix and classification_report calls
print(accuracy_score(y_test, y_pred))

In [None]:
# Per-class precision, recall and F1 report for the Naive Bayes predictions
check = classification_report(y_test, y_pred)
print(check)

In [None]:
from sklearn.metrics import roc_auc_score

# Predict class probabilities and keep only the positive class.
# predict_proba orders its columns like classifier.classes_, so with labels 0 and 1
# column 1 holds the probability of class 1.
y_pred1 = classifier.predict_proba(x_test)[:, 1]

In [None]:
print(roc_auc_score(y_test, y_pred1))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Second model: random forest using the entropy split criterion, fitted on the training split
classifier1 = RandomForestClassifier(n_estimators=10, criterion='entropy' , random_state=0) # NOTE(review): a previous comment said "Increased to 30 estimators", but the forest is built with 10 trees - confirm the intended value
classifier1.fit(x_train, y_train)

In [None]:
# Predicted class labels of the random forest for the test split
y_predR = classifier1.predict(x_test)

In [None]:
# Confusion matrix of the random forest (rows: true classes, columns: predicted classes)
cm_1 = confusion_matrix(y_test, y_predR)


In [None]:
cm_1 

In [None]:
# Accuracy of the random forest; arguments follow the documented (y_true, y_pred) order,
# matching the confusion_matrix and classification_report calls
print(accuracy_score(y_test, y_predR))

In [None]:
# Per-class precision, recall and F1 report for the random forest predictions
check = classification_report(y_test, y_predR)
print(check)

In [None]:
# Predict class probabilities and keep only the positive class.
# predict_proba orders its columns like classifier1.classes_, so with labels 0 and 1
# column 1 holds the probability of class 1.
y_predR2 = classifier1.predict_proba(x_test)[:, 1]

In [None]:
print(roc_auc_score(y_test, y_predR2))


In [None]:
import gzip
import dill

# Serialize the trained random forest with dill into a gzip-compressed file
with gzip.open('natural_disaster_Tsunami_Classification.dill.gz', 'wb') as f:
    dill.dump(classifier1, f, recurse=True)

In [None]:
# Also write an uncompressed dill file of the same random forest model
with open('TsuClass_Model.dill', 'wb') as f:
    dill.dump(classifier1, f, recurse=True)