<a href="https://colab.research.google.com/github/kene111/AI6_ass/blob/master/Tsunami_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


### PREPROCESSING

In [2]:
# The preprocessing function accepts a pandas dataframe with the "Year", "Month" and "Day" features for when a natural disaster
# occured as well as the "Longitude" and "latitude" coordinates. The function returns a pandas dataframe with the mentioned 
# features but for all days within the starting and ending year of the original dataframe. It also has an added feature 
# "target" which is a 1 if a natural disaster occured on that day, and a 0 otherwise.

def preprocessing_dataframe(disaster_df):
    preprocessed_dict = {'Year': [], 'Latitude':[], 'Longitude': [], 'Month': [], 'Day': [], 'target': []} # Starting with a dictionary to hold all values, but will later change to a pandas dataframe
    # Creating a dictionary that stores the latitude and longitude values for each specific place in the dataframe
    print('Preprocessing ... ')
    print(' ')
    place_coords = {}
    for place in disaster_df['Name'].unique():
        lat = disaster_df[disaster_df['Name'] == place]['Latitude'].unique()[0]
        lng = disaster_df[disaster_df['Name'] == place]['Longitude'].unique()[0]
        place_coords[place] = (lat, lng)
        
    # All places with their respective coordinates are now stored in the "place_coords" dictionary
    
    year_start = disaster_df['Year'].unique().min() # Getting the earliest year in the dataframe
    year_end = disaster_df['Year'].unique().max() # Getting the last year in the dataframe
    
    # Now, I'll iterate through all the years in order to assign the targets
    for year in range(year_start, year_end+1):  
        year_df = disaster_df[disaster_df['Year'] == year] # Dataframe for disasters happening in year "year" 
        
        # I'll have to account for all the days of the months in the year, which are usually 30 and 31 except February
        # Assigning the number of days for a specific year in the month of February is dependent on if the year is a leap year 
        # or not, where the number of days will be 29 or 28 respectively.
        
        month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] # List containing number of days for each month of the year accordingly, i.e index 0 or January with 31 days. This is the list of days assuming it is not a leap year
        if year%4 == 0:
            if year%100 != 0:
                month_days[1] = 29
            else:
                if year%400 == 0:
                    month_days[1] = 29
        
        # Now, the "month_days" list's index "1" will remain 28 if it is not a leap year, and be changed to 29 if it is indeed
        # a leap year
        
        # Would also need to iterate through all the places in the dataframe
        for place in place_coords:
            place_df = year_df[year_df['Name'] == place] # DataFrame for observations of only the place "place" 
            month_number = 1 # This is supposed to be January
            
            #Similar, iterating through all months...
            for days in month_days:
                month_df = place_df[place_df['Month'] == month_number] # DataFrame containing observations of only the month "month"
                
                # Iterating through all the days in the month...
                for day in range(1, days+1):
                    preprocessed_dict['Year'].append(year)
                    preprocessed_dict['Latitude'].append(place_coords[place][0])
                    preprocessed_dict['Longitude'].append(place_coords[place][1])
                    preprocessed_dict['Month'].append(month_number)
                    preprocessed_dict['Day'].append(day)
                    # And finally, if the particular date is present in the dataframe, the target is set to 1, and 0 otherwise
                    if place in year_df['Name'].unique() and month_number in place_df['Month'].unique() and day in month_df['Day'].unique():
                        preprocessed_dict['target'].append(1)
                    else:
                        preprocessed_dict['target'].append(0)
                month_number += 1
                
    preprocessed_df = pd.DataFrame(preprocessed_dict) # Transforming to a dataframe
    
    # Things to note: The function doesn't consider nan values, so if there is a nan value in any of the date features it will
    # set the target to 0. Also, the preprocessed dataframe can be very large without care, so maybe sticking to 40, 50 years
    # at most will be desirable. Also helps that for latter years, there's a lot less nan values. But could also edit it to
    # perform a task if there is are nan values present.
    print('Done!')
    return preprocessed_df 

In [3]:
tsu = pd.read_csv('/content/tsrunup.txt',delimiter = '\t', quoting = 3, encoding='latin-1')

In [4]:
tsu.head()

Unnamed: 0,I_D,TSEVENT_ID,YEAR,MONTH,DAY,HOUR,MINUTE,SECOND,DOUBTFUL,COUNTRY,STATE,LOCATION_NAME,LATITUDE,LONGITUDE,REGION_CODE,DISTANCE_FROM_SOURCE,ARR_DAY,ARR_HOUR,ARR_MIN,TRAVEL_TIME_HOURS,TRAVEL_TIME_MINUTES,WATER_HT,HORIZONTAL_INUNDATION,TYPE_MEASUREMENT_ID,PERIOD,FIRST_MOTION,DEATHS,DEATHS_DESCRIPTION,INJURIES,INJURIES_DESCRIPTION,DAMAGE_MILLIONS_DOLLARS,DAMAGE_DESCRIPTION,HOUSES_DAMAGED,HOUSES_DAMAGED_DESCRIPTION,HOUSES_DESTROYED,HOUSES_DESTROYED_DESCRIPTION
0,11014,1,-2000,,,,,,,SYRIA,,UGARIT,35.583,35.75,50,12.0,,,,,,,,1.0,,,,3.0,,,,4.0,,,,
1,17601,3,-1610,,,,,,,SYRIA,,UGARIT,35.583,35.75,50,935.0,,,,,,,,1.0,,,,3.0,,,,3.0,,,,
2,1,3,-1610,,,,,,,GREECE,,N. AND E. COAST CRETE,35.5,25.0,50,106.0,,,,,,,,1.0,,,,3.0,,,,3.0,,,,
3,2,9,-479,,,,,,,GREECE,,"POTIDAEA, MACEDONIA",40.3,23.33,50,67.0,,,,,,,,1.0,,,,3.0,,,,,,,,
4,7,10,-426,6.0,,,,,,GREECE,,OPOUS,38.633,23.0833,50,45.0,,,,,,,800.0,1.0,,,,,,,,,,,,


In [5]:
tsu.isnull().sum()

I_D                                 0
TSEVENT_ID                          0
YEAR                                0
MONTH                             112
DAY                               229
HOUR                             1867
MINUTE                           2096
SECOND                           4582
DOUBTFUL                        27626
COUNTRY                             2
STATE                            5711
LOCATION_NAME                       5
LATITUDE                          647
LONGITUDE                         647
REGION_CODE                         0
DISTANCE_FROM_SOURCE              966
ARR_DAY                         24467
ARR_HOUR                        24517
ARR_MIN                         24586
TRAVEL_TIME_HOURS               24756
TRAVEL_TIME_MINUTES             24763
WATER_HT                         4148
HORIZONTAL_INUNDATION           18948
TYPE_MEASUREMENT_ID               234
PERIOD                          25638
FIRST_MOTION                    26532
DEATHS      

In [6]:
tsu  = tsu[['DAY','MONTH','YEAR', 'LOCATION_NAME','COUNTRY','LATITUDE','LONGITUDE']]

In [7]:
tsu.head()

Unnamed: 0,DAY,MONTH,YEAR,LOCATION_NAME,COUNTRY,LATITUDE,LONGITUDE
0,,,-2000,UGARIT,SYRIA,35.583,35.75
1,,,-1610,UGARIT,SYRIA,35.583,35.75
2,,,-1610,N. AND E. COAST CRETE,GREECE,35.5,25.0
3,,,-479,"POTIDAEA, MACEDONIA",GREECE,40.3,23.33
4,,6.0,-426,OPOUS,GREECE,38.633,23.0833


In [8]:
tsu_2000= tsu[tsu['YEAR'] >= 2000]
tsu_2000.head()

Unnamed: 0,DAY,MONTH,YEAR,LOCATION_NAME,COUNTRY,LATITUDE,LONGITUDE
14030,26.0,1.0,2000,MATATA,PHILIPPINES,5.13,120.33
14031,26.0,1.0,2000,SAPA-SAPA ISLAND,PHILIPPINES,5.1,120.27
14032,26.0,1.0,2000,TAWI TAWI,PHILIPPINES,4.76,119.41
14033,26.0,1.0,2000,SIMUNOL,PHILIPPINES,4.55,119.82
14034,5.0,4.0,2000,"HERAKLION (IRAKLION), CRETE",GREECE,35.339,25.18


In [9]:
tsu_2000.shape

(14093, 7)

In [10]:
tsu_2000.isnull().sum()

DAY               0
MONTH             0
YEAR              0
LOCATION_NAME     1
COUNTRY           0
LATITUDE         61
LONGITUDE        61
dtype: int64

In [11]:
tsu_2000 = tsu_2000.dropna()

In [12]:
tsu_2000.duplicated().any()

True

In [13]:
tsu_2000.drop_duplicates(inplace=True)

In [14]:
tsu_2000.reset_index(drop=True,inplace=True)

In [15]:
tsu_2000.shape

(13545, 7)

In [16]:
tsu_2000.rename(columns={'DAY':'Day',
                          'MONTH':'Month',
                          'YEAR':'Year',
                        'LOCATION_NAME':'Name',
                        'COUNTRY':'Country',
                         'LATITUDE':'Latitude',
                         'LONGITUDE':'Longitude'}, 
                 inplace=True)

In [17]:
tsu_2000.head()

Unnamed: 0,Day,Month,Year,Name,Country,Latitude,Longitude
0,26.0,1.0,2000,MATATA,PHILIPPINES,5.13,120.33
1,26.0,1.0,2000,SAPA-SAPA ISLAND,PHILIPPINES,5.1,120.27
2,26.0,1.0,2000,TAWI TAWI,PHILIPPINES,4.76,119.41
3,26.0,1.0,2000,SIMUNOL,PHILIPPINES,4.55,119.82
4,5.0,4.0,2000,"HERAKLION (IRAKLION), CRETE",GREECE,35.339,25.18


In [18]:
preprocessed_tsu = preprocessing_dataframe(tsu_2000)

Preprocessing ... 
 
Done!


In [41]:
preprocessed_tsu[preprocessed_tsu['target'] == 1]

Unnamed: 0,Year,Latitude,Longitude,Month,Day,target
25,2000,5.13000,120.33000,1,26,1
391,2000,5.10000,120.27000,1,26,1
757,2000,4.76000,119.41000,1,26,1
1123,2000,4.55000,119.82000,1,26,1
1559,2000,35.33900,25.18000,4,5,1
...,...,...,...,...,...,...
28795110,2020,18.50460,-68.37550,1,7,1
28795476,2020,18.31820,-64.72420,1,7,1
28795842,2020,18.09386,-65.47135,1,7,1
28796324,2020,35.00000,25.74000,5,2,1


In [43]:
len(preprocessed_tsu[preprocessed_tsu['target'] == 1])

5890

In [44]:
len(preprocessed_tsu[preprocessed_tsu['target'] == 0])

28791044

In [21]:
# ------------ --------- ------ ------------- ------- -------- -------- --------- ----------- -------- ------
preprocessed_tsu.shape


(28796934, 6)

In [25]:
x = preprocessed_tsu.iloc[:,0:5].values

In [26]:
y = preprocessed_tsu.iloc[:, 5].values

In [27]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 0)

In [28]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test =  sc.transform(x_test)

In [29]:
from sklearn.naive_bayes import GaussianNB
classifier= GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [30]:
y_pred = classifier.predict(x_test)

In [31]:
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)

In [32]:
cm

array([[7197790,       0],
       [   1444,       0]])

In [37]:
analysis = pd.DataFrame(y_pred, columns=['predicted'])

In [38]:
analysis.head()

Unnamed: 0,predicted
0,0
1,0
2,0
3,0
4,0


In [40]:
analysis[analysis['predicted'] == 1]

Unnamed: 0,predicted
