In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### PREPROCESSING

In [21]:
# The preprocessing function accepts a pandas dataframe with the "Name", "Year", "Month" and "Day" features for when a natural
# disaster occurred, as well as the "Latitude" and "Longitude" coordinates. The function returns a pandas dataframe with the
# date and coordinate features, but for all days within the starting and ending year of the original dataframe. It also has an
# added feature "target", which is 1 if a natural disaster occurred at that place on that day, and 0 otherwise.

def preprocessing_dataframe(disaster_df):
    """Expand a table of disaster events into one labelled row per place per calendar day.

    Parameters
    ----------
    disaster_df : pandas.DataFrame
        One row per recorded event. Must contain the columns "Name", "Year",
        "Month", "Day", "Latitude" and "Longitude".

    Returns
    -------
    pandas.DataFrame
        Columns "Year", "Latitude", "Longitude", "Month", "Day" and "target".
        There is one row for every calendar day between the earliest and the
        latest year in ``disaster_df`` (inclusive), repeated for every distinct
        "Name". Rows are ordered by year, then by place (order of first
        appearance), then by date. "target" is 1 when ``disaster_df`` holds an
        event for that place on that exact date, and 0 otherwise.

    Notes
    -----
    * Each place uses the coordinates of its first row in ``disaster_df``.
    * Rows with a missing "Month" or "Day" can never match a calendar date, so
      they contribute no positive target. Rows with a missing "Name" or "Year"
      are ignored.
    * The result has (number of places) x (number of days in the year range)
      rows, so keep the year range modest (a few decades) to limit memory use.
    """
    # Collected column-wise in plain lists and turned into a DataFrame once at the end.
    preprocessed_dict = {'Year': [], 'Latitude': [], 'Longitude': [], 'Month': [], 'Day': [], 'target': []}
    print('Preprocessing ... ')
    print(' ')

    # Coordinates of every place, taken from the first row in which the place appears.
    first_rows = disaster_df.dropna(subset=['Name']).drop_duplicates(subset='Name')
    place_coords = {
        name: (lat, lng)
        for name, lat, lng in zip(first_rows['Name'], first_rows['Latitude'], first_rows['Longitude'])
    }

    # Every recorded (year, place, month, day), built once so that each per-day check
    # below is a constant-time set lookup instead of repeated DataFrame filtering.
    # Float-typed dates such as 3.0 hash and compare equal to the integer 3.
    event_days = set(zip(disaster_df['Year'], disaster_df['Name'], disaster_df['Month'], disaster_df['Day']))

    known_years = disaster_df['Year'].dropna()
    if known_years.empty:
        year_range = range(0)  # Nothing to expand; an empty frame is returned.
    else:
        # int() lets range() accept a float-typed "Year" column as well.
        year_range = range(int(known_years.min()), int(known_years.max()) + 1)

    for year in year_range:
        # Gregorian rule: every 4th year is a leap year, except centuries not divisible by 400.
        is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
        month_days = [31, 29 if is_leap else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
        calendar_days = [
            (month_number, day)
            for month_number, days in enumerate(month_days, start=1)
            for day in range(1, days + 1)
        ]

        for place, (lat, lng) in place_coords.items():
            for month_number, day in calendar_days:
                preprocessed_dict['Year'].append(year)
                preprocessed_dict['Latitude'].append(lat)
                preprocessed_dict['Longitude'].append(lng)
                preprocessed_dict['Month'].append(month_number)
                preprocessed_dict['Day'].append(day)
                # 1 if this exact date is recorded for this place, 0 otherwise.
                is_event = (year, place, month_number, day) in event_days
                preprocessed_dict['target'].append(1 if is_event else 0)

    preprocessed_df = pd.DataFrame(preprocessed_dict)  # Transforming to a dataframe
    print('Done!')
    return preprocessed_df

In [2]:
# Load the raw tab-separated table. quoting=3 is csv.QUOTE_NONE, so quote
# characters in the text fields are kept as ordinary characters.
earth = pd.read_csv(
    'data/txt/signif.txt',
    sep='\t',
    quoting=3,
    encoding='latin-1',
)

In [3]:
# Preview the first rows to check that the file parsed into the expected columns.
earth.head()

Unnamed: 0,I_D,FLAG_TSUNAMI,YEAR,MONTH,DAY,HOUR,MINUTE,SECOND,FOCAL_DEPTH,EQ_PRIMARY,...,TOTAL_MISSING,TOTAL_MISSING_DESCRIPTION,TOTAL_INJURIES,TOTAL_INJURIES_DESCRIPTION,TOTAL_DAMAGE_MILLIONS_DOLLARS,TOTAL_DAMAGE_DESCRIPTION,TOTAL_HOUSES_DESTROYED,TOTAL_HOUSES_DESTROYED_DESCRIPTION,TOTAL_HOUSES_DAMAGED,TOTAL_HOUSES_DAMAGED_DESCRIPTION
0,1,,-2150,,,,,,,7.3,...,,,,,,,,,,
1,3,,-2000,,,,,,18.0,7.1,...,,,,,,1.0,,1.0,,
2,2,Tsu,-2000,,,,,,,,...,,,,,,,,,,
3,5877,Tsu,-1610,,,,,,,,...,,,,,,3.0,,,,
4,8,,-1566,,,,,,,,...,,,,,,,,,,


In [4]:
# Count the missing values in every column.
earth.isnull().sum()

I_D                                      0
FLAG_TSUNAMI                          4362
YEAR                                     0
MONTH                                  407
DAY                                    561
HOUR                                  2042
MINUTE                                2247
SECOND                                1846
FOCAL_DEPTH                           2965
EQ_PRIMARY                            1791
EQ_MAG_MW                             4873
EQ_MAG_MS                             3268
EQ_MAG_MB                             4393
EQ_MAG_ML                             6012
EQ_MAG_MFA                            6182
EQ_MAG_UNK                            5419
INTENSITY                             3379
COUNTRY                                  0
STATE                                 5875
LOCATION_NAME                            1
LATITUDE                                49
LONGITUDE                               49
REGION_CODE                              1
DEATHS     

In [5]:
# Keep only the date, place and coordinate columns.
earth = earth.loc[
    :,
    ['DAY', 'MONTH', 'YEAR', 'LOCATION_NAME', 'COUNTRY', 'LATITUDE', 'LONGITUDE'],
]

In [6]:
# Rename to the column names that preprocessing_dataframe reads ("Name", "Year", "Month",
# "Day", "Latitude", "Longitude"). The result is rebound rather than renamed in place,
# so the cell does not mutate a frame that was derived from another one.
earth = earth.rename(
    columns={
        'DAY': 'Day',
        'MONTH': 'Month',
        'YEAR': 'Year',
        'LOCATION_NAME': 'Name',
        'COUNTRY': 'Country',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude',
    }
)

In [7]:
# Check the reduced column set and the renamed headers.
earth.head()

Unnamed: 0,Day,Month,Year,Name,Country,Latitude,Longitude
0,,,-2150,"JORDAN: BAB-A-DARAA,AL-KARAK",JORDAN,31.1,35.5
1,,,-2000,TURKMENISTAN: W,TURKMENISTAN,38.0,58.2
2,,,-2000,SYRIA: UGARIT,SYRIA,35.683,35.8
3,,,-1610,GREECE: THERA ISLAND (SANTORINI),GREECE,36.4,25.4
4,,,-1566,ISRAEL: ARIHA (JERICHO),ISRAEL,31.5,35.3


In [9]:
# Restrict the events to the year 2000 onward, then preview the subset.
earth_2000 = earth.loc[earth['Year'].ge(2000)]
earth_2000.head()

Unnamed: 0,Day,Month,Year,Name,Country,Latitude,Longitude
5013,3.0,1.0,2000,INDIA-BANGLADESH BORDER: MAHESHKHALI,INDIA,22.132,92.771
5014,11.0,1.0,2000,CHINA: LIAONING PROVINCE,CHINA,40.498,122.994
5015,14.0,1.0,2000,CHINA: YUNNAN PROVINCE: YAOAN COUNTY,CHINA,25.607,101.063
5016,2.0,2.0,2000,"IRAN: BARDASKAN, KASHMAR",IRAN,35.288,58.218
5017,7.0,2.0,2000,SOUTH AFRICA; SWAZILAND: MBABANE-MANZINI,SOUTH AFRICA,-26.288,30.888


In [12]:
# (rows, columns) of the filtered subset.
earth_2000.shape

(1183, 7)

In [11]:
# Check the subset for missing values: preprocessing_dataframe silently treats a row
# with a missing date as "no event", so the date columns should be complete.
earth_2000.isnull().sum()

Day          0
Month        0
Year         0
Name         0
Country      0
Latitude     0
Longitude    0
dtype: int64

In [14]:
# True if any row is an exact duplicate of an earlier row.
earth_2000.duplicated().any()

False

In [18]:
# Renumber the rows from 0 after filtering. The result is rebound instead of using
# inplace=True, so the filtered slice of `earth` is not mutated in place.
earth_2000 = earth_2000.reset_index(drop=True)

In [19]:
# Confirm the index now starts at 0.
earth_2000.head()

Unnamed: 0,Day,Month,Year,Name,Country,Latitude,Longitude
0,3.0,1.0,2000,INDIA-BANGLADESH BORDER: MAHESHKHALI,INDIA,22.132,92.771
1,11.0,1.0,2000,CHINA: LIAONING PROVINCE,CHINA,40.498,122.994
2,14.0,1.0,2000,CHINA: YUNNAN PROVINCE: YAOAN COUNTY,CHINA,25.607,101.063
3,2.0,2.0,2000,"IRAN: BARDASKAN, KASHMAR",IRAN,35.288,58.218
4,7.0,2.0,2000,SOUTH AFRICA; SWAZILAND: MBABANE-MANZINI,SOUTH AFRICA,-26.288,30.888


In [None]:
# Expand the filtered events into one labelled row per place per calendar day.
# NOTE: the output has (distinct places) x (days in the covered years) rows, so this
# cell can be slow and memory-hungry.
preprocessed_earth = preprocessing_dataframe(earth_2000)

Preprocessing ... 
 
