In [171]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [172]:
df = pd.read_excel("Data/2021_all_states.xlsx")
df.head()

Unnamed: 0,date,negeri,code,daerah,kawasan_banjir,latitude,longitude,kedalaman_banjir_max,nilai_hujan_max,tempoh_hujan,tempoh_ulang,day,month,year
0,2/1/2021,Johor,JHR,PONTIAN,Kg. Ulu Pulai,1.4808,103.572,0.4,73.0,3.0,NORMA\nL,2,1,2021
1,3/1/2021,Johor,JHR,PONTIAN,Kg. Sri Gambut,1.6261,103.4498,0.4,19.0,3.0,NORMA\nL,3,1,2021
2,3/1/2021,Johor,JHR,PONTIAN,Pt. Haji Yasin,1.7172,103.3275,0.4,,,,3,1,2021
3,3/1/2021,Johor,JHR,PONTIAN,Pt. Wak Jabir,1.7432,103.2649,0.4,,,,3,1,2021
4,1/1/2021,Johor,JHR,JOHOR BAHRU,Kg. Bukit Pulai,1.4793,103.9365,0.3,337.0,48.0,57,1,1,2021


In [173]:
df.columns

Index(['date', 'negeri', 'code', 'daerah', 'kawasan_banjir', 'latitude',
       'longitude', 'kedalaman_banjir_max', 'nilai_hujan_max', 'tempoh_hujan',
       'tempoh_ulang', 'day', 'month', 'year'],
      dtype='object')

# Check for Null value

In [174]:
# check for empty coordinates in the dataset
print("Number of empty coordinates: ", df['latitude'].isnull().sum())
print("Length of dataframe: ", len(df))
print("Percentage of empty coordinates: ", df['latitude'].isnull().sum()/len(df)*100, "%")

Number of empty coordinates:  569
Length of dataframe:  4955
Percentage of empty coordinates:  11.483350151362261 %


In [175]:
# check for missing values in flood depth column
print("Number of empty flood depth values: ", df['kedalaman_banjir_max'].isnull().sum())
print("Percentage of empty flood depth values: ", df['kedalaman_banjir_max'].isnull().sum()/len(df)*100, "%")

Number of empty flood depth values:  808
Percentage of empty flood depth values:  16.306760847628656 %


In [176]:
# missing value for max rainfall and the rainfall ARI
print("Number of empty cells in max rainfall: ", df['nilai_hujan_max'].isnull().sum())
print("Number of empty cells in rain duration: ", df['tempoh_hujan'].isnull().sum())
print("Number of empty cells in ARI:", df["tempoh_ulang"].isnull().sum())

# print the percentage of missing values in each column
print("Percentage of missing values in max rainfall: ", df['nilai_hujan_max'].isnull().sum()/len(df)*100)
print('Percentage of missing values in rain duration: ', df['tempoh_hujan'].isnull().sum()/len(df)*100)
print("Percentage of missing values in ARI:", df["tempoh_ulang"].isnull().sum()/len(df)*100)

Number of empty cells in max rainfall:  3825
Number of empty cells in rain duration:  3824
Number of empty cells in ARI: 3827
Percentage of missing values in max rainfall:  77.19475277497477
Percentage of missing values in rain duration:  77.17457114026236
Percentage of missing values in ARI: 77.2351160443996


- We will drop the max rainfall, rainfall duration and ARI column due to the tremendous amount of missing values
- For the coordinate and flood depth columns, we will remove the missing values in flood depth columns and impute the corresponding flood depth either with before fill or 0.

In [177]:
df = df.drop(["nilai_hujan_max", "tempoh_hujan", "tempoh_ulang"], axis = 1)
df.head()

Unnamed: 0,date,negeri,code,daerah,kawasan_banjir,latitude,longitude,kedalaman_banjir_max,day,month,year
0,2/1/2021,Johor,JHR,PONTIAN,Kg. Ulu Pulai,1.4808,103.572,0.4,2,1,2021
1,3/1/2021,Johor,JHR,PONTIAN,Kg. Sri Gambut,1.6261,103.4498,0.4,3,1,2021
2,3/1/2021,Johor,JHR,PONTIAN,Pt. Haji Yasin,1.7172,103.3275,0.4,3,1,2021
3,3/1/2021,Johor,JHR,PONTIAN,Pt. Wak Jabir,1.7432,103.2649,0.4,3,1,2021
4,1/1/2021,Johor,JHR,JOHOR BAHRU,Kg. Bukit Pulai,1.4793,103.9365,0.3,1,1,2021


In [178]:
df['kawasan_banjir'] = df['kawasan_banjir'].fillna(value = "Unkonwn Location")
df["kedalaman_banjir_max"] = df["kedalaman_banjir_max"].fillna(method = "bfill")

In [179]:
df.isnull().sum()

date                      0
negeri                    0
code                      0
daerah                    0
kawasan_banjir            0
latitude                569
longitude               569
kedalaman_banjir_max      0
day                       0
month                     0
year                      0
dtype: int64

In [180]:
df = df.dropna(how = "any", axis = 0)
df.head()

Unnamed: 0,date,negeri,code,daerah,kawasan_banjir,latitude,longitude,kedalaman_banjir_max,day,month,year
0,2/1/2021,Johor,JHR,PONTIAN,Kg. Ulu Pulai,1.4808,103.572,0.4,2,1,2021
1,3/1/2021,Johor,JHR,PONTIAN,Kg. Sri Gambut,1.6261,103.4498,0.4,3,1,2021
2,3/1/2021,Johor,JHR,PONTIAN,Pt. Haji Yasin,1.7172,103.3275,0.4,3,1,2021
3,3/1/2021,Johor,JHR,PONTIAN,Pt. Wak Jabir,1.7432,103.2649,0.4,3,1,2021
4,1/1/2021,Johor,JHR,JOHOR BAHRU,Kg. Bukit Pulai,1.4793,103.9365,0.3,1,1,2021


In [181]:
df.isnull().sum()

date                    0
negeri                  0
code                    0
daerah                  0
kawasan_banjir          0
latitude                0
longitude               0
kedalaman_banjir_max    0
day                     0
month                   0
year                    0
dtype: int64

In [182]:
df.shape

(4386, 11)

In [183]:
df["kedalaman_banjir_max"].astype("float64")
df["latitude"].astype("float64")
df["longitude"].astype("float64")

0       103.5720
1       103.4498
2       103.3275
3       103.2649
4       103.9365
          ...   
4950    103.3961
4951    102.9770
4952    102.9943
4953    103.1735
4954    103.1077
Name: longitude, Length: 4386, dtype: float64

In [184]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4386 entries, 0 to 4954
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  4386 non-null   object 
 1   negeri                4386 non-null   object 
 2   code                  4386 non-null   object 
 3   daerah                4386 non-null   object 
 4   kawasan_banjir        4386 non-null   object 
 5   latitude              4386 non-null   float64
 6   longitude             4386 non-null   float64
 7   kedalaman_banjir_max  4386 non-null   float64
 8   day                   4386 non-null   int64  
 9   month                 4386 non-null   int64  
 10  year                  4386 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 411.2+ KB


In [185]:
df.to_csv("Cleaned/2021_all_states_cleaned.csv", index=False)