In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [47]:
# Load the data
df = pd.read_csv("BFP_FIreIncidents2012-2016.csv")

In [48]:
df.head()

Unnamed: 0,PSGC,REGION,PROVINCE_FIRE_DISTRICT,CONGRESSIONAL_DISTRICT,CITY_MUNICIPALITY,YEAR,INCIDENTS,INJURIES,DEATHS,ESTIMATED_DAMAGES
0,12801000,1,ILOCOS NORTE,1ST,ADAMS,2012,6,0,0,0.0
1,12801000,1,ILOCOS NORTE,1ST,ADAMS,2013,0,0,0,0.0
2,12801000,1,ILOCOS NORTE,1ST,ADAMS,2014,0,0,0,0.0
3,12801000,1,ILOCOS NORTE,1ST,ADAMS,2015,0,0,0,0.0
4,12801000,1,ILOCOS NORTE,1ST,ADAMS,2016,0,0,0,0.0


In [49]:
df.tail()

Unnamed: 0,PSGC,REGION,PROVINCE_FIRE_DISTRICT,CONGRESSIONAL_DISTRICT,CITY_MUNICIPALITY,YEAR,INCIDENTS,INJURIES,DEATHS,ESTIMATED_DAMAGES
8165,137404000,NCR,DISTRICT V,LONE,QUEZON CITY,2012,865,59,25,177338100.0
8166,137404000,NCR,DISTRICT V,LONE,QUEZON CITY,2013,889,57,9,155861700.0
8167,137404000,NCR,DISTRICT V,LONE,QUEZON CITY,2014,1026,75,8,91806500.0
8168,137404000,NCR,DISTRICT V,LONE,QUEZON CITY,2015,1023,75,23,53127200.0
8169,137404000,NCR,DISTRICT V,LONE,QUEZON CITY,2016,1246,76,9,123420964.9


# DETECT MISSING VALUES

In [50]:
df.isnull().sum()

PSGC                      0
REGION                    0
PROVINCE_FIRE_DISTRICT    0
CONGRESSIONAL_DISTRICT    0
CITY_MUNICIPALITY         0
YEAR                      0
INCIDENTS                 0
INJURIES                  0
DEATHS                    0
ESTIMATED_DAMAGES         0
dtype: int64

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8170 entries, 0 to 8169
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PSGC                    8170 non-null   int64  
 1   REGION                  8170 non-null   object 
 2   PROVINCE_FIRE_DISTRICT  8170 non-null   object 
 3   CONGRESSIONAL_DISTRICT  8170 non-null   object 
 4   CITY_MUNICIPALITY       8170 non-null   object 
 5   YEAR                    8170 non-null   int64  
 6   INCIDENTS               8170 non-null   int64  
 7   INJURIES                8170 non-null   int64  
 8   DEATHS                  8170 non-null   int64  
 9   ESTIMATED_DAMAGES       8170 non-null   float64
dtypes: float64(1), int64(5), object(4)
memory usage: 638.4+ KB


In [52]:
df.isna()

Unnamed: 0,PSGC,REGION,PROVINCE_FIRE_DISTRICT,CONGRESSIONAL_DISTRICT,CITY_MUNICIPALITY,YEAR,INCIDENTS,INJURIES,DEATHS,ESTIMATED_DAMAGES
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8165,False,False,False,False,False,False,False,False,False,False
8166,False,False,False,False,False,False,False,False,False,False
8167,False,False,False,False,False,False,False,False,False,False
8168,False,False,False,False,False,False,False,False,False,False


# DETECTING OUTLIERS REMOVING OUTLIERS

In [53]:
# Compute the absolute Z-scores for numerical columns in the DataFrame
z_scores = np.abs(zscore(df.select_dtypes(include=[np.number])))

# Print a message indicating that Z-scores are being printed
print("Z-scores:\n")
# Print the computed Z-scores
print(z_scores)

Z-scores:

          PSGC      YEAR  INCIDENTS   INJURIES     DEATHS  ESTIMATED_DAMAGES
0     1.442192  1.414214   0.068139   0.137555   0.120261           0.081480
1     1.442192  0.707107   0.211505   0.137555   0.120261           0.081480
2     1.442192  0.000000   0.211505   0.137555   0.120261           0.081480
3     1.442192  0.707107   0.211505   0.137555   0.120261           0.081480
4     1.442192  1.414214   0.211505   0.137555   0.120261           0.081480
...        ...       ...        ...        ...        ...                ...
8165  1.122297  1.414214  20.457169  16.070094  20.298100           6.178299
8166  1.122297  0.707107  21.030636  15.520682   7.230349           5.420213
8167  1.122297  0.000000  24.304171  20.465388   6.413615           3.159157
8168  1.122297  0.707107  24.232488  20.465388  18.664631           1.793834
8169  1.122297  1.414214  29.560944  20.740094   7.230349           4.275102

[8170 rows x 6 columns]


In [54]:
# Removing outliers
df = df[(z_scores < 3).all(axis=1)]
print("\nData after removing outliers:")
print(df)



Data after removing outliers:
           PSGC REGION PROVINCE_FIRE_DISTRICT CONGRESSIONAL_DISTRICT  \
0      12801000      1           ILOCOS NORTE                    1ST   
1      12801000      1           ILOCOS NORTE                    1ST   
2      12801000      1           ILOCOS NORTE                    1ST   
3      12801000      1           ILOCOS NORTE                    1ST   
4      12801000      1           ILOCOS NORTE                    1ST   
...         ...    ...                    ...                    ...   
8154  137606000    NCR            DISTRICT IV                   LONE   
8156  137405000    NCR            DISTRICT IV                   LONE   
8157  137405000    NCR            DISTRICT IV                   LONE   
8158  137405000    NCR            DISTRICT IV                   LONE   
8159  137405000    NCR            DISTRICT IV                   LONE   

     CITY_MUNICIPALITY  YEAR  INCIDENTS  INJURIES  DEATHS  ESTIMATED_DAMAGES  
0                ADAMS  2

# DETECT DUPLICATES

In [59]:
duplicates = df.duplicated(keep=False)
print(duplicates)

0       False
1       False
2       False
3       False
4       False
        ...  
8154    False
8156    False
8157    False
8158    False
8159    False
Length: 8003, dtype: bool


In [60]:
# Show only duplicate rows
duplicate_rows = df[duplicates]
print("Duplicated rows (DataFrame):")
print(duplicate_rows)

Duplicated rows (DataFrame):
Empty DataFrame
Columns: [PSGC, REGION, PROVINCE_FIRE_DISTRICT, CONGRESSIONAL_DISTRICT, CITY_MUNICIPALITY, YEAR, INCIDENTS, INJURIES, DEATHS, ESTIMATED_DAMAGES]
Index: []


# Save the cleaned data to another file

In [73]:

# Save to CSV
output_csv_file = 'cleaned_data_BFP_FIreIncidents2012-2016.csv'
df.to_csv(output_csv_file, index=False)

print(f"Cleaned data saved to {output_csv_file}")


Cleaned data saved to cleaned_data_BFP_FIreIncidents2012-2016.csv
