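This notebook cleans the raw data in `cal_fire_data.csv` and saves the result as `Housing_Data_Cleaned_03052025.csv`. The cleanup was performed on 03/05/2025.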

In [None]:
import numpy as np
import pandas as pd


In [None]:
# Read the raw CSV into the working frame, then print its column/dtype/non-null summary.
housing = pd.read_csv(filepath_or_buffer='cal_fire_data.csv')
housing.info()


In [None]:
# We start by removing the columns that had unusable or repeated information.
# The result is reassigned (no inplace=True) so the step reads as one expression.

housing = housing.drop(columns=[
    'GLOBALID',
    'OBJECTID',
    'Zip Code',
    '* Street Name',
    '* Street Number',
    'Site Address (parcel)',
    'x',
    'y',
    'Incident Number (e.g. CAAEU 123456)',
    'Battalion',
    'State',
    'APN (parcel)',
])



# We removed the 2 rows where the incident was not Fire, and then the column Hazard Type.
# A boolean mask selects rows by position; dropping by index label would also remove
# every other row sharing a label if the index were ever non-unique.

housing = housing[housing['Hazard Type'] == 'Fire'].drop(columns=['Hazard Type'])


In [None]:
# We removed the rows where Fire Damage was Inaccessible. We created a new column
# (Damage_Level) that copies the damage assessment using numerical values instead.

# Keep every row whose damage label is not 'Inaccessible'. The .copy() gives us an
# independent frame so the column assignment below does not write into a slice.
housing = housing[housing['* Damage'] != 'Inaccessible'].copy()

# .map recodes whole cell values and yields a numeric column; a label that is not
# listed here becomes NaN instead of silently staying as text.
housing['Damage_Level'] = housing['* Damage'].map({
    'No Damage': 0,
    'Affected (1-9%)': 1,
    'Minor (10-25%)': 2,
    'Major (26-50%)': 3,
    'Destroyed (>50%)': 4,
})


In [None]:
# We created a new column with the building's age, measured against REFERENCE_YEAR.
REFERENCE_YEAR = 2025
housing['Age'] = REFERENCE_YEAR - housing['Year Built (parcel)']
# NOTE(review): the filter below was left disabled, so rows are kept whatever their
# 'Year Built (parcel)' value is; presumably it was meant to discard placeholder or
# implausible build years -- confirm before relying on Age.
# housing = housing[housing['Year Built (parcel)'] > 1000]


In [None]:
# The column Incident Start Date includes badly recorded hours, so we cleaned it up to only include a date.
# The date now always appears in MM/DD/YYYY format (month and day zero-padded to two digits).

def cleandate(date):
    """Reduce a 'M/D/YYYY <time>' string to a zero-padded 'MM/DD/YYYY' date.

    Parameters
    ----------
    date : str
        A date, optionally followed by whitespace and a time portion,
        e.g. '1/5/2021 12:00:00 AM'.

    Returns
    -------
    str
        The date part with month and day left-padded to two digits.
        Values that are not strings (e.g. NaN, None) and strings whose first
        token is not made of exactly three '/'-separated parts are returned
        unchanged instead of being turned into a garbled date.
    """
    # Missing cells reach us as NaN/None when this is applied to a column.
    if not isinstance(date, str):
        return date

    # split() tolerates leading whitespace and a missing time portion; slicing
    # up to find(' ') would cut the last character when there is no space.
    tokens = date.split()
    if not tokens:
        return date

    parts = tokens[0].split('/')
    if len(parts) != 3:
        return date

    month, day, year = parts
    return '/'.join((month.zfill(2), day.zfill(2), year))

# Run cleandate on every entry so the column holds date-only strings.
housing['Incident Start Date'] = housing['Incident Start Date'].map(cleandate)


In [None]:
#We make a new column that has the date without the day.

def dayremove(date):
    """Return 'month/year' for a date string such as 'MM/DD/YYYY'.

    The month is everything before the first '/', and the year is the last
    four characters of the string.
    """
    first_slash = date.find('/')
    return '/'.join([date[:first_slash], date[-4:]])

# Derive the month/year label for each incident from its start date.
housing['Incident Month/Year'] = housing['Incident Start Date'].map(dayremove)


In [None]:
# Save the cleaned frame; the index is written as the first (unnamed) CSV column.
housing.to_csv(path_or_buf='Housing_Data_Cleaned_03052025.csv')
