In this notebook we cleaned up the data on 03/05/2025.

In [129]:
import numpy as np
import pandas as pd


In [130]:
# Read the raw CSV into a DataFrame and print its column/dtype summary.
# NOTE(review): the saved cell output echoes this read_csv line, which suggests
# pandas emitted a warning here (possibly a mixed-dtype DtypeWarning) — confirm,
# and consider passing explicit dtype= if so.
raw_csv = 'cal_fire_data.csv'
housing = pd.read_csv(raw_csv)
housing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130722 entries, 0 to 130721
Data columns (total 46 columns):
 #   Column                                                        Non-Null Count   Dtype  
---  ------                                                        --------------   -----  
 0   OBJECTID                                                      130722 non-null  int64  
 1   * Damage                                                      130722 non-null  object 
 2   * Street Number                                               126302 non-null  float64
 3   * Street Name                                                 125236 non-null  object 
 4   * Street Type (e.g. road, drive, lane, etc.)                  116260 non-null  object 
 5   Street Suffix (e.g. apt. 23, blding C)                        62017 non-null   object 
 6   * City                                                        98991 non-null   object 
 7   State                                                   

  housing = pd.read_csv('cal_fire_data.csv')


In [131]:
# We start by removing the columns that had unusable or repeated information.
# NOTE(review): the '&gt;' in the last 'Distance - ...' name is kept verbatim;
# presumably it matches the header as it appears in the CSV — verify.
unused_columns = [
    'GLOBALID',
    'OBJECTID',
    'Zip Code',
    '* Street Name',
    '* Street Number',
    'Site Address (parcel)',
    'x',
    'y',
    'Incident Number (e.g. CAAEU 123456)',
    'Battalion',
    'State',
    'APN (parcel)',
    'If Affected 1-9% - Where did fire start?',
    'If Affected 1-9% - What started fire?',
    'Structure Category',
    '* Street Type (e.g. road, drive, lane, etc.)',
    'Street Suffix (e.g. apt. 23, blding C)',
    '* City',
    '* CAL FIRE Unit',
    'County',
    'Community',
    '* Incident Name',
    'Structure Defense Actions Taken',
    '# Units in Structure (if multi unit)',
    '# of Damaged Outbuildings < 120 SQFT',
    '# of Non Damaged Outbuildings < 120 SQFT',
    'Distance - Propane Tank to Structure',
    'Distance - Residence to Utility/Misc Structure &gt; 120 SQFT',
    'Fire Name (Secondary)',
]
housing = housing.drop(columns=unused_columns)



# We removed the 2 rows where the incident was not Fire, and then the column
# Hazard Type (every remaining row has the same value, so it carries no information).
non_fire_rows = housing.index[housing['Hazard Type'] != 'Fire']
housing = housing.drop(index=non_fire_rows).drop(columns=['Hazard Type'])


In [132]:
# We cleaned Structure Type values: merge the misspelled 'Famliy' label into
# the correctly spelled category.
misspelled_label = 'Single Famliy Residence Single Story'
corrected_label = 'Single Family Residence Single Story'
housing['* Structure Type'] = housing['* Structure Type'].replace(misspelled_label, corrected_label)


In [133]:
# We created a new column with the building's age (2025 minus the year built)
# and removed the original year-built column.
housing = (
    housing
    .assign(Age=lambda frame: 2025 - frame['Year Built (parcel)'])
    .drop(columns=['Year Built (parcel)'])
)


In [134]:
# The column Incident Start Date includes badly recorded hours, so we cleaned it up to only include a date.
# The date now always appears in MM/DD/YYYY format (zero-padded month and day).

def cleandate(date):
    """Normalise a raw incident timestamp to a zero-padded 'MM/DD/YYYY' string.

    Parameters
    ----------
    date : str
        A date written as 'M/D/YYYY', optionally followed by a space and a
        time component (e.g. '1/2/2018 12:00:00 AM'). The time is discarded.

    Returns
    -------
    str
        The date as 'MM/DD/YYYY'.
    """
    # Keep only the text before the first space. partition() returns the whole
    # string when there is no space, so a value without a time component no
    # longer loses its final character (str.find would return -1 there).
    date_part, _, _ = date.strip().partition(' ')
    # Month is everything before the first slash; year is everything after
    # the last slash; the day sits in between.
    month, _, remainder = date_part.partition('/')
    day, _, year = remainder.rpartition('/')
    return month.zfill(2) + '/' + day.zfill(2) + '/' + year

# Replace each raw timestamp with its date-only form, element by element.
raw_start_dates = housing['Incident Start Date']
housing['Incident Start Date'] = raw_start_dates.map(cleandate)


In [135]:
#We make a new column that has the date without the day.

def dayremove(date):
    """Return 'month/year' built from the text before the first '/' and the last four characters of `date`."""
    first_slash = date.find('/')
    return f"{date[:first_slash]}/{date[-4:]}"

# Derive the month/year column from the cleaned date, then discard the full date.
month_year = housing['Incident Start Date'].map(dayremove)
housing = (
    housing
    .assign(**{'Incident Month/Year': month_year})
    .drop(columns=['Incident Start Date'])
)


In [136]:
# We remove rows where the incident occurred before 2018, since incidents without damage weren't reported before then and it could bias the data
def getyear(date, digits=2):
    """Return the trailing `digits` characters of `date` as an integer.

    Parameters
    ----------
    date : str
        A date string that ends with the year (e.g. '03/2018').
    digits : int, default 2
        How many trailing characters to parse. The default of 2 gives the
        two-digit year; pass 4 for the full year.

    Returns
    -------
    int
        The parsed year (e.g. 18 with the default, 2018 with digits=4).
    """
    return int(date[-digits:])

# Compare full four-digit years: a two-digit comparison would treat 1999 as
# 99 and wrongly keep it as "after 2017".
incident_year = housing['Incident Month/Year'].apply(getyear, digits=4)
housing = housing[incident_year > 2017]
# The month/year column was only needed for this filter.
housing = housing.drop(columns=['Incident Month/Year'])


In [None]:
# We removed the rows where Fire Damage was Inaccessible. We created a new column,
# Destroyed, that encodes the assessed damage numerically: 1 for 'Destroyed (>50%)',
# 0 for every lesser damage level.
destroyed_by_label = {
    'No Damage': 0,
    'Affected (1-9%)': 0,
    'Minor (10-25%)': 0,
    'Major (26-50%)': 0,
    'Destroyed (>50%)': 1,
}
# map() builds a numeric column directly. A str -> int dict passed to
# replace() only becomes numeric through pandas' silent downcasting of object
# columns, which is deprecated. Labels missing from the dict become NaN
# instead of being left behind as strings.
housing = (
    housing
    .loc[housing['* Damage'] != 'Inaccessible']
    .assign(Destroyed=lambda frame: frame['* Damage'].map(destroyed_by_label))
    .drop(columns=['* Damage'])
)


  housing['Damage_Level'] = housing['* Damage'].replace({'No Damage': 0, 'Affected (1-9%)': 0, 'Minor (10-25%)': 0, 'Major (26-50%)': 0, 'Destroyed (>50%)': 1})


In [138]:
# We imputed missing feature values

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

def missingyear(age):
    """Return NaN when `age` exceeds 1000, otherwise return `age` unchanged."""
    return np.nan if age > 1000.0 else age

# Ages above 1000 become NaN so the imputer treats them as missing.
housing['Age'] = housing['Age'].map(missingyear)

# Integer-encode every object column so the frame is fully numeric for the
# imputer, keeping one fitted encoder per column (same order as `cols`).
# NOTE(review): how missing values inside object columns are encoded depends on
# LabelEncoder's behaviour — confirm they are handled as intended.
cols = housing.select_dtypes('object').columns
les = []
for col in cols:
    encoder = LabelEncoder()
    housing[col] = encoder.fit_transform(housing[col])
    les.append(encoder)

# Fill the remaining NaNs. The result is rebuilt as a DataFrame with the same
# column names (fit_transform returns a plain array).
# NOTE(review): the imputer is fitted on every column present in `housing`,
# which presumably includes the 'Destroyed' target — confirm this is intended.
impute = IterativeImputer(max_iter=10, random_state=216)
imputed_values = impute.fit_transform(housing)
housing = pd.DataFrame(imputed_values, columns=housing.columns)

# Turn the integer codes back into the original labels, column by column.
for col, encoder in zip(cols, les):
    housing[col] = encoder.inverse_transform(housing[col].astype('int'))


In [139]:
# Write the cleaned frame to disk. No index argument is passed, so pandas'
# default applies and the row index is written as the first, unnamed column.
cleaned_csv = 'Housing_Data_Cleaned_03052025.csv'
housing.to_csv(cleaned_csv)
