In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import time
import requests
from sklearn.preprocessing import LabelEncoder
from scipy import stats

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('./fires_in_ph.csv')

  df = pd.read_csv('./fires_in_ph.csv')


-4 indicates the value was anonymized because the property had 10 or fewer residents.

In [3]:
df = df.replace(-4, np.nan)

In [4]:
df = df[
    (df.std_st == 'PA') |
    (df.std_st == 'MA') |
    (df.std_st == 'NY') |
    (df.std_st == 'IL')
]

The majority of many columns have a -4 value, which indicates the information was hidden for privacy reasons, which occurs when the number of residents is fewer than 10.

In [5]:
pd.set_option('display.max_rows', 200)
(df.isna().sum() / len(df)).sort_values(ascending=False)

dpvact                        1.000000
urb_out                       1.000000
dpvrc                         1.000000
dpv                           1.000000
place_inc2kx                  1.000000
dpvnost                       1.000000
zip_class                     0.995566
c1pdrc                        0.964201
c1pgrc                        0.951979
c1pzrc                        0.951943
apt_type                      0.931033
apt_no                        0.927969
necta_nm                      0.899344
c1psrc                        0.883950
stm2kx                        0.879011
annl_expns_amnt_prev_yr       0.865527
pct_utility_allow             0.865455
pct_disabled_ge62             0.865455
pct_disabled_all              0.865455
pct_lt24_head                 0.865455
pct_age25_50                  0.865455
pct_age51_61                  0.865455
pct_age62plus                 0.865455
pct_age85plus                 0.865455
pct_minority                  0.865455
pct_black                

In [7]:
columns_to_drop = [
    'x', 'y', 'objectid', 'participant_code', 'std_addr', 'apt_no', 'rc2kx', 'stm2kx',
    'dev_buil_nu_entrance', 'ha_phn_num', 'ha_fax_num', 'ha_email_addr_text',
    'exec_dir_phone', 'exec_dir_fax', 'exec_dir_email',
    'lat', 'lon', 'c1pgrc', 'c1pprb', 'c1pdrc', 'c1psrc', 'c1parc', 'c1pzrc',
    'project_name', 'building_name', 'building_number',
    'zcta2kx', 'dpbc', 'dpbc_cksum', 'std_zip11',
    'national_bldg_id', 'curcnty_nm', 'curcnty', 'curcosub', 'curcosub_nm',
    'place2kx', 'place_nm2kx', 'msa', 'cbsa', 'necta', 'necta_nm', 'last_updt_dttm', 
    'development_code',
    'state2kx', 'cnty_nm2kx','cnty2kx','tract2kx','bg2kx','block2kx',
    'fcd_fips91','hlc', 'std_city',
    'county_level'	,'place_level',	'tract_level',	'blkgrp_level',
    'place_cc2kx', 'place_inc2kx', 'formal_participant_name',
    'cbsa_nm', 'urb_out']

df.drop(columns=columns_to_drop, inplace=True)

In [8]:
df['construction_year'] = pd.to_datetime(df['construct_date']).dt.year
df['dofa_actual_year'] = pd.to_datetime(df['dofa_actual_dt']).dt.year
df = df.drop(['construct_date', 'dofa_actual_dt'], axis=1)

In [9]:
df.sample()

Unnamed: 0,building_type_code,building_status_type_code,total_dwelling_units,acc_units,total_occupied,regular_vacant,total_units,pct_occupied,number_reported,pct_reported,months_since_report,pct_movein,people_per_unit,people_total,rent_per_month,spending_per_month,spending_per_month_prev_yr,hh_income,person_income,pct_lt5k,pct_5k_lt10k,pct_10k_lt15k,pct_15k_lt20k,pct_ge20k,pct_wage_major,pct_welfare_major,pct_other_major,pct_median,pct_lt50_median,pct_lt30_median,pct_2adults,pct_1adult,pct_female_head,pct_female_head_child,pct_disabled_lt62,pct_disabled_ge62,pct_disabled_all,pct_lt24_head,pct_age25_50,pct_age51_61,pct_age62plus,pct_age85plus,pct_minority,pct_black,pct_native_american,pct_asian,pct_hispanic,months_waiting,months_from_movein,pct_utility_allow,ave_util_allow,pct_bed1,pct_bed2,pct_bed3,pct_overhoused,tminority,tpoverty,tpct_ownsfd,chldrn_mbr_cnt,eldly_prcnt,pct_disabled_lt62_all,pct_lt80_median,median_inc_amnt,dpvact,dpvnost,msa_nm,metro,micro,dpv,dpvrc,std_st,std_zip5,std_zip9,zip_class,addr_type,apt_type,msgusps,lvl2kx,ur,msg2kx,rad_chap_indr,rad_type,annl_expns_amnt,annl_expns_amnt_prev_yr,pha_total_units,building_fires_11_19,building_fires_20_21,construction_year,dofa_actual_year
15258,SF,INAPCP,1,1,1,0,1,100.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,,,IL,62053.0,1000.0,,S,,90% Overall probable correctness,R,R,** Street-Level Rooftop **,N,,,,43,0,0,1954.0,1954.0


In [10]:
# Assuming your dataframe is named df
placeholder = "MISSING_VALUE"  # or any other string you'd like to use as placeholder

# List of columns to be one-hot encoded
columns_to_encode = [
    'std_st', 'building_type_code', 
    'building_status_type_code', 
    'addr_type', 'apt_type', 'zip_class', 'lvl2kx', 'ur', 'msg2kx', 
    'rad_chap_indr', 'rad_type'
]

# Fill NaN values with placeholder
for column in columns_to_encode:
    df[column] = df[column].fillna(placeholder)

# One-hot encode the columns
for column in columns_to_encode:
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df.drop([column], axis=1, inplace=True)

In [11]:
from sklearn.preprocessing import LabelEncoder

df.msgusps = LabelEncoder().fit_transform(df.msgusps)
df.msa_nm = LabelEncoder().fit_transform(df.msa_nm)

In [12]:
df.sample()

Unnamed: 0,total_dwelling_units,acc_units,total_occupied,regular_vacant,total_units,pct_occupied,number_reported,pct_reported,months_since_report,pct_movein,people_per_unit,people_total,rent_per_month,spending_per_month,spending_per_month_prev_yr,hh_income,person_income,pct_lt5k,pct_5k_lt10k,pct_10k_lt15k,pct_15k_lt20k,pct_ge20k,pct_wage_major,pct_welfare_major,pct_other_major,pct_median,pct_lt50_median,pct_lt30_median,pct_2adults,pct_1adult,pct_female_head,pct_female_head_child,pct_disabled_lt62,pct_disabled_ge62,pct_disabled_all,pct_lt24_head,pct_age25_50,pct_age51_61,pct_age62plus,pct_age85plus,pct_minority,pct_black,pct_native_american,pct_asian,pct_hispanic,months_waiting,months_from_movein,pct_utility_allow,ave_util_allow,pct_bed1,pct_bed2,pct_bed3,pct_overhoused,tminority,tpoverty,tpct_ownsfd,chldrn_mbr_cnt,eldly_prcnt,pct_disabled_lt62_all,pct_lt80_median,median_inc_amnt,dpvact,dpvnost,msa_nm,metro,micro,dpv,dpvrc,std_zip5,std_zip9,msgusps,annl_expns_amnt,annl_expns_amnt_prev_yr,pha_total_units,building_fires_11_19,building_fires_20_21,construction_year,dofa_actual_year,std_st_IL,std_st_MA,std_st_NY,std_st_PA,building_type_code_ES,building_type_code_RW,building_type_code_SD,building_type_code_SF,building_type_code_WU,building_type_code_sf,building_status_type_code_DDAPRD,building_status_type_code_DDDRFT,building_status_type_code_DDPROP,building_status_type_code_INAPCP,building_status_type_code_RMI,building_status_type_code_RMIPRP,addr_type_F,addr_type_G,addr_type_H,addr_type_MISSING_VALUE,addr_type_R,addr_type_S,apt_type_#,apt_type_Apt,apt_type_MISSING_VALUE,apt_type_Ste,apt_type_Unit,zip_class_MISSING_VALUE,zip_class_P,lvl2kx_4,lvl2kx_B,lvl2kx_R,lvl2kx_T,ur_MISSING_VALUE,ur_R,ur_U,msg2kx_** Centroid Lat/Long **,msg2kx_** Street-Level Rooftop **,rad_chap_indr_N,rad_chap_indr_Y,rad_type_,rad_type_PBRA,rad_type_PBV
105350,1,1,0,1,1,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,1.0,0.0,,,19121.0,1422.0,7,,,12799,0,0,1969.0,1969.0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0


In [13]:
correlations = df.corrwith(df['building_fires_11_19'], method='spearman').drop('building_fires_11_19')

n = len(df)

t_stats = correlations * np.sqrt((n-2) / (1-correlations**2))
p_values = 2*(1 - stats.t.cdf(np.abs(t_stats), df=n-2))  # two-tailed

results_df = pd.DataFrame({
    'Correlation': correlations,
    'p-value': p_values
})

significant_results = results_df[results_df['p-value'] < 0.05]

significant_results.sort_values('Correlation')

Unnamed: 0,Correlation,p-value
people_per_unit,-0.316708,0.0
months_from_movein,-0.315583,0.0
chldrn_mbr_cnt,-0.312164,0.0
pct_overhoused,-0.309704,0.0
pct_female_head,-0.309231,0.0
rent_per_month,-0.299415,0.0
pct_female_head_child,-0.290369,0.0
pct_1adult,-0.289612,0.0
hh_income,-0.287914,0.0
pct_ge20k,-0.285671,0.0


In [14]:
df.shape

(27738, 121)

In [15]:
df.to_csv('fires_in_ph_clean_4states.csv', index=False)