In [4]:
import requests
import zipfile
import os
import pandas as pd
import dask.dataframe as dd


In [5]:
# 远程文件的 URL
url = 'https://fema.gov/about/reports-and-data/openfema/nfirs_fire_hazmat_pdr_2022.zip'
# 本地保存的文件名
local_zip_file = 'nfirs_fire_hazmat_pdr_2022.zip'
# 第一级解压缩后的文件夹名
first_extracted_folder = 'nfirs_fire_hazmat_pdr_2022'
# 第二级解压缩后的文件夹名
second_extracted_folder = os.path.join(first_extracted_folder, 'NFIRS_FIRES_2022_102623')
# 需要读取的文件路径
fireincident_file = 'NFIRS_FIRES_2022_102623/fireincident.txt'
basicincident_file = 'NFIRS_FIRES_2022_102623/basicincident.txt'
civiliancasualty_file = 'NFIRS_FIRES_2022_102623/civiliancasualty.txt'
output_count_civiliancasualty = os.path.join(second_extracted_folder, 'count_civiliancasualty.txt')


In [6]:
# 下载文件
response = requests.get(url)
with open(local_zip_file, 'wb') as f:
    f.write(response.content)
print(f"Downloaded {local_zip_file}")


Downloaded nfirs_fire_hazmat_pdr_2022.zip


In [7]:
# 解压缩第一级文件
with zipfile.ZipFile(local_zip_file, 'r') as zip_ref:
    zip_ref.extractall(first_extracted_folder)
print(f"Extracted files to {first_extracted_folder}")


Extracted files to nfirs_fire_hazmat_pdr_2022


In [8]:
# 解压缩第二级文件
second_zip_file = os.path.join(first_extracted_folder, 'nfirs_fire_hazmat_pdr_2022', 'NFIRS_FIRES_2022_102623.zip')
with zipfile.ZipFile(second_zip_file, 'r') as zip_ref:
    zip_ref.extractall(second_extracted_folder)
print(f"Extracted files to {second_extracted_folder}")


Extracted files to nfirs_fire_hazmat_pdr_2022/NFIRS_FIRES_2022_102623


In [9]:
# 列出解压后的目录结构
def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print(f"{subindent}{f}")

list_files(second_extracted_folder)

NFIRS_FIRES_2022_102623/
    incidentaddress.txt
    merged_data.csv
    fireincident.txt
    merged_casualty_data.csv
    civiliancasualty.txt
    hazchem.txt
    arson.txt
    basicincident.txt
    hazmat.txt
    wildlands.txt
    arsonjuvsub.txt
    ems.txt
    hazmatequipinvolved.txt
    codelookup.txt
    ffcasualty.txt
    fdheader.txt
    hazmobprop.txt
    basicaid.txt
    ffequipfail.txt
    filtered_basicincident.csv
    count_civiliancasualty.txt
    arsonagencyreferal.txt


In [10]:
# 洛杉矶县城市列表
la_cities = [
    "Agoura Hills", "Alhambra", "Arcadia", "Artesia", "Avalon", "Azusa", "Baldwin Park", "Bell", "Bell Gardens", "Bellflower",
    "Beverly Hills", "Bradbury", "Burbank", "Calabasas", "Carson", "Cerritos", "Claremont", "Commerce", "Compton", "Covina",
    "Cudahy", "Culver City", "Diamond Bar", "Downey", "Duarte", "El Monte", "El Segundo", "Gardena", "Glendale", "Glendora",
    "Hawaiian Gardens", "Hawthorne", "Hermosa Beach", "Hidden Hills", "Huntington Park", "Industry", "Inglewood", "Irwindale",
    "La Cañada Flintridge", "La Habra Heights", "La Mirada", "La Puente", "La Verne", "Lakewood", "Lancaster", "Lawndale",
    "Lomita", "Long Beach", "Los Angeles", "Lynwood", "Malibu", "Manhattan Beach", "Maywood", "Monrovia", "Montebello",
    "Monterey Park", "Norwalk", "Palmdale", "Palos Verdes Estates", "Paramount", "Pasadena", "Pico Rivera", "Pomona",
    "Rancho Palos Verdes", "Redondo Beach", "Rolling Hills", "Rolling Hills Estates", "Rosemead", "San Dimas", "San Fernando",
    "San Gabriel", "San Marino", "Santa Clarita", "Santa Fe Springs", "Santa Monica", "Sierra Madre", "Signal Hill",
    "South El Monte", "South Gate", "South Pasadena", "Temple City", "Torrance", "Vernon", "Walnut", "West Covina",
    "West Hollywood", "Westlake Village", "Whittier"
]

In [13]:
# 读取并处理 civiliancasualty.txt 文件
civiliancasualty_path = os.path.join(second_extracted_folder, 'civiliancasualty.txt')
output_count_civiliancasualty = os.path.join(second_extracted_folder, 'count_civiliancasualty.txt')

if os.path.exists(civiliancasualty_path):
    try:
        civiliancasualty_df = dd.read_csv(civiliancasualty_path, delimiter='^', encoding='latin1')
        civiliancasualty_count = civiliancasualty_df.groupby('INCIDENT_KEY').size().compute().reset_index(name='civil_num')
        civiliancasualty_count.to_csv(output_count_civiliancasualty, sep='^', encoding='latin1', index=False)
        print(f"Generated {output_count_civiliancasualty}")
    except UnicodeDecodeError as e:
        print(f"Error reading the file with 'latin1' encoding: {e}")
else:
    print(f"Error: The file at path {civiliancasualty_path} does not exist.")

# 合并 count_civiliancasualty.txt, fireincident.txt, basicincident.txt 文件
fireincident_path = os.path.join(second_extracted_folder, 'fireincident.txt')
basicincident_path = os.path.join(second_extracted_folder, 'basicincident.txt')
output_merged_file = os.path.join(second_extracted_folder, 'merged_data.csv')

if os.path.exists(fireincident_path) and os.path.exists(basicincident_path):
    try:
        fireincident_df = dd.read_csv(fireincident_path, delimiter='^', encoding='latin1')
        basicincident_df = dd.read_csv(basicincident_path, delimiter='^', encoding='latin1')
        count_civiliancasualty_df = dd.read_csv(output_count_civiliancasualty, delimiter='^', encoding='latin1')

        merged_df = dd.merge(fireincident_df, basicincident_df, on='INCIDENT_KEY', how='left')
        merged_df = dd.merge(merged_df, count_civiliancasualty_df, on='INCIDENT_KEY', how='left')

        merged_df.to_csv(output_merged_file, single_file=True, sep='^', encoding='latin1', index=False)
        print(f"Merged data saved to {output_merged_file}")
    except UnicodeDecodeError as e:
        print(f"Error reading one of the files with 'latin1' encoding: {e}")
else:
    print("Error: One of the required files does not exist.")

# 显示合并后的数据
if os.path.exists(output_merged_file):
    merged_df = pd.read_csv(output_merged_file, delimiter='^', encoding='latin1')
    print(merged_df.head())
else:
    print(f"Error: The file at path {output_merged_file} does not exist.")

ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+------------+---------+----------+
| Column     | Found   | Expected |
+------------+---------+----------+
| ACTIV_INJ  | object  | float64  |
| AGE        | float64 | int64    |
| BODY_PART  | object  | int64    |
| CAUSE_INJ  | object  | int64    |
| FDID       | object  | int64    |
| GENDER     | float64 | int64    |
| GEN_LOC_IN | object  | float64  |
| LOC_INC    | object  | float64  |
| PRIM_SYMP  | object  | float64  |
| RACE       | object  | float64  |
| SEV        | object  | int64    |
| SPC_LOC_IN | object  | float64  |
+------------+---------+----------+

The following columns also raised exceptions on conversion:

- ACTIV_INJ
  ValueError("could not convert string to float: 'U'")
- BODY_PART
  ValueError('cannot convert float NaN to integer')
- CAUSE_INJ
  ValueError('cannot convert float NaN to integer')
- FDID
  ValueError("invalid literal for int() with base 10: 'AA211'")
- GEN_LOC_IN
  ValueError("could not convert string to float: 'U'")
- LOC_INC
  ValueError("could not convert string to float: 'U'")
- PRIM_SYMP
  ValueError("could not convert string to float: 'UU'")
- RACE
  ValueError("could not convert string to float: 'U'")
- SEV
  ValueError("invalid literal for int() with base 10: 'U'")
- SPC_LOC_IN
  ValueError("could not convert string to float: 'UU'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'ACTIV_INJ': 'object',
       'AGE': 'float64',
       'BODY_PART': 'object',
       'CAUSE_INJ': 'object',
       'FDID': 'object',
       'GENDER': 'float64',
       'GEN_LOC_IN': 'object',
       'LOC_INC': 'object',
       'PRIM_SYMP': 'object',
       'RACE': 'object',
       'SEV': 'object',
       'SPC_LOC_IN': 'object'}

to the call to `read_csv`/`read_table`.

In [None]:
# 合并 count_civiliancasualty.txt, fireincident.txt, basicincident.txt 文件
fireincident_path = os.path.join(second_extracted_folder, 'fireincident.txt')
basicincident_path = os.path.join(second_extracted_folder, 'basicincident.txt')
output_merged_file = os.path.join(second_extracted_folder, 'merged_data.csv')

# 指定数据类型以避免混合类型警告
dtype_dict = {
    'INCIDENT_KEY': str,
    # 可以根据需要添加更多的列及其类型
}

if os.path.exists(fireincident_path) and os.path.exists(basicincident_path):
    try:
        chunk_size = 10000  # 分批处理的行数
        
        merged_df = pd.DataFrame()
        
        fireincident_chunks = pd.read_csv(fireincident_path, delimiter='^', encoding='latin1', dtype=dtype_dict, chunksize=chunk_size, low_memory=False)
        basicincident_chunks = pd.read_csv(basicincident_path, delimiter='^', encoding='latin1', dtype=dtype_dict, chunksize=chunk_size, low_memory=False)
        count_civiliancasualty_df = pd.read_csv(output_count_civiliancasualty, delimiter='^', encoding='latin1', dtype=dtype_dict)

        for fireincident_chunk in fireincident_chunks:
            for basicincident_chunk in basicincident_chunks:
                temp_df = fireincident_chunk.merge(basicincident_chunk, on='INCIDENT_KEY', how='left')
                # 移除全为空的列
                temp_df = temp_df.dropna(axis=1, how='all')
                merged_df = pd.concat([merged_df, temp_df], ignore_index=True)
        
        merged_df = merged_df.merge(count_civiliancasualty_df, on='INCIDENT_KEY', how='left')

        merged_df.to_csv(output_merged_file, index=False, sep='^', encoding='latin1')
        print(f"Merged data saved to {output_merged_file}")
    except UnicodeDecodeError as e:
        print(f"Error reading one of the files with 'latin1' encoding: {e}")
else:
    print("Error: One of the required files does not exist.")

# 显示合并后的数据
if os.path.exists(output_merged_file):
    merged_df = pd.read_csv(output_merged_file, delimiter='^', encoding='latin1')
    print(merged_df.head())
else:
    print(f"Error: The file at path {output_merged_file} does not exist.")