In [3]:
import requests
import zipfile
import os
import pandas as pd


In [4]:
#pip install dask

In [5]:
# 远程文件的 URL
url = 'https://fema.gov/about/reports-and-data/openfema/nfirs_fire_hazmat_pdr_2022.zip'
# 本地保存的文件名
local_zip_file = 'nfirs_fire_hazmat_pdr_2022.zip'
# 第一级解压缩后的文件夹名
first_extracted_folder = 'nfirs_fire_hazmat_pdr_2022'
# 第二级解压缩后的文件夹名
second_extracted_folder = os.path.join(first_extracted_folder, 'NFIRS_FIRES_2022_102623')
# 需要读取的文件路径
incidentaddress_file = 'NFIRS_FIRES_2022_102623/incidentaddress.txt'
basicincident_file ='NFIRS_FIRES_2022_102623/basicincident.txt'
ffcasualty_file ='NFIRS_FIRES_2022_102623/ffcasualty.txt'
civiliancasualty_file ='NFIRS_FIRES_2022_102623/civiliancasualty.txt'
fireincident_file ='NFIRS_FIRES_2022_102623/fireincident.txt'

# 输出的 CSV 文件路径
#output_csv_file = os.path.join(second_extracted_folder, 'NFIRS_FIRES_2022_102623', 'incidentaddress.csv')


In [6]:
# 下载文件
response = requests.get(url)
with open(local_zip_file, 'wb') as f:
    f.write(response.content)
print(f"Downloaded {local_zip_file}")


Downloaded nfirs_fire_hazmat_pdr_2022.zip


In [7]:
# 解压缩第一级文件
with zipfile.ZipFile(local_zip_file, 'r') as zip_ref:
    zip_ref.extractall(first_extracted_folder)
print(f"Extracted files to {first_extracted_folder}")


Extracted files to nfirs_fire_hazmat_pdr_2022


In [8]:
# 解压缩第二级文件
second_zip_file = os.path.join(first_extracted_folder, 'nfirs_fire_hazmat_pdr_2022', 'NFIRS_FIRES_2022_102623.zip')
with zipfile.ZipFile(second_zip_file, 'r') as zip_ref:
    zip_ref.extractall(second_extracted_folder)
print(f"Extracted files to {second_extracted_folder}")


Extracted files to nfirs_fire_hazmat_pdr_2022/NFIRS_FIRES_2022_102623


In [9]:
# 列出解压后的目录结构
def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print(f"{subindent}{f}")

list_files(second_extracted_folder)

NFIRS_FIRES_2022_102623/
    incidentaddress.txt
    fireincident.txt
    civiliancasualty.txt
    hazchem.txt
    arson.txt
    basicincident.txt
    hazmat.txt
    wildlands.txt
    arsonjuvsub.txt
    ems.txt
    hazmatequipinvolved.txt
    codelookup.txt
    ffcasualty.txt
    fdheader.txt
    hazmobprop.txt
    basicaid.txt
    ffequipfail.txt
    filtered_basicincident.csv
    arsonagencyreferal.txt


In [16]:
import os
import dask.dataframe as dd

# 定义文件路径
ffcasualty_path = os.path.join(second_extracted_folder, 'ffcasualty.txt')
civiliancasualty_path = os.path.join(second_extracted_folder, 'civiliancasualty.txt')

# 定义数据类型
dtypes = {
    'INCIDENT_KEY': 'object',
    'STATE': 'object',
    'FDID': 'object',
    'INC_DATE': 'object',
    'INC_NO': 'object',
    'EXP_NO': 'object',
    'FF_SEQ_NO': 'object',
    'VERSION': 'float64',
    'GENDER': 'object',
    'CAREER': 'float64',
    'PABI': 'object',
    'CAUSE': 'object',
    'FACTOR': 'object',
    'OBJECT': 'object',
    'WIO': 'object',
    'RELATION': 'object',
    'STORY': 'object',
    'LOCATION': 'object',
    'VEHICLE': 'object',
    'PROT_EQP': 'object',
    'SEQ_NUMBER': 'object',
    'AGE': 'float64',
    'FACT_INJ3': 'object',
    'ACTIV_INJ': 'object',
    'LOC_INC': 'object',
    'GEN_LOC_IN': 'object',
    'STORY_INC': 'object',
    'STORY_INJ': 'object',
    'SPC_LOC_IN': 'object',
    'PRIM_SYMP': 'object',
    'BODY_PART': 'object',
    'CC_DISPOS': 'object',
    'ACTIVITY': 'object',
    'PHYS_COND': 'object',
    'SEVERITY': 'object',
    'SYMPTOM': 'object',
    'CAUSE_INJ': 'object',
       'RACE': 'object',
       'SEV': 'object'
}

def read_large_file_with_dask(file_path, delimiter='^', encoding='latin1', dtypes=None):
    try:
        return dd.read_csv(file_path, delimiter=delimiter, encoding=encoding, dtype=dtypes)
    except Exception as e:
        print(f"Error reading the file {file_path}: {e}")
        return None

# 检查文件路径并读取文件
files = [
    ('ffcasualty', ffcasualty_path),
    ('civiliancasualty', civiliancasualty_path)
]

data_frames = {}

for name, path in files:
    if os.path.exists(path):
        print(f"Reading {name} from {path}...")
        data_frames[name] = read_large_file_with_dask(path, dtypes=dtypes)
    else:
        print(f"Error: The file at path {path} does not exist.")

# 检查是否成功读取数据框
if not data_frames:
    print("No data frames were loaded. Please check the file paths and try again.")
else:
    # 合并数据框
    merged_df = None
    for key, df in data_frames.items():
        if df is not None:
            if merged_df is None:
                merged_df = df
            else:
                merged_df = dd.merge(merged_df, df, on='INCIDENT_KEY', how='outer', suffixes=('', f'_{key}'))

    if merged_df is not None:
        # 将合并后的数据框保存到一个新文件
        print("Merged Data Frame:")
        print(merged_df.head())
        
        output_path = os.path.join(second_extracted_folder, 'merged_data.csv')
        merged_df.compute().to_csv(output_path, index=False)
        print(f"Merged data saved to {output_path}")
    else:
        print("Merging data frames failed. Please check the input files for consistency.")


Reading ffcasualty from nfirs_fire_hazmat_pdr_2022/NFIRS_FIRES_2022_102623/ffcasualty.txt...
Reading civiliancasualty from nfirs_fire_hazmat_pdr_2022/NFIRS_FIRES_2022_102623/civiliancasualty.txt...
Merged Data Frame:
                 INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO EXP_NO FF_SEQ_NO  \
0  AK_11100_05222022_223075_0    AK  11100  05222022  223075      0         1   
1  AK_11100_11262022_227808_0    AK  11100  11262022  227808      0         1   
2  AK_11100_11262022_227808_0    AK  11100  11262022  227808      0         2   
3  AK_11100_11262022_227808_0    AK  11100  11262022  227808      0         3   
4  AK_11100_12202022_228285_0    AK  11100  12202022  228285      0         1   

   VERSION GENDER  CAREER  ...  FACT_INJ3  ACTIV_INJ  LOC_INC  GEN_LOC_IN  \
0      5.0      1     1.0  ...       <NA>       <NA>     <NA>        <NA>   
1      5.0      2     1.0  ...       <NA>       <NA>     <NA>        <NA>   
2      5.0      1     1.0  ...       <NA>       <NA>     <NA>    

  elif is_datetime64tz_dtype(block):
  elif is_datetime64tz_dtype(block):
  elif is_datetime64tz_dtype(block):
  elif is_datetime64tz_dtype(block):
  elif is_datetime64tz_dtype(block):
  elif is_datetime64tz_dtype(block):


Merged data saved to nfirs_fire_hazmat_pdr_2022/NFIRS_FIRES_2022_102623/merged_data.csv


NameError: name 'merged_data' is not defined

In [None]:
# 洛杉矶县城市列表
la_cities = [
    "Agoura Hills", "Alhambra", "Arcadia", "Artesia", "Avalon", "Azusa", "Baldwin Park", "Bell", "Bell Gardens", "Bellflower",
    "Beverly Hills", "Bradbury", "Burbank", "Calabasas", "Carson", "Cerritos", "Claremont", "Commerce", "Compton", "Covina",
    "Cudahy", "Culver City", "Diamond Bar", "Downey", "Duarte", "El Monte", "El Segundo", "Gardena", "Glendale", "Glendora",
    "Hawaiian Gardens", "Hawthorne", "Hermosa Beach", "Hidden Hills", "Huntington Park", "Industry", "Inglewood", "Irwindale",
    "La Cañada Flintridge", "La Habra Heights", "La Mirada", "La Puente", "La Verne", "Lakewood", "Lancaster", "Lawndale",
    "Lomita", "Long Beach", "Los Angeles", "Lynwood", "Malibu", "Manhattan Beach", "Maywood", "Monrovia", "Montebello",
    "Monterey Park", "Norwalk", "Palmdale", "Palos Verdes Estates", "Paramount", "Pasadena", "Pico Rivera", "Pomona",
    "Rancho Palos Verdes", "Redondo Beach", "Rolling Hills", "Rolling Hills Estates", "Rosemead", "San Dimas", "San Fernando",
    "San Gabriel", "San Marino", "Santa Clarita", "Santa Fe Springs", "Santa Monica", "Sierra Madre", "Signal Hill",
    "South El Monte", "South Gate", "South Pasadena", "Temple City", "Torrance", "Vernon", "Walnut", "West Covina",
    "West Hollywood", "Westlake Village", "Whittier"
]

In [10]:
import os
import pandas as pd

# 定义文件路径
incidentaddress_path = os.path.join(second_extracted_folder, 'incidentaddress.txt')
basicincident_path = os.path.join(second_extracted_folder,  'basicincident.txt')
ffcasualty_path = os.path.join(second_extracted_folder, 'ffcasualty.txt')
civiliancasualty_path = os.path.join(second_extracted_folder,  'civiliancasualty.txt')
fireincident_path = os.path.join(second_extracted_folder, 'fireincident.txt')

def read_large_file(file_path, delimiter='^', encoding='latin1'):
    try:
        chunk_size = 10000  # 每次读取10000行
        reader = pd.read_csv(file_path, delimiter=delimiter, encoding=encoding, chunksize=chunk_size)
        for chunk in reader:
            print(chunk.head(10))
            # 在这里可以对每个chunk进行处理
    except UnicodeDecodeError as e:
        print(f"Error reading the file {file_path} with 'latin1' encoding: {e}")
    except Exception as e:
        print(f"Error reading the file {file_path}: {e}")


# 读取第三个文件
if os.path.exists(ffcasualty_path):
    print("Firefighter casualty Data:")
    read_large_file(ffcasualty_path)
else:
    print(f"Error: The file at path {ffcasualty_path} does not exist.")

# 读取第一个文件
if os.path.exists(civiliancasualty_path):
    print("Civilian casualty Data:")
    read_large_file(civiliancasualty_path)
else:
    print(f"Error: The file at path {civiliancasualty_path} does not exist.")

# 读取第一个文件
if os.path.exists(fireincident_path):
    print("Fire Incident Data:")
    read_large_file(fireincident_path)
else:
    print(f"Error: The file at path {fireincident_path} does not exist.")

# 读取第一个文件
if os.path.exists(incidentaddress_path):
    print("Incident Address Data:")
    read_large_file(incidentaddress_path)
else:
    print(f"Error: The file at path {incidentaddress_path} does not exist.")

# 读取第二个文件
if os.path.exists(basicincident_path):
    print("Basic Incident Data:")
    read_large_file(basicincident_path)
else:
    print(f"Error: The file at path {basicincident_path} does not exist.")


Firefighter casualty Data:
                  INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
0   AK_11100_05222022_223075_0    AK  11100   5222022  223075       0   
1   AK_11100_11262022_227808_0    AK  11100  11262022  227808       0   
2   AK_11100_11262022_227808_0    AK  11100  11262022  227808       0   
3   AK_11100_11262022_227808_0    AK  11100  11262022  227808       0   
4   AK_11100_12202022_228285_0    AK  11100  12202022  228285       0   
5     AK_13000_10132022_2088_0    AK  13000  10132022    2088       0   
6     AK_13000_10132022_2088_0    AK  13000  10132022    2088       0   
7  AK_15100_05032022_0000009_0    AK  15100   5032022       9       0   
8  AK_19702_10202022_0102022_0    AK  19702  10202022  102022       0   
9  AK_19702_10202022_0102022_0    AK  19702  10202022  102022       0   

   FF_SEQ_NO  VERSION  GENDER  CAREER  ...  PABI  CAUSE  FACTOR  OBJECT WIO  \
0          1      5.0       1     1.0  ...    61      7     NaN      27   6   
1          

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                 INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  VERSION  \
0   AK_11100_01012022_22004_0    AK  11100   1012022   22004       0      5.0   
1   AK_11100_01082022_22170_0    AK  11100   1082022   22170       0      5.0   
2   AK_11100_02132022_22985_0    AK  11100   2132022   22985       0      5.0   
3  AK_11100_02202022_221148_0    AK  11100   2202022  221148       0      5.0   
4  AK_11100_02242022_221244_0    AK  11100   2242022  221244       0      5.0   
5  AK_11100_03072022_221457_0    AK  11100   3072022  221457       0      5.0   
6  AK_11100_03082022_221487_0    AK  11100   3082022  221487       0      5.0   
7  AK_11100_03182022_221666_0    AK  11100   3182022  221666       0      5.0   
8  AK_11100_03272022_221850_0    AK  11100   3272022  221850       0      5.0   
9  AK_11100_04122022_222184_0    AK  11100   4122022  222184       0      5.0   

   NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER DET_OPERAT  \
0       0.0       N         0.0  ... 

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                      INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
30000  AR_72600_10222022_0010786_0    AR  72600  10222022   10786       0   
30001  AR_72600_10232022_0010839_0    AR  72600  10232022   10839       0   
30002  AR_72600_10292022_0011054_0    AR  72600  10292022   11054       0   
30003  AR_72600_10302022_0011123_0    AR  72600  10302022   11123       0   
30004  AR_72600_10302022_0011131_0    AR  72600  10302022   11131       0   
30005  AR_72600_11012022_0011190_0    AR  72600  11012022   11190       0   
30006  AR_72600_11022022_0011232_0    AR  72600  11022022   11232       0   
30007  AR_72600_11042022_0011291_0    AR  72600  11042022   11291       0   
30008  AR_72600_11052022_0011341_0    AR  72600  11052022   11341       0   
30009  AR_72600_11062022_0011392_0    AR  72600  11062022   11392       0   

       VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
30000      5.0       1.0       N         1.0  ...       NaN       NaN   
30001 

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                      INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
70000  CA_19200_11182022_0014925_0    CA  19200  11182022   14925       0   
70001  CA_19200_11202022_0014997_0    CA  19200  11202022   14997       0   
70002  CA_19200_11232022_0015119_0    CA  19200  11232022   15119       0   
70003  CA_19200_11262022_0015293_0    CA  19200  11262022   15293       0   
70004  CA_19200_11282022_0015392_0    CA  19200  11282022   15392       0   
70005  CA_19200_12062022_0015794_0    CA  19200  12062022   15794       0   
70006  CA_19200_12072022_0015828_0    CA  19200  12072022   15828       0   
70007  CA_19200_12082022_0015882_0    CA  19200  12082022   15882       0   
70008  CA_19200_12152022_0016202_0    CA  19200  12152022   16202       0   
70009  CA_19200_12172022_0016308_0    CA  19200  12172022   16308       0   

       VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
70000      5.0       NaN       Y         NaN  ...       NaN       NaN   
70001 

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                     INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
110000  CA_54045_01082022_22311_0    CA  54045   1082022   22311       0   
110001  CA_54045_01082022_22312_0    CA  54045   1082022   22312       0   
110002  CA_54045_01082022_22317_0    CA  54045   1082022   22317       0   
110003  CA_54045_01082022_22325_0    CA  54045   1082022   22325       0   
110004  CA_54045_01142022_22611_0    CA  54045   1142022   22611       0   
110005  CA_54045_01142022_22630_0    CA  54045   1142022   22630       0   
110006  CA_54045_01172022_22791_0    CA  54045   1172022   22791       0   
110007  CA_54045_01202022_22931_0    CA  54045   1202022   22931       0   
110008  CA_54045_01212022_22944_0    CA  54045   1212022   22944       0   
110009  CA_54045_01212022_22944_1    CA  54045   1212022   22944       1   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
110000      5.0       3.0       N         NaN  ...       NaN       NaN   
110001      5.0

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                      INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
190000  IA_78004_12242022_221684_0    IA  78004  12242022  221684       0   
190001  IA_78004_12252022_221709_0    IA  78004  12252022  221709       0   
190002   IA_78005_05102022_22080_0    IA  78005   5102022   22080       0   
190003   IA_78005_07272022_22118_0    IA  78005   7272022   22118       0   
190004   IA_78006_10062022_22016_0    IA  78006  10062022   22016       0   
190005   IA_78007_02012022_22045_0    IA  78007   2012022   22045       0   
190006   IA_78007_03162022_22112_0    IA  78007   3162022   22112       0   
190007   IA_78007_03212022_22119_0    IA  78007   3212022   22119       0   
190008   IA_78007_07022022_22282_0    IA  78007   7022022   22282       0   
190009   IA_78007_07232022_22300_0    IA  78007   7232022   22300       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
190000      5.0     107.0       N         1.0  ...       NaN       NaN   
1900

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                       INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
230000  KS_DG101_11162022_0013944_0    KS  DG101  11162022   13944       0   
230001  KS_DG101_11162022_0013945_0    KS  DG101  11162022   13945       0   
230002  KS_DG101_11202022_0014100_0    KS  DG101  11202022   14100       0   
230003  KS_DG101_11212022_0014103_0    KS  DG101  11212022   14103       0   
230004  KS_DG101_11282022_0014389_0    KS  DG101  11282022   14389       0   
230005  KS_DG101_12012022_0014516_0    KS  DG101  12012022   14516       0   
230006  KS_DG101_12032022_0014633_0    KS  DG101  12032022   14633       0   
230007  KS_DG101_12032022_0014644_0    KS  DG101  12032022   14644       0   
230008  KS_DG101_12052022_0014697_0    KS  DG101  12052022   14697       0   
230009  KS_DG101_12082022_0014848_0    KS  DG101  12082022   14848       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
230000      5.0       0.0       Y         0.0  ...       NaN       

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                    INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
270000  MA_21244_07232022_3448_0    MA  21244   7232022    3448       0   
270001  MA_21244_07242022_3467_0    MA  21244   7242022    3467       0   
270002  MA_21244_07242022_3470_0    MA  21244   7242022    3470       0   
270003  MA_21244_07242022_3480_0    MA  21244   7242022    3480       0   
270004  MA_21244_07252022_3506_0    MA  21244   7252022    3506       0   
270005  MA_21244_07262022_3515_0    MA  21244   7262022    3515       0   
270006  MA_21244_07262022_3525_0    MA  21244   7262022    3525       0   
270007  MA_21244_07272022_3536_0    MA  21244   7272022    3536       0   
270008  MA_21244_07292022_3573_0    MA  21244   7292022    3573       0   
270009  MA_21244_07292022_3576_0    MA  21244   7292022    3576       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
270000      5.0       0.0       Y         0.0  ...       NaN       NaN   
270001      5.0       0.0 

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                     INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
310000  MN_31110_11272022_22043_0    MN  31110  11272022   22043       0   
310001  MN_31110_12172022_22060_0    MN  31110  12172022   22060       0   
310002  MN_31111_01302022_22011_0    MN  31111   1302022   22011       0   
310003  MN_31111_09222022_22035_0    MN  31111   9222022   22035       0   
310004  MN_31111_11132022_22040_0    MN  31111  11132022   22040       0   
310005  MN_31111_12202022_23003_0    MN  31111  12202022   23003       0   
310006  MN_31111_12202022_23004_0    MN  31111  12202022   23004       0   
310007  MN_31112_01072022_23002_0    MN  31112   1072022   23002       0   
310008  MN_31112_03192022_23018_0    MN  31112   3192022   23018       0   
310009  MN_31112_05222022_23026_0    MN  31112   5222022   23026       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
310000      5.0       NaN       N         NaN  ...       NaN       NaN   
310001      5.0

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                       INCIDENT_KEY STATE  FDID  INC_DATE  INC_NO  EXP_NO  \
350000  NC_07101_12302022_0000287_0    NC  7101  12302022     287       0   
350001     NC_07102_01212022_0026_0    NC  7102   1212022      26       0   
350002     NC_07102_02092022_0069_0    NC  7102   2092022      69       0   
350003     NC_07102_02092022_0070_0    NC  7102   2092022      70       0   
350004     NC_07102_02162022_0081_0    NC  7102   2162022      81       0   
350005     NC_07102_02192022_0087_0    NC  7102   2192022      87       0   
350006     NC_07102_02192022_0088_0    NC  7102   2192022      88       0   
350007     NC_07102_02202022_0091_0    NC  7102   2202022      91       0   
350008     NC_07102_02212022_0093_0    NC  7102   2212022      93       0   
350009     NC_07102_02232022_0096_0    NC  7102   2232022      96       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
350000      5.0       0.0       Y         0.0  ...       NaN       NaN   
3500

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                   INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
390000  NY_52016_04292022_306_0    NY  52016   4292022     306       0   
390001  NY_52016_08262022_652_0    NY  52016   8262022     652       0   
390002  NY_52016_08312022_662_0    NY  52016   8312022     662       0   
390003  NY_52016_09242022_726_0    NY  52016   9242022     726       0   
390004  NY_52016_09282022_742_0    NY  52016   9282022     742       0   
390005  NY_52016_10132022_796_0    NY  52016  10132022     796       0   
390006  NY_52016_10142022_797_0    NY  52016  10142022     797       0   
390007  NY_52016_10172022_805_0    NY  52016  10172022     805       0   
390008  NY_52016_10172022_806_0    NY  52016  10172022     806       0   
390009  NY_52016_10202022_812_0    NY  52016  10202022     812       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
390000      5.0       6.0       N         NaN  ...       NaN       NaN   
390001      5.0       4.0       N    

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                     INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
430000  OK_55013_10172022_72769_0    OK  55013  10172022   72769       0   
430001  OK_55013_10172022_72839_0    OK  55013  10172022   72839       0   
430002  OK_55013_10172022_72861_0    OK  55013  10172022   72861       0   
430003  OK_55013_10172022_72882_0    OK  55013  10172022   72882       0   
430004  OK_55013_10172022_72885_0    OK  55013  10172022   72885       0   
430005  OK_55013_10172022_72896_0    OK  55013  10172022   72896       0   
430006  OK_55013_10172022_72911_0    OK  55013  10172022   72911       0   
430007  OK_55013_10172022_72912_0    OK  55013  10172022   72912       0   
430008  OK_55013_10172022_72925_0    OK  55013  10172022   72925       0   
430009  OK_55013_10172022_72927_0    OK  55013  10172022   72927       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
430000      5.0       0.0       Y         0.0  ...       NaN       NaN   
430001      5.0

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                       INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
470000  PA_51001_01012022_0011058_0    PA  51001   1012022   11058       0   
470001  PA_51001_01012022_0011081_0    PA  51001   1012022   11081       0   
470002  PA_51001_01022022_0020039_0    PA  51001   1022022   20039       0   
470003  PA_51001_01022022_0020081_0    PA  51001   1022022   20081       0   
470004  PA_51001_01022022_0020265_0    PA  51001   1022022   20265       0   
470005  PA_51001_01022022_0020494_0    PA  51001   1022022   20494       0   
470006  PA_51001_01022022_0020505_0    PA  51001   1022022   20505       0   
470007  PA_51001_01022022_0020602_0    PA  51001   1022022   20602       0   
470008  PA_51001_01022022_0020632_0    PA  51001   1022022   20632       0   
470009  PA_51001_01022022_0020704_0    PA  51001   1022022   20704       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
470000      5.0         0       Y           0  ...       NaN       

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                       INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
550000  TX_GD402_09092022_0000226_0    TX  GD402   9092022     226       0   
550001  TX_GD402_09142022_0000002_0    TX  GD402   9142022       2       0   
550002  TX_GD402_09242022_0000232_0    TX  GD402   9242022     232       0   
550003  TX_GD402_09282022_0000223_0    TX  GD402   9282022     223       0   
550004  TX_GD402_09302022_0000219_0    TX  GD402   9302022     219       0   
550005  TX_GD402_09302022_0000224_0    TX  GD402   9302022     224       0   
550006  TX_GD402_10022022_0000003_0    TX  GD402  10022022       3       0   
550007  TX_GD402_10042022_0000005_0    TX  GD402  10042022       5       0   
550008  TX_GD402_10062022_0000236_0    TX  GD402  10062022     236       0   
550009  TX_GD402_10082022_0000237_0    TX  GD402  10082022     237       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
550000      5.0       1.0       N         0.0  ...       NaN       

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                       INCIDENT_KEY STATE   FDID  INC_DATE   INC_NO  EXP_NO  \
590000   TX_XC601_06272022_020071_0    TX  XC601   6272022    20071       0   
590001  TX_XC601_06272022_2468324_0    TX  XC601   6272022  2468324       0   
590002  TX_XC601_06272022_2468330_0    TX  XC601   6272022  2468330       0   
590003   TX_XC601_06282022_020198_0    TX  XC601   6282022    20198       0   
590004   TX_XC601_07012022_020619_0    TX  XC601   7012022    20619       0   
590005   TX_XC601_07022022_020724_0    TX  XC601   7022022    20724       0   
590006   TX_XC601_07032022_020843_0    TX  XC601   7032022    20843       0   
590007   TX_XC601_07042022_020867_0    TX  XC601   7042022    20867       0   
590008   TX_XC601_07042022_020908_0    TX  XC601   7042022    20908       0   
590009   TX_XC601_07042022_020940_0    TX  XC601   7042022    20940       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
590000      5.0       0.0       Y         0.0  ...      

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


                   INCIDENT_KEY STATE   FDID  INC_DATE  INC_NO  EXP_NO  \
620000  WA_29D03_06302022_260_0    WA  29D03   6302022     260       0   
620001  WA_29D03_07042022_265_0    WA  29D03   7042022     265       0   
620002  WA_29D03_07082022_273_0    WA  29D03   7082022     273       0   
620003  WA_29D03_07222022_292_0    WA  29D03   7222022     292       0   
620004  WA_29D03_08172022_340_0    WA  29D03   8172022     340       0   
620005  WA_29D03_08252022_351_0    WA  29D03   8252022     351       0   
620006  WA_29D03_08312022_361_0    WA  29D03   8312022     361       0   
620007  WA_29D03_08312022_363_0    WA  29D03   8312022     363       0   
620008  WA_29D03_09052022_367_0    WA  29D03   9052022     367       0   
620009  WA_29D03_09202022_385_0    WA  29D03   9202022     385       0   

        VERSION  NUM_UNIT NOT_RES  BLDG_INVOL  ...  DET_TYPE DET_POWER  \
620000      5.0       0.0       Y         0.0  ...       NaN       NaN   
620001      5.0       0.0       Y    

In [None]:
import os
import pandas as pd

# 定义文件路径
incidentaddress_path = os.path.join(second_extracted_folder, 'incidentaddress.txt')
basicincident_path = os.path.join(second_extracted_folder,  'basicincident.txt')
ffcasualty_path = os.path.join(second_extracted_folder, 'ffcasualty.txt')
civiliancasualty_path = os.path.join(second_extracted_folder,  'civiliancasualty.txt')
fireincident_path = os.path.join(second_extracted_folder, 'fireincident.txt')

def read_large_file(file_path, delimiter='^', encoding='latin1'):
    try:
        chunk_size = 10000  # 每次读取10000行
        chunks = []
        reader = pd.read_csv(file_path, delimiter=delimiter, encoding=encoding, chunksize=chunk_size)
        for chunk in reader:
            chunks.append(chunk)
        return pd.concat(chunks, ignore_index=True)
    except UnicodeDecodeError as e:
        print(f"Error reading the file {file_path} with 'latin1' encoding: {e}")
        return None
    except Exception as e:
        print(f"Error reading the file {file_path}: {e}")
        return None

# 读取所有文件并存储在字典中
data_frames = {}

if os.path.exists(ffcasualty_path):
    data_frames['ffcasualty'] = read_large_file(ffcasualty_path)
else:
    print(f"Error: The file at path {ffcasualty_path} does not exist.")

if os.path.exists(civiliancasualty_path):
    data_frames['civiliancasualty'] = read_large_file(civiliancasualty_path)
else:
    print(f"Error: The file at path {civiliancasualty_path} does not exist.")

if os.path.exists(fireincident_path):
    data_frames['fireincident'] = read_large_file(fireincident_path)
else:
    print(f"Error: The file at path {fireincident_path} does not exist.")

if os.path.exists(incidentaddress_path):
    data_frames['incidentaddress'] = read_large_file(incidentaddress_path)
else:
    print(f"Error: The file at path {incidentaddress_path} does not exist.")

if os.path.exists(basicincident_path):
    data_frames['basicincident'] = read_large_file(basicincident_path)
else:
    print(f"Error: The file at path {basicincident_path} does not exist.")

# 检查所有数据帧是否读取成功
for key, df in data_frames.items():
    if df is None:
        print(f"Failed to read the data frame for {key}")
    else:
        print(f"Successfully read the data frame for {key}")

# 合并数据框
merged_df = None
for key, df in data_frames.items():
    if merged_df is None:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on='INCIDENT_KEY', how='outer')

# 打印合并后的数据框的前几行
print("Merged Data Frame:")
print(merged_df.head())

# 保存合并后的数据框到一个新文件
output_path = os.path.join(second_extracted_folder, 'merged_data.csv')
merged_df.to_csv(output_path, index=False, encoding='latin1')
print(f"Merged data saved to {output_path}")
