In [None]:
import os
import pandas as pd

# 定义根目录和年份范围
csv_directory = 'DATA_nfirs2000-2021'
txt_directory = 'DATA_nfirs2000-2021'
output_directory = './processed_data_inc'
years_csv = range(2005, 2012)
years_txt = range(2012, 2023)
selected_columns = ["INC_TYPE", "STATE"]

# 输出文件路径
combined_file_path = 'combined_inc_type_data.csv'

# 删除已存在的输出文件
if os.path.exists(combined_file_path):
    os.remove(combined_file_path)

# 读取2005到2011年的CSV数据
for year in years_csv:
    csv_file_path = os.path.join(csv_directory, f'NFIRS{year}', 'basicincident.csv')
    if os.path.exists(csv_file_path):
        print(f"Reading {csv_file_path}")
        for chunk in pd.read_csv(csv_file_path, usecols=selected_columns, chunksize=5000):
            chunk = chunk[chunk['STATE'] == 'CA']
            chunk['year'] = year
            # 逐步保存数据块
            chunk[['year', 'INC_TYPE']].to_csv(combined_file_path, mode='a', index=False, header=not os.path.exists(combined_file_path))
    else:
        print(f"File not found: {csv_file_path}")

# 读取2012到2022年的TXT数据
for year in years_txt:
    input_file_path = os.path.join(txt_directory, f'NFIRS{year}', 'basicincident.txt')
    if os.path.exists(input_file_path):
        print(f"Processing {input_file_path}")
        for chunk in pd.read_csv(input_file_path, delimiter='^', encoding='latin1', usecols=selected_columns, chunksize=5000):
            chunk = chunk[chunk['STATE'] == 'CA']
            chunk['year'] = year
            # 逐步保存数据块
            chunk[['year', 'INC_TYPE']].to_csv(combined_file_path, mode='a', index=False, header=not os.path.exists(combined_file_path))
    else:
        print(f"File not found: {input_file_path}")

print("All data successfully read and saved to combined_inc_type_data.csv")


In [5]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# 从本地文件读取数据
inc_type_df = pd.read_csv('combined_inc_type_data.csv')
print("Data loaded from combined_inc_type_data.csv")

# 对每个年份的INC_TYPE进行统计
inc_type_counts = inc_type_df.groupby(['year', 'INC_TYPE']).size().unstack(fill_value=0)

# 输出统计结果
print(inc_type_counts)



Data loaded from combined_inc_type_data.csv
INC_TYPE   100    111   112   113  114  115  116  117   118  120  ...  461  \
year                                                              ...        
2005      3963  14084   943  6593  926   72  185   41  4836   79  ...  300   
2006      3337  11249   671  5321  869   36  132   40  4522   69  ...  179   
2007      5001  14619  1015  5876  916   32  163   37  4829   95  ...  250   

INC_TYPE   462   463  471  480  481  482   561   631  632  
year                                                       
2005      1080  8393  252  645  248   48  3763  1282   68  
2006      1251  4752  185  530  194   47  3494  1306   76  
2007      2274  4762  262  692  266   51  4727  2081   84  

[3 rows x 71 columns]


In [None]:
# 预测2023年的INC_TYPE数量
predictions = {}

for inc_type in inc_type_counts.columns:
    model = ARIMA(inc_type_counts[inc_type], order=(1, 1, 1))
    model_fit = model.fit()
    forecast = model_fit.forecast(steps=1)
    predictions[inc_type] = forecast[0]

# 输出预测结果
print("Predicted INC_TYPE counts for 2023:")
for inc_type, count in predictions.items():
    print(f"INC_TYPE {inc_type}: {count}")

# 可视化结果
inc_type_counts.plot(kind='line', figsize=(15, 7))
plt.title('INC_TYPE Counts by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.show()

In [2]:

# 对每个年份的INC_TYPE进行统计
inc_type_counts = inc_type_df.groupby(['year', 'INC_TYPE']).size().unstack(fill_value=0)

# 输出统计结果 
print(inc_type_counts)

# 预测2023年的INC_TYPE数量
predictions = {}

for inc_type in inc_type_counts.columns:
    model = ARIMA(inc_type_counts[inc_type], order=(1, 1, 1))
    model_fit = model.fit()
    forecast = model_fit.forecast(steps=1)
    predictions[inc_type] = forecast[0]

# 输出预测结果
print("Predicted INC_TYPE counts for 2023:")
for inc_type, count in predictions.items():
    print(f"INC_TYPE {inc_type}: {count}")

# 可视化结果
inc_type_counts.plot(kind='line', figsize=(15, 7))
plt.title('INC_TYPE Counts by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.show()


NameError: name 'inc_type_df' is not defined