In [None]:
import pandas as pd
import numpy as np
import math

# Load the case line list (one row per reported case).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up holding a mix of parsed
# numbers and raw strings.
# NOTE(review): presumably this is what the warning echoed under this cell was
# about -- confirm against the full warning text.
df = pd.read_csv(
    "raw_data.csv",
    usecols=["county_code", "county", "disease", "long", "lat", "onset_date", "onset_year"],
    low_memory=False,
)

# Drop unusable records in a single pass:
#   * county equal to '未知' ("unknown"),
#   * onset_year equal to '#VALUE!' (looks like a spreadsheet error marker),
#   * rows without an onset date.
# read_csv turns the text "NA" into NaN by default, so comparing against the
# string 'NA' alone never removes anything; test for missing values explicitly.
# The literal comparison is kept in case the file is ever read with
# keep_default_na=False.
is_usable = (
    (df['county'] != '未知')
    & (df['onset_year'] != '#VALUE!')
    & df['onset_date'].notna()
    & (df['onset_date'] != 'NA')
)
# .copy() gives an independent frame so the column assignments below do not
# operate on a slice of the raw frame.
df = df[is_usable].copy()

df = df.sort_values(by=['disease', 'county', 'onset_date'])
df["onset_year"] = df["onset_year"].apply(int)
df['onset_date'] = pd.to_datetime(df['onset_date'], format='%Y-%m-%d')

# Lookup arrays reused by the later cells.
diseases = df['disease'].unique()
counties = df['county'].unique()
county_codes = df['county_code'].unique()

# One row per distinct (county, long, lat) combination; later cells use the
# first row found for each county.
df_counties = df[['county', 'long', 'lat']].drop_duplicates()
df_counties.to_csv('df_counties.csv', index=False)

# Collapse the line list to daily case counts per disease and county.
# From here on `df` has the columns disease, county, onset_date, value.
df = df.value_counts(subset=['disease', 'county', 'onset_date']).reset_index(name='value')


(134,)
(135,)


  df = pd.read_csv("raw_data.csv", usecols=["county_code","county", "disease","long","lat","onset_date","onset_year"])


In [2]:
# Show the distinct disease labels found in the cleaned data.
diseases

array(['发热伴', '布鲁氏菌病', '手足口', '猩红热', '百日咳', '肾综合'], dtype=object)

In [7]:
import math
from tqdm.notebook import tqdm

# Build the full county-by-county distance table (every ordered pair,
# including each county paired with itself) and save it to distance_data.csv.

# Distance assigned to a county paired with itself. It is non-zero so that
# code dividing by the distance does not divide by zero.
SELF_DISTANCE = 0.1

# Resolve each county's coordinates once instead of filtering df_counties
# twice for every pair. Keeping the first row per county matches taking
# `.values[0]` of the filtered frame.
first_rows = df_counties.drop_duplicates(subset='county')
coords = dict(zip(first_rows['county'], zip(first_rows['long'], first_rows['lat'])))

distance_data = {'source': [], 'target': [], 'distance': []}
for source in tqdm(counties):
    source_xy = coords[source]
    for target in counties:
        if source != target:
            # NOTE(review): this is a straight-line distance in degree space
            # (long/lat treated as planar x/y), not a geodesic distance in km.
            distance = math.dist(source_xy, coords[target])
        else:
            distance = SELF_DISTANCE
        distance_data['source'].append(source)
        distance_data['target'].append(target)
        distance_data['distance'].append(distance)

df_distance = pd.DataFrame(data=distance_data)
df_distance = df_distance.sort_values(by=['source', 'target'])
df_distance.to_csv('distance_data.csv', index=False)

  0%|          | 0/134 [00:00<?, ?it/s]

In [9]:
from datetime import datetime, timedelta
import math
from tqdm.notebook import tqdm

# Build a dense daily panel: one row per (disease, county, calendar day) with
# the day's case count and the case count over a lagged look-back window.
raw_day_data = {
    'disease':[],
    'county':[],
    'onset_date':[],
    'value':[],
    'last_n_days':[],
    'last_n_days_neighbor':[]
}

# Length of the look-back window, in days, for each disease.
# NOTE(review): presumably incubation/latent periods -- confirm the source.
latent_periods = {
    '发热伴':14, 
    '布鲁氏菌病':21, 
    '手足口':7, 
    '猩红热':7, 
    '百日咳':21, 
    '肾综合':14
}

# The look-back window for day d is [d - LAG_DAYS - n_days, d - LAG_DAYS).
LAG_DAYS = 7


for disease in tqdm(diseases):
    df_disease = df[df['disease']==disease]

    # Align the panel to whole weeks: the start moves forward to the first
    # Monday on/after the earliest onset date, the end moves forward to the
    # first Sunday on/after the latest onset date.
    start_date = df_disease['onset_date'].min()
    start_date = start_date + timedelta(days=(7-start_date.weekday())%7)
    end_date = df_disease['onset_date'].max()
    end_date = end_date + timedelta(days=(-end_date.isoweekday())%7)
    n_days = latent_periods[disease]
    days = (end_date-start_date).days+1

    output_dates = pd.date_range(start_date,end_date).to_list()

    # Extend the calendar backwards by the longest reach of the window so that
    # cases reported before start_date still count towards the first windows.
    pad = LAG_DAYS + n_days
    padded_dates = pd.date_range(start_date - timedelta(days=pad), end_date)

    # Daily counts per county, computed once per disease instead of filtering
    # the frame again for every single day.
    daily_by_county = {
        name: group.groupby('onset_date')['value'].sum()
        for name, group in df_disease.groupby('county')
    }

    for county in tqdm(counties):
        county_counts = daily_by_county.get(county)

        if county_counts is None:
            # No cases of this disease in this county: all-zero rows.
            value = [0]*days
            last_n_days = [0]*days
        else:
            # Dense daily counts over the padded calendar (0 where no cases).
            daily = county_counts.reindex(padded_dates, fill_value=0).to_numpy()
            # running[k] = number of cases in the first k padded days, so a
            # window sum is the difference of two entries (exact integers).
            running = np.concatenate(([0], np.cumsum(daily)))

            value = daily[pad:]
            # Output day i sits at padded position i + pad; its window covers
            # padded positions [i, i + n_days).
            last_n_days = running[n_days:n_days+days] - running[:days]

        raw_day_data['disease'].extend([disease]*days)
        raw_day_data['county'].extend([county]*days)
        raw_day_data['onset_date'].extend(output_dates)
        raw_day_data['value'].extend(value)
        raw_day_data['last_n_days'].extend(last_n_days)
        # Placeholder; filled in by the neighbour-weighting step.
        raw_day_data['last_n_days_neighbor'].extend([0]*days)

df_day = pd.DataFrame(data=raw_day_data)
df_day = df_day.sort_values(by=['disease','county','onset_date'])

df_day.to_csv('raw_day_data_no_neighbor.csv', index=False)


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

In [10]:
# Fill `last_n_days_neighbor` with a distance-weighted sum of daily case counts
# across all counties, then save the completed panel to raw_day_data.csv.
df_day = pd.read_csv('raw_day_data_no_neighbor.csv')

df_day['onset_date'] = pd.to_datetime(df_day['onset_date'],format='%Y-%m-%d')

# The reshape and the positional assignment below both rely on rows being
# ordered by disease, then county, then date. The CSV is written in that
# order; sorting here makes the requirement explicit.
df_day = df_day.sort_values(by=['disease','county','onset_date']).reset_index(drop=True)

# The placeholder column is read back as integers, but the weighted sums are
# floats. Cast up front so the assignment does not write floats into an
# integer column.
df_day['last_n_days_neighbor'] = df_day['last_n_days_neighbor'].astype(float)

n_counties = len(counties)

for disease in tqdm(diseases):
    df_day_disease = df_day[df_day['disease']==disease]
    start_date = df_day_disease['onset_date'].min()
    end_date = df_day_disease['onset_date'].max()
    n_days = (end_date-start_date).days+1

    # (n_days, n_counties) matrix: one column per county in sorted county
    # order, one row per day. It is the same for every county, so build it
    # once per disease.
    # NOTE(review): this reads the same-day `value` column, not `last_n_days`,
    # and the county itself is included with distance 0.1 (weight 10). Confirm
    # that a same-day, self-inclusive sum is the intended "neighbor" feature.
    daily_values = np.reshape(df_day_disease['value'].to_numpy(),(n_days,n_counties),order='F')

    for county in tqdm(counties):
        # Distances from this county to every county, ordered by target name
        # so they line up with the matrix columns.
        distances = (
            df_distance.loc[df_distance['source']==county]
            .sort_values(by='target')['distance']
            .to_numpy()
        )

        weighted_sum = np.sum(daily_values/distances, axis=1)

        df_day.loc[(df_day['county']==county) & (df_day['disease']==disease),'last_n_days_neighbor'] = weighted_sum


df_day.to_csv('raw_day_data.csv', index=False)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  df_day.loc[(df_day['county']==county) & (df_day['disease']==disease),'last_n_days_neighbor'] = last_n_days_neighbor


  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

In [None]:
import pandas as pd
# Reload the finished daily panel and roll it up to yearly case totals.
df_day = pd.read_csv('raw_day_data.csv')

# Parse the dates once, then derive the calendar fields from the parsed series.
onset = pd.to_datetime(df_day['onset_date'], format='%Y-%m-%d')
df_day = df_day.assign(
    onset_date=onset,
    year=onset.dt.year,
    month=onset.dt.month,
)

# Total cases per disease, county and calendar year.
yearly_keys = ['disease', 'county', 'year']
df_year = df_day.groupby(yearly_keys, as_index=False)['value'].sum()



Unnamed: 0,disease,county,year,value
0,发热伴,东平县,2013,0
1,发热伴,东平县,2014,2
2,发热伴,东平县,2015,0
3,发热伴,东平县,2016,0
4,发热伴,东平县,2017,3


In [7]:
# Five disease/county/year combinations with the highest yearly case totals.
df_year.sort_values(by='value', ascending=False).iloc[:5]

Unnamed: 0,disease,county,year,value
4284,手足口,市北区,2023,2582
4148,手足口,城阳区,2023,2472
4092,手足口,历城区,2023,2156
4452,手足口,李沧区,2023,1931
4940,手足口,黄岛区,2023,1587
