In [4]:
import pandas as pd
import numpy as np
import math

df = pd.read_csv("raw_data.csv", usecols=["county_code","county", "disease","long","lat","onset_date","onset_year"])
df = df[df['county']!='未知']
df = df[df['onset_year']!='#VALUE!']
df = df[df['onset_date']!='NA']
df = df.sort_values(by=['disease','county','onset_date'])
df["onset_year"] = df["onset_year"].apply(lambda x: int(x))
df['onset_date'] = pd.to_datetime(df['onset_date'],format='%Y-%m-%d')

diseases = df['disease'].unique()
counties = df['county'].unique()

df_counties = df[['county','long','lat']].drop_duplicates()

df = df.value_counts(subset=['disease','county','onset_date']).reset_index(name='value')




In [5]:
import math
from tqdm.notebook import tqdm

distance_data = {'source':[], 'target':[], 'distance':[]}
for source in tqdm(counties):
    for target in counties:
        if source != target:
            df_t1 = df_counties[df_counties['county']==source]
            df_t2 = df_counties[df_counties['county']==target]
            x1 = df_t1['long'].values[0]
            y1 = df_t1['lat'].values[0]
            x2 = df_t2['long'].values[0]
            y2 = df_t2['lat'].values[0]
            distance = math.dist([x1,y1],[x2,y2])
            distance_data['source'].append(source)
            distance_data['target'].append(target)
            distance_data['distance'].append(distance)

df_distance = pd.DataFrame(data=distance_data)
df_distance = df_distance.sort_values(by=['source','target'])
df_distance.to_csv('distance_data.csv', index=False)

  0%|          | 0/134 [00:00<?, ?it/s]

In [None]:
from datetime import datetime, timedelta
import math
from tqdm.notebook import tqdm

raw_day_data = {
    'disease':[],
    'county':[],
    'onset_date':[],
    'value':[],
    'last_7_days':[],
    'last_7_days_neighbor':[]
}


for disease in tqdm(diseases):
    start_date = df[df['disease']==disease]['onset_date'].min()
    end_date = df[df['disease']==disease]['onset_date'].max()
    
    for county in tqdm(counties):

        df_temp = df[(df['county']==county) & (df['disease']==disease)]
        
        if df_temp.empty:
            days = (end_date-start_date).days+1
            raw_day_data['disease'].extend([disease]*days)
            raw_day_data['county'].extend([county]*days)
            raw_day_data['onset_date'].extend(pd.date_range(start_date,end_date).to_list())
            raw_day_data['value'].extend([0]*days)
            raw_day_data['last_7_days'].extend([0]*days)
            raw_day_data['last_7_days_neighbor'].extend([0]*days)
        else:
            for onset_date in pd.date_range(start_date,end_date).to_list():
                
                onset_date_7_days_ago = onset_date + timedelta(days=-7)
                
                value = df_temp[df_temp['onset_date']==onset_date]['value'].sum()
            
                last_7_days = df_temp[(df_temp['onset_date']<onset_date) & (df_temp['onset_date']>=onset_date_7_days_ago)]['value'].sum()
                
                raw_day_data['disease'].append(disease)
                raw_day_data['county'].append(county)
                raw_day_data['onset_date'].append(onset_date)
                raw_day_data['value'].append(value)
                raw_day_data['last_7_days'].append(last_7_days)
                raw_day_data['last_7_days_neighbor'].append(0)

df_day = pd.DataFrame(data=raw_day_data)
df_day = df_day.sort_values(by=['disease','county','onset_date'])

df_day.to_csv('raw_day_data_no_neighbor.csv', index=False)


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
df_day = pd.read_csv('raw_day_data_no_neighbor.csv')

df_day['onset_date'] = pd.to_datetime(df_day['onset_date'],format='%Y-%m-%d')

for disease in tqdm(diseases):
    start_date = df_day[df_day['disease']==disease]['onset_date'].min()
    end_date = df_day[df_day['disease']==disease]['onset_date'].max()
    for county in tqdm(counties):
        df_temp = df_day[(df_day['county']==county) & (df_day['disease']==disease)]
        distances = df_distance.loc[df_distance['source']==county,'distance'].to_numpy()
        
        for onset_date in tqdm(pd.date_range(start_date,end_date).to_list()):
            
            last_7_days = df_day[(df_day['county']!=county) & (df_day['disease']==disease) & (df_day['onset_date']==onset_date)]['value'].to_numpy()
            
            last_7_days_neighbor = np.sum(last_7_days/distances)
            
            df_day.loc[(df_day['county']==county) & (df_day['disease']==disease) & (df_day['onset_date']==onset_date),'last_7_days_neighbor'] = last_7_days_neighbor

df_day.to_csv('raw_day_data.csv', index=False)