# Data Preprocess (New ver. 기간 ~5/19)

- country_info.csv: 국가영문이름, 대륙, 해외유입 확진자 방문여부, 확진자 수, iso코드, 인구, 면적
- roaming_all.csv: 날짜, iso코드, 도착일, 출발일, 수

### (1) roaming_all.csv 파일을 전처리하는 코드임
1. 질병관리본부에서 해외유입 확진자 발생했던 지역만 추림
2. arrival, departure 무시하고 return 날짜로 합침

In [1]:
import pandas as pd
from countryinfo import CountryInfo
from tqdm import tqdm

ci = pd.read_csv('country_info.csv')
roam = pd.read_csv('roaming_all.csv')
visit_iso_list = list(ci[ci.visit == 1].iso) # 해외확진자가 방문했던 국가들 리스트

In [2]:
ci.head()

Unnamed: 0,Country,country_lib,continent,visit,n_confirmed,iso,population,area
0,Argentina,Argentina,America,1,5371,AR,42669500,2780400
1,Australia,Australia,Australia,1,6913,AU,23696900,7692024
2,Austria,Austria,Europe,1,15752,AT,8527230,83871
3,Bangladesh,Bangladesh,Asia,0,12425,BD,157486000,147570
4,Belarus,Belarus,Europe,0,20168,BY,9475100,207600


In [3]:
# 3월부터 5월까지
date_range = pd.date_range(start='20200301', end='20200505')
date_list = date_range.strftime("%Y%m%d").tolist()

roam_new = []

for iso in tqdm(visit_iso_list):
    for date in date_list:
        target_df = roam[(roam.date==int(date)) & (roam.iso==iso.lower())]
        counts = target_df.counts.sum()
        roam_new.append([date,iso,counts])

100%|██████████| 48/48 [00:41<00:00,  1.16it/s]


In [4]:
roam_new_df = pd.DataFrame(roam_new, columns=['date','iso','counts'])
# roam_new_df.to_csv('roaming_preprocess.csv', index=False)

In [5]:
roam_new_df

Unnamed: 0,date,iso,counts
0,20200301,AR,31
1,20200302,AR,9
2,20200303,AR,25
3,20200304,AR,16
4,20200305,AR,18
...,...,...,...
3163,20200501,VN,58
3164,20200502,VN,17
3165,20200503,VN,52
3166,20200504,VN,91


테스트 및 시각화

In [None]:
roam_new_df = pd.read_csv('roaming_preprocess.csv')
roam_new_df[roam_new_df.iso == 'US'].counts.plot() # 미국 유입

### 로밍데이터 확보가 안됨 (5/5까지만 보유중)

---

### (2) pytrends로 검색어 트렌드 추출
- ref: https://towardsdatascience.com/google-trends-api-for-python-a84bc25db88f

In [1]:
from pytrends.request import TrendReq
import pandas as pd
import numpy as np
from tqdm import tqdm

ci = pd.read_csv('country_info.csv')
visit_iso_list = list(ci[ci.visit == 1].iso) # 해외확진자가 방문했던 국가들 리스트

date_range = pd.date_range(start='20200301', end='20200519')
date_list = date_range.strftime("%Y%m%d").tolist()

In [2]:
def getKeywordTrend(keyword):

    iso_keyword_list = []
    pytrend = TrendReq()
    
    for iso in tqdm(visit_iso_list):
        success = False
        while not success:
            pytrend.build_payload(kw_list=[keyword], geo=iso, timeframe='2020-03-01 2020-05-19')
            df = pytrend.interest_over_time()
            print(len(df), len(date_list))
            if len(df) == len(date_list):
                success = True
        iso_keyword_list.append(list(df[keyword]))
    
    return iso_keyword_list

#### keyword: coronavirus

In [3]:
iso_keyword_list = getKeywordTrend(keyword='coronavirus')

  2%|▏         | 1/48 [00:01<00:57,  1.23s/it]

80 80


  4%|▍         | 2/48 [00:02<00:57,  1.25s/it]

80 80


  6%|▋         | 3/48 [00:03<00:55,  1.23s/it]

80 80


  8%|▊         | 4/48 [00:04<00:53,  1.21s/it]

80 80


 10%|█         | 5/48 [00:05<00:48,  1.12s/it]

80 80


 12%|█▎        | 6/48 [00:06<00:47,  1.13s/it]

80 80


 15%|█▍        | 7/48 [00:08<00:46,  1.14s/it]

80 80


 17%|█▋        | 8/48 [00:09<00:49,  1.23s/it]

80 80


 19%|█▉        | 9/48 [00:10<00:48,  1.23s/it]

80 80


 21%|██        | 10/48 [00:13<01:00,  1.60s/it]

80 80


 23%|██▎       | 11/48 [00:14<00:54,  1.46s/it]

80 80


 25%|██▌       | 12/48 [00:15<00:48,  1.35s/it]

80 80


 27%|██▋       | 13/48 [00:16<00:45,  1.30s/it]

80 80


 29%|██▉       | 14/48 [00:17<00:44,  1.31s/it]

80 80


 31%|███▏      | 15/48 [00:18<00:39,  1.19s/it]

80 80


 33%|███▎      | 16/48 [00:19<00:36,  1.14s/it]

80 80


 35%|███▌      | 17/48 [00:20<00:34,  1.13s/it]

80 80


 38%|███▊      | 18/48 [00:27<01:19,  2.64s/it]

80 80


 40%|███▉      | 19/48 [00:28<01:01,  2.13s/it]

80 80


 42%|████▏     | 20/48 [00:29<00:51,  1.84s/it]

80 80


 44%|████▍     | 21/48 [00:30<00:42,  1.59s/it]

80 80


 46%|████▌     | 22/48 [00:31<00:38,  1.49s/it]

80 80


 48%|████▊     | 23/48 [00:32<00:33,  1.33s/it]

80 80


 50%|█████     | 24/48 [00:33<00:29,  1.23s/it]

80 80


 52%|█████▏    | 25/48 [00:34<00:27,  1.20s/it]

80 80


 54%|█████▍    | 26/48 [00:35<00:26,  1.19s/it]

80 80


 56%|█████▋    | 27/48 [00:36<00:24,  1.15s/it]

80 80


 58%|█████▊    | 28/48 [00:37<00:22,  1.14s/it]

80 80


 60%|██████    | 29/48 [00:39<00:21,  1.12s/it]

80 80


 62%|██████▎   | 30/48 [00:40<00:20,  1.14s/it]

80 80


 65%|██████▍   | 31/48 [00:41<00:19,  1.14s/it]

80 80


 67%|██████▋   | 32/48 [00:42<00:18,  1.15s/it]

80 80


 69%|██████▉   | 33/48 [00:43<00:16,  1.10s/it]

80 80


 71%|███████   | 34/48 [00:44<00:15,  1.11s/it]

80 80


 73%|███████▎  | 35/48 [00:45<00:15,  1.17s/it]

80 80


 75%|███████▌  | 36/48 [00:46<00:13,  1.11s/it]

80 80


 77%|███████▋  | 37/48 [00:48<00:13,  1.18s/it]

80 80


 79%|███████▉  | 38/48 [00:49<00:12,  1.21s/it]

80 80


 81%|████████▏ | 39/48 [00:50<00:10,  1.13s/it]

80 80


 83%|████████▎ | 40/48 [00:51<00:08,  1.09s/it]

80 80


 85%|████████▌ | 41/48 [00:52<00:07,  1.14s/it]

80 80


 88%|████████▊ | 42/48 [00:53<00:06,  1.13s/it]

80 80


 90%|████████▉ | 43/48 [00:54<00:05,  1.08s/it]

80 80


 92%|█████████▏| 44/48 [00:56<00:04,  1.18s/it]

80 80


 94%|█████████▍| 45/48 [00:57<00:03,  1.11s/it]

80 80


 96%|█████████▌| 46/48 [00:58<00:02,  1.21s/it]

80 80


 98%|█████████▊| 47/48 [00:59<00:01,  1.11s/it]

80 80


100%|██████████| 48/48 [01:00<00:00,  1.26s/it]

80 80





In [4]:
keyword_df = pd.DataFrame(np.transpose(iso_keyword_list), columns=visit_iso_list)
keyword_df['date'] = date_list 

In [6]:
# save
# keyword_df.to_csv('trend_coronavirus.csv')

#### keyword: korea

In [7]:
iso_keyword_list = getKeywordTrend(keyword='korea')

  2%|▏         | 1/48 [00:00<00:40,  1.15it/s]

80 80


  4%|▍         | 2/48 [00:01<00:40,  1.15it/s]

80 80


  6%|▋         | 3/48 [00:02<00:38,  1.17it/s]

80 80


  8%|▊         | 4/48 [00:03<00:37,  1.18it/s]

80 80


 10%|█         | 5/48 [00:04<00:38,  1.13it/s]

80 80


 12%|█▎        | 6/48 [00:05<00:37,  1.13it/s]

80 80


 15%|█▍        | 7/48 [00:06<00:36,  1.13it/s]

80 80


 17%|█▋        | 8/48 [00:07<00:35,  1.12it/s]

80 80


 19%|█▉        | 9/48 [00:08<00:35,  1.08it/s]

80 80


 21%|██        | 10/48 [00:09<00:35,  1.06it/s]

80 80


 23%|██▎       | 11/48 [00:09<00:33,  1.09it/s]

80 80


 25%|██▌       | 12/48 [00:10<00:32,  1.10it/s]

80 80


 27%|██▋       | 13/48 [00:11<00:30,  1.14it/s]

80 80


 29%|██▉       | 14/48 [00:12<00:29,  1.16it/s]

80 80


 31%|███▏      | 15/48 [00:13<00:28,  1.14it/s]

80 80


 33%|███▎      | 16/48 [00:14<00:28,  1.14it/s]

80 80


 35%|███▌      | 17/48 [00:15<00:26,  1.16it/s]

80 80


 38%|███▊      | 18/48 [00:15<00:25,  1.16it/s]

80 80


 40%|███▉      | 19/48 [00:16<00:25,  1.16it/s]

80 80


 42%|████▏     | 20/48 [00:17<00:24,  1.13it/s]

80 80


 44%|████▍     | 21/48 [00:18<00:23,  1.13it/s]

80 80


 46%|████▌     | 22/48 [00:19<00:22,  1.15it/s]

80 80


 48%|████▊     | 23/48 [00:20<00:22,  1.11it/s]

80 80


 50%|█████     | 24/48 [00:21<00:22,  1.08it/s]

80 80


 52%|█████▏    | 25/48 [00:22<00:23,  1.02s/it]

80 80


 54%|█████▍    | 26/48 [00:23<00:21,  1.05it/s]

80 80


 56%|█████▋    | 27/48 [00:24<00:20,  1.01it/s]

80 80


 58%|█████▊    | 28/48 [00:25<00:19,  1.02it/s]

80 80


 60%|██████    | 29/48 [00:26<00:18,  1.05it/s]

80 80


 62%|██████▎   | 30/48 [00:27<00:17,  1.05it/s]

80 80


 65%|██████▍   | 31/48 [00:28<00:15,  1.09it/s]

80 80


 67%|██████▋   | 32/48 [00:28<00:14,  1.12it/s]

80 80


 69%|██████▉   | 33/48 [00:29<00:13,  1.07it/s]

80 80


 71%|███████   | 34/48 [00:31<00:13,  1.04it/s]

80 80


 73%|███████▎  | 35/48 [00:31<00:11,  1.09it/s]

80 80


 75%|███████▌  | 36/48 [00:32<00:11,  1.09it/s]

80 80


 77%|███████▋  | 37/48 [00:33<00:10,  1.08it/s]

80 80


 79%|███████▉  | 38/48 [00:34<00:09,  1.08it/s]

80 80


 81%|████████▏ | 39/48 [00:35<00:08,  1.12it/s]

80 80


 83%|████████▎ | 40/48 [00:36<00:07,  1.09it/s]

80 80


 85%|████████▌ | 41/48 [00:37<00:06,  1.07it/s]

80 80


 88%|████████▊ | 42/48 [00:38<00:05,  1.04it/s]

80 80


 90%|████████▉ | 43/48 [00:39<00:04,  1.06it/s]

80 80


 92%|█████████▏| 44/48 [00:40<00:03,  1.10it/s]

80 80


 94%|█████████▍| 45/48 [00:41<00:02,  1.10it/s]

80 80


 96%|█████████▌| 46/48 [00:41<00:01,  1.13it/s]

80 80


 98%|█████████▊| 47/48 [00:43<00:01,  1.05s/it]

80 80


100%|██████████| 48/48 [00:44<00:00,  1.08it/s]

80 80





In [8]:
keyword_df = pd.DataFrame(np.transpose(iso_keyword_list), columns=visit_iso_list)
keyword_df['date'] = date_list # 5/4까지만 수집되고있음 (5/9기준)

In [9]:
# save
keyword_df.to_csv('trend_korea.csv')

covid-19

In [17]:
iso_keyword_list = getKeywordTrend(keyword = 'covid-19')





  0%|          | 0/48 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 1/48 [00:01<00:50,  1.07s/it][A[A[A[A

80 80






  4%|▍         | 2/48 [00:02<00:53,  1.16s/it][A[A[A[A

80 80






  6%|▋         | 3/48 [00:03<00:48,  1.08s/it][A[A[A[A

80 80






  8%|▊         | 4/48 [00:04<00:45,  1.02s/it][A[A[A[A

80 80






 10%|█         | 5/48 [00:05<00:42,  1.02it/s][A[A[A[A

80 80






 12%|█▎        | 6/48 [00:05<00:38,  1.08it/s][A[A[A[A

80 80






 15%|█▍        | 7/48 [00:06<00:39,  1.03it/s][A[A[A[A

80 80






 17%|█▋        | 8/48 [00:08<00:44,  1.11s/it][A[A[A[A

80 80






 19%|█▉        | 9/48 [00:09<00:39,  1.02s/it][A[A[A[A

80 80






 21%|██        | 10/48 [00:10<00:36,  1.05it/s][A[A[A[A

80 80






 23%|██▎       | 11/48 [00:11<00:42,  1.14s/it][A[A[A[A

80 80






 25%|██▌       | 12/48 [00:12<00:38,  1.06s/it][A[A[A[A

80 80






 27%|██▋       | 13/48 [00:13<00:34,  1.01it/s][A[A[A[A

80 80






 29%|██▉       | 14/48 [00:14<00:35,  1.04s/it][A[A[A[A

80 80






 31%|███▏      | 15/48 [00:15<00:33,  1.01s/it][A[A[A[A

80 80






 33%|███▎      | 16/48 [00:16<00:30,  1.03it/s][A[A[A[A

80 80






 35%|███▌      | 17/48 [00:17<00:29,  1.04it/s][A[A[A[A

80 80






 38%|███▊      | 18/48 [00:18<00:28,  1.04it/s][A[A[A[A

80 80






 40%|███▉      | 19/48 [00:19<00:27,  1.07it/s][A[A[A[A

80 80






 42%|████▏     | 20/48 [00:19<00:25,  1.09it/s][A[A[A[A

80 80






 44%|████▍     | 21/48 [00:20<00:24,  1.08it/s][A[A[A[A

80 80






 46%|████▌     | 22/48 [00:21<00:23,  1.12it/s][A[A[A[A

80 80






 48%|████▊     | 23/48 [00:22<00:23,  1.06it/s][A[A[A[A

80 80






 50%|█████     | 24/48 [00:23<00:22,  1.09it/s][A[A[A[A

80 80






 52%|█████▏    | 25/48 [00:24<00:22,  1.02it/s][A[A[A[A

80 80






 54%|█████▍    | 26/48 [00:25<00:20,  1.08it/s][A[A[A[A

80 80






 56%|█████▋    | 27/48 [00:26<00:18,  1.12it/s][A[A[A[A

80 80






 58%|█████▊    | 28/48 [00:27<00:17,  1.13it/s][A[A[A[A

80 80






 60%|██████    | 29/48 [00:28<00:16,  1.13it/s][A[A[A[A

80 80






 62%|██████▎   | 30/48 [00:28<00:15,  1.13it/s][A[A[A[A

80 80






 65%|██████▍   | 31/48 [00:30<00:15,  1.08it/s][A[A[A[A

80 80






 67%|██████▋   | 32/48 [00:30<00:14,  1.11it/s][A[A[A[A

80 80






 69%|██████▉   | 33/48 [00:31<00:13,  1.13it/s][A[A[A[A

80 80






 71%|███████   | 34/48 [00:32<00:12,  1.15it/s][A[A[A[A

80 80






 73%|███████▎  | 35/48 [00:33<00:11,  1.15it/s][A[A[A[A

80 80






 75%|███████▌  | 36/48 [00:34<00:10,  1.16it/s][A[A[A[A

80 80






 77%|███████▋  | 37/48 [00:35<00:09,  1.16it/s][A[A[A[A

80 80






 79%|███████▉  | 38/48 [00:35<00:08,  1.15it/s][A[A[A[A

80 80






 81%|████████▏ | 39/48 [00:36<00:07,  1.16it/s][A[A[A[A

80 80






 83%|████████▎ | 40/48 [00:37<00:07,  1.13it/s][A[A[A[A

80 80






 85%|████████▌ | 41/48 [00:38<00:06,  1.10it/s][A[A[A[A

80 80






 88%|████████▊ | 42/48 [00:39<00:05,  1.04it/s][A[A[A[A

80 80






 90%|████████▉ | 43/48 [00:40<00:04,  1.07it/s][A[A[A[A

80 80






 92%|█████████▏| 44/48 [00:41<00:03,  1.09it/s][A[A[A[A

80 80






 94%|█████████▍| 45/48 [00:42<00:02,  1.12it/s][A[A[A[A

80 80






 96%|█████████▌| 46/48 [00:43<00:01,  1.14it/s][A[A[A[A

80 80






 98%|█████████▊| 47/48 [00:44<00:00,  1.16it/s][A[A[A[A

80 80






100%|██████████| 48/48 [00:44<00:00,  1.07it/s][A[A[A[A

80 80





In [18]:
keyword_df = pd.DataFrame(np.transpose(iso_keyword_list), columns=visit_iso_list)
keyword_df['date'] = date_list 

In [19]:
# save
keyword_df.to_csv('trend_covid-19.csv')

mask

In [20]:
iso_keyword_list = getKeywordTrend(keyword = 'mask')
keyword_df = pd.DataFrame(np.transpose(iso_keyword_list), columns=visit_iso_list)
keyword_df['date'] = date_list 





  0%|          | 0/48 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 1/48 [00:01<00:52,  1.12s/it][A[A[A[A

80 80






  4%|▍         | 2/48 [00:02<00:48,  1.06s/it][A[A[A[A

80 80






  6%|▋         | 3/48 [00:03<00:46,  1.04s/it][A[A[A[A

80 80






  8%|▊         | 4/48 [00:03<00:43,  1.01it/s][A[A[A[A

80 80






 10%|█         | 5/48 [00:04<00:42,  1.01it/s][A[A[A[A

80 80






 12%|█▎        | 6/48 [00:06<00:43,  1.04s/it][A[A[A[A

80 80






 15%|█▍        | 7/48 [00:07<00:44,  1.09s/it][A[A[A[A

80 80






 17%|█▋        | 8/48 [00:08<00:41,  1.04s/it][A[A[A[A

80 80






 19%|█▉        | 9/48 [00:09<00:42,  1.09s/it][A[A[A[A

80 80






 21%|██        | 10/48 [00:10<00:45,  1.20s/it][A[A[A[A

80 80






 23%|██▎       | 11/48 [00:11<00:42,  1.15s/it][A[A[A[A

80 80






 25%|██▌       | 12/48 [00:13<00:42,  1.18s/it][A[A[A[A

80 80






 27%|██▋       | 13/48 [00:14<00:42,  1.22s/it][A[A[A[A

80 80






 29%|██▉       | 14/48 [00:15<00:38,  1.14s/it][A[A[A[A

80 80






 31%|███▏      | 15/48 [00:16<00:37,  1.12s/it][A[A[A[A

80 80






 33%|███▎      | 16/48 [00:17<00:34,  1.07s/it][A[A[A[A

80 80






 35%|███▌      | 17/48 [00:18<00:32,  1.05s/it][A[A[A[A

80 80






 38%|███▊      | 18/48 [00:19<00:34,  1.13s/it][A[A[A[A

80 80






 40%|███▉      | 19/48 [00:20<00:31,  1.09s/it][A[A[A[A

80 80






 42%|████▏     | 20/48 [00:21<00:29,  1.06s/it][A[A[A[A

80 80






 44%|████▍     | 21/48 [00:22<00:29,  1.08s/it][A[A[A[A

80 80






 46%|████▌     | 22/48 [00:24<00:30,  1.16s/it][A[A[A[A

80 80






 48%|████▊     | 23/48 [00:25<00:27,  1.10s/it][A[A[A[A

80 80






 50%|█████     | 24/48 [00:26<00:27,  1.14s/it][A[A[A[A

80 80






 52%|█████▏    | 25/48 [00:27<00:26,  1.15s/it][A[A[A[A

80 80






 54%|█████▍    | 26/48 [00:29<00:27,  1.26s/it][A[A[A[A

80 80






 56%|█████▋    | 27/48 [00:30<00:25,  1.22s/it][A[A[A[A

80 80






 58%|█████▊    | 28/48 [00:31<00:23,  1.15s/it][A[A[A[A

80 80






 60%|██████    | 29/48 [00:32<00:22,  1.16s/it][A[A[A[A

80 80






 62%|██████▎   | 30/48 [00:33<00:20,  1.15s/it][A[A[A[A

80 80






 65%|██████▍   | 31/48 [00:34<00:20,  1.18s/it][A[A[A[A

80 80






 67%|██████▋   | 32/48 [00:35<00:17,  1.12s/it][A[A[A[A

80 80






 69%|██████▉   | 33/48 [00:36<00:16,  1.10s/it][A[A[A[A

80 80






 71%|███████   | 34/48 [00:37<00:15,  1.12s/it][A[A[A[A

80 80






 73%|███████▎  | 35/48 [00:38<00:14,  1.08s/it][A[A[A[A

80 80






 75%|███████▌  | 36/48 [00:39<00:12,  1.00s/it][A[A[A[A

80 80






 77%|███████▋  | 37/48 [00:40<00:11,  1.03s/it][A[A[A[A

80 80






 79%|███████▉  | 38/48 [00:41<00:09,  1.02it/s][A[A[A[A

80 80






 81%|████████▏ | 39/48 [00:42<00:08,  1.05it/s][A[A[A[A

80 80






 83%|████████▎ | 40/48 [00:43<00:07,  1.04it/s][A[A[A[A

80 80






 85%|████████▌ | 41/48 [00:44<00:06,  1.10it/s][A[A[A[A

80 80






 88%|████████▊ | 42/48 [00:45<00:05,  1.02it/s][A[A[A[A

80 80






 90%|████████▉ | 43/48 [00:46<00:04,  1.00it/s][A[A[A[A

80 80






 92%|█████████▏| 44/48 [00:47<00:03,  1.06it/s][A[A[A[A

80 80






 94%|█████████▍| 45/48 [00:48<00:02,  1.08it/s][A[A[A[A

80 80






 96%|█████████▌| 46/48 [00:49<00:01,  1.13it/s][A[A[A[A

80 80






 98%|█████████▊| 47/48 [00:49<00:00,  1.12it/s][A[A[A[A

80 80






100%|██████████| 48/48 [00:50<00:00,  1.06s/it][A[A[A[A

80 80





In [21]:
# save
keyword_df.to_csv('trend_mask.csv')

hygiene measure

In [24]:
iso_keyword_list = getKeywordTrend(keyword = 'flu')
keyword_df = pd.DataFrame(np.transpose(iso_keyword_list), columns=visit_iso_list)
keyword_df['date'] = date_list 






  0%|          | 0/48 [00:00<?, ?it/s][A[A[A[A[A




  2%|▏         | 1/48 [00:00<00:40,  1.16it/s][A[A[A[A[A

80 80







  4%|▍         | 2/48 [00:01<00:40,  1.13it/s][A[A[A[A[A

80 80







  6%|▋         | 3/48 [00:02<00:40,  1.12it/s][A[A[A[A[A

80 80







  8%|▊         | 4/48 [00:03<00:37,  1.16it/s][A[A[A[A[A

80 80







 10%|█         | 5/48 [00:04<00:36,  1.17it/s][A[A[A[A[A

80 80







 12%|█▎        | 6/48 [00:05<00:37,  1.11it/s][A[A[A[A[A

80 80







 15%|█▍        | 7/48 [00:06<00:36,  1.13it/s][A[A[A[A[A

80 80







 17%|█▋        | 8/48 [00:07<00:34,  1.15it/s][A[A[A[A[A

80 80







 19%|█▉        | 9/48 [00:07<00:34,  1.13it/s][A[A[A[A[A

80 80







 21%|██        | 10/48 [00:08<00:32,  1.17it/s][A[A[A[A[A

80 80







 23%|██▎       | 11/48 [00:09<00:33,  1.10it/s][A[A[A[A[A

80 80







 25%|██▌       | 12/48 [00:10<00:31,  1.14it/s][A[A[A[A[A

80 80







 27%|██▋       | 13/48 [00:11<00:35,  1.00s/it][A[A[A[A[A

80 80







 29%|██▉       | 14/48 [00:12<00:32,  1.03it/s][A[A[A[A[A

80 80







 31%|███▏      | 15/48 [00:13<00:31,  1.05it/s][A[A[A[A[A

80 80







 33%|███▎      | 16/48 [00:14<00:29,  1.08it/s][A[A[A[A[A

80 80







 35%|███▌      | 17/48 [00:15<00:27,  1.11it/s][A[A[A[A[A

80 80







 38%|███▊      | 18/48 [00:16<00:28,  1.05it/s][A[A[A[A[A

80 80







 40%|███▉      | 19/48 [00:17<00:27,  1.06it/s][A[A[A[A[A

80 80







 42%|████▏     | 20/48 [00:18<00:28,  1.02s/it][A[A[A[A[A

80 80







 44%|████▍     | 21/48 [00:19<00:26,  1.01it/s][A[A[A[A[A

80 80







 46%|████▌     | 22/48 [00:20<00:26,  1.02s/it][A[A[A[A[A

80 80







 48%|████▊     | 23/48 [00:21<00:25,  1.00s/it][A[A[A[A[A

80 80







 50%|█████     | 24/48 [00:22<00:25,  1.07s/it][A[A[A[A[A

80 80







 52%|█████▏    | 25/48 [00:24<00:27,  1.19s/it][A[A[A[A[A

80 80







 54%|█████▍    | 26/48 [00:25<00:23,  1.07s/it][A[A[A[A[A

80 80







 56%|█████▋    | 27/48 [00:25<00:20,  1.00it/s][A[A[A[A[A

80 80







 58%|█████▊    | 28/48 [00:26<00:18,  1.06it/s][A[A[A[A[A

80 80







 60%|██████    | 29/48 [00:27<00:17,  1.08it/s][A[A[A[A[A

80 80







 62%|██████▎   | 30/48 [00:28<00:17,  1.04it/s][A[A[A[A[A

80 80







 65%|██████▍   | 31/48 [00:29<00:15,  1.08it/s][A[A[A[A[A

80 80







 67%|██████▋   | 32/48 [00:30<00:16,  1.00s/it][A[A[A[A[A

80 80







 69%|██████▉   | 33/48 [00:31<00:14,  1.00it/s][A[A[A[A[A

80 80







 71%|███████   | 34/48 [00:32<00:13,  1.05it/s][A[A[A[A[A

80 80







 73%|███████▎  | 35/48 [00:33<00:12,  1.06it/s][A[A[A[A[A

80 80







 75%|███████▌  | 36/48 [00:34<00:10,  1.11it/s][A[A[A[A[A

80 80







 77%|███████▋  | 37/48 [00:35<00:10,  1.05it/s][A[A[A[A[A

80 80







 79%|███████▉  | 38/48 [00:36<00:09,  1.04it/s][A[A[A[A[A

80 80







 81%|████████▏ | 39/48 [00:37<00:08,  1.08it/s][A[A[A[A[A

80 80







 83%|████████▎ | 40/48 [00:38<00:07,  1.07it/s][A[A[A[A[A

80 80







 85%|████████▌ | 41/48 [00:38<00:06,  1.12it/s][A[A[A[A[A

80 80







 88%|████████▊ | 42/48 [00:39<00:05,  1.13it/s][A[A[A[A[A

80 80







 90%|████████▉ | 43/48 [00:40<00:04,  1.15it/s][A[A[A[A[A

80 80







 92%|█████████▏| 44/48 [00:41<00:03,  1.16it/s][A[A[A[A[A

80 80







 94%|█████████▍| 45/48 [00:42<00:02,  1.03it/s][A[A[A[A[A

80 80







 96%|█████████▌| 46/48 [00:43<00:02,  1.01s/it][A[A[A[A[A

80 80







 98%|█████████▊| 47/48 [00:44<00:01,  1.03s/it][A[A[A[A[A

80 80







100%|██████████| 48/48 [00:45<00:00,  1.05it/s][A[A[A[A[A

80 80





In [25]:
# save
keyword_df.to_csv('trend_flu.csv')

COVID

In [26]:
iso_keyword_list = getKeywordTrend(keyword = 'covid')
keyword_df = pd.DataFrame(np.transpose(iso_keyword_list), columns=visit_iso_list)
keyword_df['date'] = date_list 






  0%|          | 0/48 [00:00<?, ?it/s][A[A[A[A[A




  2%|▏         | 1/48 [00:00<00:44,  1.04it/s][A[A[A[A[A

80 80







  4%|▍         | 2/48 [00:01<00:43,  1.07it/s][A[A[A[A[A

80 80







  6%|▋         | 3/48 [00:02<00:42,  1.05it/s][A[A[A[A[A

80 80







  8%|▊         | 4/48 [00:03<00:42,  1.04it/s][A[A[A[A[A

80 80







 10%|█         | 5/48 [00:04<00:41,  1.04it/s][A[A[A[A[A

80 80







 12%|█▎        | 6/48 [00:05<00:41,  1.00it/s][A[A[A[A[A

80 80







 15%|█▍        | 7/48 [00:06<00:41,  1.02s/it][A[A[A[A[A

80 80







 17%|█▋        | 8/48 [00:07<00:38,  1.04it/s][A[A[A[A[A

80 80







 19%|█▉        | 9/48 [00:09<00:40,  1.05s/it][A[A[A[A[A

80 80







 21%|██        | 10/48 [00:09<00:39,  1.03s/it][A[A[A[A[A

80 80







 23%|██▎       | 11/48 [00:11<00:40,  1.09s/it][A[A[A[A[A

80 80







 25%|██▌       | 12/48 [00:12<00:38,  1.06s/it][A[A[A[A[A

80 80







 27%|██▋       | 13/48 [00:13<00:34,  1.01it/s][A[A[A[A[A

80 80







 29%|██▉       | 14/48 [00:14<00:34,  1.00s/it][A[A[A[A[A

80 80







 31%|███▏      | 15/48 [00:14<00:31,  1.04it/s][A[A[A[A[A

80 80







 33%|███▎      | 16/48 [00:15<00:31,  1.01it/s][A[A[A[A[A

80 80







 35%|███▌      | 17/48 [00:16<00:29,  1.04it/s][A[A[A[A[A

80 80







 38%|███▊      | 18/48 [00:17<00:28,  1.06it/s][A[A[A[A[A

80 80







 40%|███▉      | 19/48 [00:18<00:27,  1.06it/s][A[A[A[A[A

80 80







 42%|████▏     | 20/48 [00:19<00:26,  1.05it/s][A[A[A[A[A

80 80







 44%|████▍     | 21/48 [00:20<00:25,  1.07it/s][A[A[A[A[A

80 80







 46%|████▌     | 22/48 [00:21<00:23,  1.09it/s][A[A[A[A[A

80 80







 48%|████▊     | 23/48 [00:22<00:24,  1.03it/s][A[A[A[A[A

80 80







 50%|█████     | 24/48 [00:23<00:22,  1.07it/s][A[A[A[A[A

80 80







 52%|█████▏    | 25/48 [00:24<00:21,  1.06it/s][A[A[A[A[A

80 80







 54%|█████▍    | 26/48 [00:25<00:20,  1.09it/s][A[A[A[A[A

80 80







 56%|█████▋    | 27/48 [00:26<00:19,  1.10it/s][A[A[A[A[A

80 80







 58%|█████▊    | 28/48 [00:27<00:19,  1.00it/s][A[A[A[A[A

80 80







 60%|██████    | 29/48 [00:28<00:17,  1.06it/s][A[A[A[A[A

80 80







 62%|██████▎   | 30/48 [00:28<00:16,  1.11it/s][A[A[A[A[A

80 80







 65%|██████▍   | 31/48 [00:30<00:16,  1.03it/s][A[A[A[A[A

80 80







 67%|██████▋   | 32/48 [00:31<00:16,  1.02s/it][A[A[A[A[A

80 80







 69%|██████▉   | 33/48 [00:32<00:15,  1.06s/it][A[A[A[A[A

80 80







 71%|███████   | 34/48 [00:38<00:37,  2.67s/it][A[A[A[A[A

80 80







 73%|███████▎  | 35/48 [00:39<00:28,  2.22s/it][A[A[A[A[A

80 80







 75%|███████▌  | 36/48 [00:40<00:22,  1.86s/it][A[A[A[A[A

80 80







 77%|███████▋  | 37/48 [00:42<00:19,  1.73s/it][A[A[A[A[A

80 80







 79%|███████▉  | 38/48 [00:43<00:16,  1.62s/it][A[A[A[A[A

80 80







 81%|████████▏ | 39/48 [00:44<00:12,  1.42s/it][A[A[A[A[A

80 80







 83%|████████▎ | 40/48 [00:45<00:09,  1.25s/it][A[A[A[A[A

80 80







 85%|████████▌ | 41/48 [00:46<00:08,  1.25s/it][A[A[A[A[A

80 80







 88%|████████▊ | 42/48 [00:47<00:07,  1.18s/it][A[A[A[A[A

80 80







 90%|████████▉ | 43/48 [00:48<00:05,  1.15s/it][A[A[A[A[A

80 80







 92%|█████████▏| 44/48 [00:50<00:04,  1.19s/it][A[A[A[A[A

80 80







 94%|█████████▍| 45/48 [00:51<00:03,  1.12s/it][A[A[A[A[A

80 80







 96%|█████████▌| 46/48 [00:52<00:02,  1.11s/it][A[A[A[A[A

80 80







 98%|█████████▊| 47/48 [00:53<00:01,  1.09s/it][A[A[A[A[A

80 80







100%|██████████| 48/48 [00:54<00:00,  1.13s/it][A[A[A[A[A

80 80





In [27]:
# save
keyword_df.to_csv('trend_covid.csv')

fever

In [31]:
KEYWORD = 'covid test'#'fever'

In [32]:
iso_keyword_list = getKeywordTrend(keyword = KEYWORD)
keyword_df = pd.DataFrame(np.transpose(iso_keyword_list), columns=visit_iso_list)
keyword_df['date'] = date_list 






  0%|          | 0/48 [00:00<?, ?it/s][A[A[A[A[A




  2%|▏         | 1/48 [00:00<00:39,  1.20it/s][A[A[A[A[A

80 80







  4%|▍         | 2/48 [00:01<00:40,  1.14it/s][A[A[A[A[A

80 80







  6%|▋         | 3/48 [00:02<00:39,  1.15it/s][A[A[A[A[A

80 80







  8%|▊         | 4/48 [00:03<00:41,  1.06it/s][A[A[A[A[A

80 80







 10%|█         | 5/48 [00:04<00:40,  1.05it/s][A[A[A[A[A

80 80







 12%|█▎        | 6/48 [00:05<00:38,  1.09it/s][A[A[A[A[A

80 80







 15%|█▍        | 7/48 [00:06<00:37,  1.10it/s][A[A[A[A[A

80 80







 17%|█▋        | 8/48 [00:07<00:35,  1.12it/s][A[A[A[A[A

80 80







 19%|█▉        | 9/48 [00:08<00:36,  1.07it/s][A[A[A[A[A

80 80







 21%|██        | 10/48 [00:09<00:40,  1.06s/it][A[A[A[A[A

80 80







 23%|██▎       | 11/48 [00:11<00:42,  1.15s/it][A[A[A[A[A

80 80







 25%|██▌       | 12/48 [00:11<00:38,  1.08s/it][A[A[A[A[A

80 80







 27%|██▋       | 13/48 [00:12<00:35,  1.01s/it][A[A[A[A[A

80 80







 29%|██▉       | 14/48 [00:14<00:36,  1.06s/it][A[A[A[A[A

80 80







 31%|███▏      | 15/48 [00:15<00:35,  1.07s/it][A[A[A[A[A

80 80







 33%|███▎      | 16/48 [00:16<00:34,  1.07s/it][A[A[A[A[A

80 80







 35%|███▌      | 17/48 [00:17<00:33,  1.09s/it][A[A[A[A[A

80 80







 38%|███▊      | 18/48 [00:18<00:31,  1.05s/it][A[A[A[A[A

80 80







 40%|███▉      | 19/48 [00:19<00:30,  1.04s/it][A[A[A[A[A

80 80







 42%|████▏     | 20/48 [00:20<00:27,  1.03it/s][A[A[A[A[A

80 80







 44%|████▍     | 21/48 [00:21<00:27,  1.02s/it][A[A[A[A[A

80 80







 46%|████▌     | 22/48 [00:22<00:24,  1.04it/s][A[A[A[A[A

80 80







 48%|████▊     | 23/48 [00:23<00:28,  1.12s/it][A[A[A[A[A

80 80







 50%|█████     | 24/48 [00:24<00:26,  1.12s/it][A[A[A[A[A

80 80







 52%|█████▏    | 25/48 [00:25<00:25,  1.12s/it][A[A[A[A[A

80 80







 54%|█████▍    | 26/48 [00:27<00:28,  1.28s/it][A[A[A[A[A

80 80







 56%|█████▋    | 27/48 [00:28<00:25,  1.21s/it][A[A[A[A[A

80 80







 58%|█████▊    | 28/48 [00:29<00:24,  1.22s/it][A[A[A[A[A

80 80







 60%|██████    | 29/48 [00:30<00:21,  1.12s/it][A[A[A[A[A

80 80







 62%|██████▎   | 30/48 [00:31<00:18,  1.02s/it][A[A[A[A[A

80 80







 65%|██████▍   | 31/48 [00:32<00:16,  1.01it/s][A[A[A[A[A

80 80







 67%|██████▋   | 32/48 [00:33<00:14,  1.07it/s][A[A[A[A[A

80 80







 69%|██████▉   | 33/48 [00:34<00:13,  1.07it/s][A[A[A[A[A

80 80







 71%|███████   | 34/48 [00:34<00:13,  1.07it/s][A[A[A[A[A

80 80







 73%|███████▎  | 35/48 [00:35<00:12,  1.08it/s][A[A[A[A[A

80 80







 75%|███████▌  | 36/48 [00:36<00:11,  1.07it/s][A[A[A[A[A

80 80







 77%|███████▋  | 37/48 [00:37<00:10,  1.04it/s][A[A[A[A[A

80 80







 79%|███████▉  | 38/48 [00:38<00:09,  1.07it/s][A[A[A[A[A

80 80







 81%|████████▏ | 39/48 [00:39<00:08,  1.02it/s][A[A[A[A[A

80 80







 83%|████████▎ | 40/48 [00:40<00:07,  1.07it/s][A[A[A[A[A

80 80







 85%|████████▌ | 41/48 [00:41<00:06,  1.05it/s][A[A[A[A[A

80 80







 88%|████████▊ | 42/48 [00:42<00:05,  1.03it/s][A[A[A[A[A

80 80







 90%|████████▉ | 43/48 [00:43<00:05,  1.05s/it][A[A[A[A[A

80 80







 92%|█████████▏| 44/48 [00:44<00:03,  1.02it/s][A[A[A[A[A

80 80







 94%|█████████▍| 45/48 [00:45<00:02,  1.05it/s][A[A[A[A[A

80 80







 96%|█████████▌| 46/48 [00:46<00:02,  1.03s/it][A[A[A[A[A

80 80







 98%|█████████▊| 47/48 [00:47<00:01,  1.01s/it][A[A[A[A[A

80 80







100%|██████████| 48/48 [00:48<00:00,  1.02s/it][A[A[A[A[A

80 80





In [33]:
# save
keyword_df.to_csv('trend_'+KEYWORD+'.csv')

---

### (3) 항공편 데이터
- 항공정보포탈시스템 http://www.airportal.co.kr/life/airinfo/RbHanFrmMain.jsp

In [105]:
import pandas as pd
from collections import Counter

In [34]:
ci = pd.read_csv('country_info.csv')  # country info
visit_iso_list = list(ci[ci.visit == 1].iso)       # 해외확진자가 방문했던 국가 iso 리스트
visit_country_list = list(ci[ci.visit==1].Country) # 해외확진자가 방문했던 국가명 리스트

air = pd.read_csv('airport_info.csv') # airport info
icn = pd.read_csv('ICN_arrive.csv')   # flight plans (3/22~5/19)

In [35]:
country_name_list = []
for iata in icn.IATA:
    try:
        iso = air[air.iata_code == iata].iso_country.iloc[0]
        country_name = ci[ci.iso==iso].Country.iloc[0]
        country_name_list.append(country_name)        
    except:
        country_name_list.append('empty')

In [36]:
icn['Country'] = country_name_list
icn_preprocess = icn[icn.Country != 'empty']
icn_preprocess = icn_preprocess[icn_preprocess.arrive != '취소']

In [37]:
icn_preprocess

Unnamed: 0,date,airline,airplane,IATA,arrive,Country
0,20200322,대한항공,KE624,MNL,도착,Philippines
1,20200322,대한항공,KE012,LAX,도착,US
2,20200322,대한항공,KE026,SFO,도착,US
3,20200322,아시아나항공,OZ704,MNL,도착,Philippines
4,20200322,대한항공,KE632,CEB,도착,Philippines
...,...,...,...,...,...,...
2518,20200519,제주항공,7C1103,NRT,계획,Japan
2520,20200519,비엣제트 항공,VJ874,DAD,계획,Vietnam
2521,20200519,싱가폴항공,SQ602,SIN,계획,Singapore
2524,20200519,아시아나항공,OZ203,LAX,계획,US


In [151]:
date_range = pd.date_range(start='20200322', end='20200519')
date_list = date_range.strftime("%Y%m%d").tolist()

country_flight = {country:[] for country in visit_country_list}

for date in date_list:
    counter = Counter(icn_preprocess[icn_preprocess.date == int(date)].Country)
    for country in country_flight:
        country_flight[country].append(counter[country])

In [167]:
def transform(date):
    y=int(date[2:4])
    m=int(date[4:6])
    d=int(date[6:])
    return str(m)+'/'+str(d)+'/'+str(y)

In [174]:
icn_result = pd.DataFrame(country_flight, index=[transform(date) for date in date_list]).transpose()
icn_result.to_csv('ICN_arrive_preprocess.csv')

In [176]:
icn_result.head()

Unnamed: 0,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,...,5/10/20,5/11/20,5/12/20,5/13/20,5/14/20,5/15/20,5/16/20,5/17/20,5/18/20,5/19/20
Argentina,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Australia,0,0,0,0,0,0,0,1,1,1,...,0,0,0,1,0,0,0,0,0,0
Austria,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bolivia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Brazil,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
