In [None]:
# 자료 출저 : EU open data
# https://data.europa.eu/euodp/en/data/dataset/covid-19-coronavirus-data/resource/55e8f966-d5c8-438e-85bc-c7a5a26f4863

In [1]:
import pandas as pd

covid = pd.read_csv('data/COVID19.csv', encoding = 'CP949')
covid.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
0,2020-04-23,23,4,2020,84,4,Afghanistan,AF,AFG,37172386.0,Asia
1,2020-04-22,22,4,2020,61,1,Afghanistan,AF,AFG,37172386.0,Asia
2,2020-04-21,21,4,2020,35,2,Afghanistan,AF,AFG,37172386.0,Asia
3,2020-04-20,20,4,2020,88,3,Afghanistan,AF,AFG,37172386.0,Asia
4,2020-04-19,19,4,2020,63,0,Afghanistan,AF,AFG,37172386.0,Asia


### 데이터 변수 설명
- dateRep : 날자
- day
- month
- year
- cases : 확진자 수
- deaths : 사망자 수
- countriesAndTerritories : 국가
- geold : 지오이드
- countryterritoryCode : 국가코드
- popData2018 : 2018년 인구데이터
- continentExp : 대륙 구분


- 국가의 Cases_on_an_international_conveyance_Japan : 일본 유람선인듯 ( geoid = JPG11668 )

## data check

In [2]:
covid.dtypes

dateRep                     object
day                          int64
month                        int64
year                         int64
cases                        int64
deaths                       int64
countriesAndTerritories     object
geoId                       object
countryterritoryCode        object
popData2018                float64
continentExp                object
dtype: object

In [3]:
covid.isnull().sum()

dateRep                      0
day                          0
month                        0
year                         0
cases                        0
deaths                       0
countriesAndTerritories      0
geoId                       40
countryterritoryCode       134
popData2018                103
continentExp                 0
dtype: int64

In [19]:
covid.shape

(12596, 11)

## 데이터 결측값 처리

In [4]:
# geoId가 null 값인 데이터 확인 후 해당 하는 국가의 geoID, countryterritoryCode 결측치 입력
# Namibia의 geoId는 NA
# 원본 확인 결과 Namibia의 geoId는 입력되어 있으나 NA를 NaN값으로 인식하는 듯

covid[covid['geoId'].isnull()].head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
8031,2020-04-23,23,4,2020,0,0,Namibia,,NAM,2448255.0,Africa
8032,2020-04-22,22,4,2020,0,0,Namibia,,NAM,2448255.0,Africa
8033,2020-04-21,21,4,2020,0,0,Namibia,,NAM,2448255.0,Africa
8034,2020-04-20,20,4,2020,0,0,Namibia,,NAM,2448255.0,Africa
8035,2020-04-19,19,4,2020,0,0,Namibia,,NAM,2448255.0,Africa


In [5]:
covid.loc[covid["countriesAndTerritories"] == "Namibia", "geoId"] = "NA"

In [6]:
display(covid[covid["countriesAndTerritories"] == "Namibia"].head(2))

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
8031,2020-04-23,23,4,2020,0,0,Namibia,,NAM,2448255.0,Africa
8032,2020-04-22,22,4,2020,0,0,Namibia,,NAM,2448255.0,Africa


In [7]:
covid[covid['countryterritoryCode'].isnull()].head()

# Anguilla의 countryterritoryCode는 AIA, 인구는 16000
# Bonaire, Saint Eustatius and Saba의 countryterritoryCode는 BES, 인구는 25160
# Falkland_Islands_(Malvinas)의 countryterritoryCode는 FLK, 인구는 2840
# countryterritoryCode의 출저는 ISO(국제표준화기구), 인구는 구글 검색 참고

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
335,2020-04-23,23,4,2020,0,0,Anguilla,AI,,,America
336,2020-04-22,22,4,2020,0,0,Anguilla,AI,,,America
337,2020-04-21,21,4,2020,0,0,Anguilla,AI,,,America
338,2020-04-20,20,4,2020,0,0,Anguilla,AI,,,America
339,2020-04-19,19,4,2020,0,0,Anguilla,AI,,,America


In [8]:
covid.loc[covid["countriesAndTerritories"] == "Anguilla", "countryterritoryCode"] = "AIA"
covid.loc[covid["countriesAndTerritories"] == "Anguilla", "popData2018"] = 16000

covid.loc[covid["countriesAndTerritories"] == "Bonaire, Saint Eustatius and Saba", "countryterritoryCode"] = "BES"
covid.loc[covid["countriesAndTerritories"] == "Bonaire, Saint Eustatius and Saba", "popData2018"] = 25160

covid.loc[covid["countriesAndTerritories"] == "Falkland_Islands_(Malvinas)", "countryterritoryCode"] = "FLK"
covid.loc[covid["countriesAndTerritories"] == "Falkland_Islands_(Malvinas)", "popData2018"] = 2840

In [9]:
display(covid[covid["countriesAndTerritories"] == "Anguilla"].head(2))
display(covid[covid["countriesAndTerritories"] == "Bonaire, Saint Eustatius and Saba"].head(2))
display(covid[covid["countriesAndTerritories"] == "Falkland_Islands_(Malvinas)"].head(2))

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
335,2020-04-23,23,4,2020,0,0,Anguilla,AI,AIA,16000.0,America
336,2020-04-22,22,4,2020,0,0,Anguilla,AI,AIA,16000.0,America


Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
1559,2020-04-23,23,4,2020,0,0,"Bonaire, Saint Eustatius and Saba",BQ,BES,25160.0,America
1560,2020-04-22,22,4,2020,1,0,"Bonaire, Saint Eustatius and Saba",BQ,BES,25160.0,America


Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
3942,2020-04-23,23,4,2020,0,0,Falkland_Islands_(Malvinas),FK,FLK,2840.0,America
3943,2020-04-22,22,4,2020,1,0,Falkland_Islands_(Malvinas),FK,FLK,2840.0,America


In [10]:
covid.isnull().sum()

dateRep                     0
day                         0
month                       0
year                        0
cases                       0
deaths                      0
countriesAndTerritories     0
geoId                       0
countryterritoryCode       64
popData2018                33
continentExp                0
dtype: int64

In [11]:
covid[covid['popData2018'].isnull()].head(2)

# popData가 빠진 Eritrea의 인구데이터는 3214000

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
3718,2020-04-23,23,4,2020,0,0,Eritrea,ER,ERI,,Africa
3719,2020-04-22,22,4,2020,0,0,Eritrea,ER,ERI,,Africa


In [12]:
covid.loc[covid["countriesAndTerritories"] == "Eritrea", "popData2018"] = 3214000

covid[covid["countriesAndTerritories"] == "Eritrea"].head(2)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
3718,2020-04-23,23,4,2020,0,0,Eritrea,ER,ERI,3214000.0,Africa
3719,2020-04-22,22,4,2020,0,0,Eritrea,ER,ERI,3214000.0,Africa


In [13]:
covid[covid['countryterritoryCode'].isnull()].head(2)

# Cases_on_an_international_conveyance_Japan의 국가코드는 임의로 JPG라고 지정

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
2240,2020-03-10,10,3,2020,-9,1,Cases_on_an_international_conveyance_Japan,JPG11668,,3000.0,Other
2241,2020-03-02,2,3,2020,0,0,Cases_on_an_international_conveyance_Japan,JPG11668,,3000.0,Other


In [14]:
covid.loc[covid["countriesAndTerritories"] == "Cases_on_an_international_conveyance_Japan", "countryterritoryCode"] = "JPG"

covid[covid["countriesAndTerritories"] == "Cases_on_an_international_conveyance_Japan"].head(2)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
2240,2020-03-10,10,3,2020,-9,1,Cases_on_an_international_conveyance_Japan,JPG11668,JPG,3000.0,Other
2241,2020-03-02,2,3,2020,0,0,Cases_on_an_international_conveyance_Japan,JPG11668,JPG,3000.0,Other


In [15]:
covid.isnull().sum()

dateRep                    0
day                        0
month                      0
year                       0
cases                      0
deaths                     0
countriesAndTerritories    0
geoId                      0
countryterritoryCode       0
popData2018                0
continentExp               0
dtype: int64

In [16]:
covid["continentExp"].value_counts()

Europe     4388
Asia       3417
America    2220
Africa     2087
Oceania     420
Other        64
Name: continentExp, dtype: int64

In [20]:
#!pip install plotly

Collecting plotly
  Downloading plotly-4.6.0-py2.py3-none-any.whl (7.1 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11435 sha256=7d5999a8e0668bc0bad1abb94f1c618980581b29b6be1e73b1110e5abcf3029f
  Stored in directory: c:\users\tj\appdata\local\pip\cache\wheels\f9\8d\8d\f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.6.0 retrying-1.3.3


# 데이터 시각화(Data Visualization)

In [21]:
import plotly
import plotly.express as px

### 각 나라별 dateRep에 따른 cases 수치

In [None]:
Country_Cases = px.bar(covid, x="dateRep", y="cases",
             animation_frame="countriesAndTerritories", title = "나라별 날짜의 경과에 따른 확진자 수",)
Country_Cases.show()

In [None]:
plotly.offline.plot(Country_Cases, filename = "Country_Cases_bar.html", auto_play = False)

### 각 나라별 dateRep에 따른 deaths 수치

In [None]:
Country_Deaths = px.bar(covid, x="dateRep", y="deaths",
             animation_frame="countriesAndTerritories", title = "나라별 날짜의 경과에 따른 사망자 수")
Country_Deaths.show()

In [None]:
plotly.offline.plot(Country_Deaths, filename = "Country_Deaths_bar.html", auto_play = False)

## dateRep에 따른 전세계의 cases, deaths

In [None]:
import plotly.graph_objs as go

In [None]:
df1 = covid.groupby(by='dateRep').sum().reset_index()
df1

In [None]:
B_1 = go.Bar(x = df1["dateRep"], y = df1["cases"])
B_2 = go.Bar(x = df1["dateRep"], y = df1["deaths"])

data = [B_1, B_2]
layout = go.Layout(title = "날짜의 경과에 따른 전세계의 추이", barmode = "stack")
Global_covid = go.Figure(data=data, layout=layout)
Global_covid.show()

In [None]:
plotly.offline.plot(Global_covid, filename = "Global_covid.html", auto_play = False)

In [71]:
df = covid.sort_values(by = ["dateRep", "continentExp", "countriesAndTerritories"])
df.head(10)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,continentExp
260,2019-12-31,31,12,2019,0,0,Algeria,DZ,DZA,42228429.0,Africa
3641,2019-12-31,31,12,2019,0,0,Egypt,EG,EGY,98423595.0,Africa
8601,2019-12-31,31,12,2019,0,0,Nigeria,NG,NGA,195874740.0,Africa
1760,2019-12-31,31,12,2019,0,0,Brazil,BR,BRA,209469333.0,America
2205,2019-12-31,31,12,2019,0,0,Canada,CA,CAN,37058856.0,America
3420,2019-12-31,31,12,2019,0,0,Dominican_Republic,DO,DOM,10627165.0,America
3530,2019-12-31,31,12,2019,0,0,Ecuador,EC,ECU,17084357.0,America
7664,2019-12-31,31,12,2019,0,0,Mexico,MX,MEX,126190788.0,America
12251,2019-12-31,31,12,2019,0,0,United_States_of_America,US,USA,327167434.0,America
104,2019-12-31,31,12,2019,0,0,Afghanistan,AF,AFG,37172386.0,Asia


In [None]:
# df["con_sum"] = 

In [70]:
Continent_Cases = px.bar(df, x="continentExp", y="cases", color = "continentExp",
             animation_frame="dateRep", title = "대륙별 확진자 수")
Continent_Cases.show()