## 결측치 확인

In [1]:
import pandas as pd
covidtotals = pd.read_csv("data/covidtotalswithmissings.csv")

In [2]:
covidtotals.head()

Unnamed: 0,iso_code,lastdate,location,total_cases,total_deaths,total_cases_pm,total_deaths_pm,population,pop_density,median_age,gdp_per_capita,hosp_beds
0,AFG,2020-06-01,Afghanistan,15205,257,390.589,6.602,38928341.0,54.422,18.6,1803.987,0.5
1,ALB,2020-06-01,Albania,1137,33,395.093,11.467,2877800.0,104.871,38.0,11803.431,2.89
2,DZA,2020-06-01,Algeria,9394,653,214.225,14.891,43851043.0,17.348,29.1,13913.839,1.9
3,AND,2020-06-01,Andorra,764,51,9888.048,660.066,77265.0,163.755,,,
4,AGO,2020-06-01,Angola,86,4,2.617,0.122,32866268.0,23.89,16.8,5819.495,


- Missing Values 확인

In [3]:
covidtotals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   iso_code         210 non-null    object 
 1   lastdate         210 non-null    object 
 2   location         210 non-null    object 
 3   total_cases      210 non-null    int64  
 4   total_deaths     210 non-null    int64  
 5   total_cases_pm   209 non-null    float64
 6   total_deaths_pm  209 non-null    float64
 7   population       210 non-null    float64
 8   pop_density      198 non-null    float64
 9   median_age       186 non-null    float64
 10  gdp_per_capita   182 non-null    float64
 11  hosp_beds        164 non-null    float64
dtypes: float64(7), int64(2), object(3)
memory usage: 19.8+ KB


- 리스트 작성
    - 인구통계 관련 column
    - Covid 관련 column

In [4]:
case_vars = ["location", "total_cases", "total_deaths", "total_cases_pm", "total_deaths_pm"]
demo_vars = ["population", "pop_density", "median_age", "gdp_per_capita", "hosp_beds"]

In [5]:
covidtotals[demo_vars].isnull().sum(axis = 0) # column별로 결측치 측정

population         0
pop_density       12
median_age        24
gdp_per_capita    28
hosp_beds         46
dtype: int64

In [6]:
covidtotals[case_vars].isnull().sum(axis = 0)

location           0
total_cases        0
total_deaths       0
total_cases_pm     1
total_deaths_pm    1
dtype: int64

- 행 방향으로 발생한 결측치 확인

In [7]:
demovars_misscnt = covidtotals[demo_vars].isnull().sum(axis = 1)
demovars_misscnt

0      0
1      0
2      0
3      3
4      1
      ..
205    0
206    3
207    0
208    0
209    0
Length: 210, dtype: int64

In [8]:
demovars_misscnt.value_counts()

0    156
1     24
2     12
3     10
4      8
dtype: int64

In [9]:
covidtotals[case_vars].isnull().sum(axis = 1).value_counts()

0    209
2      1
dtype: int64

- 인구통계 데이터가 3가지 이상 누락된 국가를 나열

In [10]:
covidtotals.loc[demovars_misscnt >= 3, ["location"] + demo_vars].head(5)

Unnamed: 0,location,population,pop_density,median_age,gdp_per_capita,hosp_beds
3,Andorra,77265.0,163.755,,,
5,Anguilla,15002.0,,,,
24,Bonaire Sint Eustatius and Saba,26221.0,,,,
28,British Virgin Islands,30237.0,207.973,,,
64,Faeroe Islands,48865.0,35.308,,,


In [11]:
covidtotals.loc[demovars_misscnt >= 3, ["location"] + demo_vars].T # T: 행과 열을 바꿔줌

Unnamed: 0,3,5,24,28,64,65,75,77,81,95,100,104,130,142,185,194,203,206
location,Andorra,Anguilla,Bonaire Sint Eustatius and Saba,British Virgin Islands,Faeroe Islands,Falkland Islands,Gibraltar,Greenland,Guernsey,Isle of Man,Jersey,Kosovo,Montserrat,Northern Mariana Islands,Taiwan,Turks and Caicos Islands,Vatican,Western Sahara
population,77265.0,15002.0,26221.0,30237.0,48865.0,3483.0,33691.0,56772.0,67052.0,85032.0,101073.0,1932774.0,4999.0,57557.0,23816775.0,38718.0,809.0,597330.0
pop_density,163.755,,,207.973,35.308,,3457.1,0.137,,147.872,,,,119.878,,37.312,,
median_age,,,,,,,,,,,,,,,42.2,,,28.4
gdp_per_capita,,,,,,,,,,,,,,,,,,
hosp_beds,,,,,,,,,,,,,,,,,,


- case 관련 누락국가 확인

In [12]:
casevars_misscnt = covidtotals[case_vars].isnull().sum(axis = 1)
casevars_misscnt.value_counts()

0    209
2      1
dtype: int64

In [13]:
covidtotals.loc[casevars_misscnt >= 1, ["location"] + case_vars]

Unnamed: 0,location,location.1,total_cases,total_deaths,total_cases_pm,total_deaths_pm
87,Hong Kong,Hong Kong,0,0,,


In [14]:
covidtotals[covidtotals["location"] == "Hong Kong"]

Unnamed: 0,iso_code,lastdate,location,total_cases,total_deaths,total_cases_pm,total_deaths_pm,population,pop_density,median_age,gdp_per_capita,hosp_beds
87,HKG,2020-05-26,Hong Kong,0,0,,,7496988.0,7039.714,44.8,56054.92,


In [15]:
temp = covidtotals.copy()
temp

Unnamed: 0,iso_code,lastdate,location,total_cases,total_deaths,total_cases_pm,total_deaths_pm,population,pop_density,median_age,gdp_per_capita,hosp_beds
0,AFG,2020-06-01,Afghanistan,15205,257,390.589,6.602,38928341.0,54.422,18.6,1803.987,0.50
1,ALB,2020-06-01,Albania,1137,33,395.093,11.467,2877800.0,104.871,38.0,11803.431,2.89
2,DZA,2020-06-01,Algeria,9394,653,214.225,14.891,43851043.0,17.348,29.1,13913.839,1.90
3,AND,2020-06-01,Andorra,764,51,9888.048,660.066,77265.0,163.755,,,
4,AGO,2020-06-01,Angola,86,4,2.617,0.122,32866268.0,23.890,16.8,5819.495,
...,...,...,...,...,...,...,...,...,...,...,...,...
205,VNM,2020-06-01,Vietnam,328,0,3.370,0.000,97338583.0,308.127,32.6,6171.884,2.60
206,ESH,2020-06-01,Western Sahara,23,1,38.505,1.674,597330.0,,28.4,,
207,YEM,2020-06-01,Yemen,323,80,10.829,2.682,29825968.0,53.508,20.3,1479.147,0.70
208,ZMB,2020-06-01,Zambia,1057,7,57.496,0.381,18383956.0,22.995,17.7,3689.251,2.00


In [16]:
temp.total_cases_pm.fillna(temp.total_cases / (temp.population/10000000), inplace = True)
temp.total_deaths_pm.fillna(temp.total_cases / (temp.population/10000000), inplace = True)

In [17]:
temp[case_vars].isnull().sum(axis = 0)

location           0
total_cases        0
total_deaths       0
total_cases_pm     0
total_deaths_pm    0
dtype: int64

## 이상치 판별