## Pandas라이브러리를 이용한 Data 처리법

### [ 1. Series로 feature를 보다 디테일하게 ]

In [2]:
import pandas as pd
# Folder (relative path) holding the CSSE daily-report CSV files, and the report to open.
path = 'COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/'
csv_day = '04-01-2020.csv'
# Folder and file name are joined into one path; the file is decoded as 'utf-8-sig'.
doc = pd.read_csv(f'{path}{csv_day}', encoding='utf-8-sig')

In [3]:
# Preview the first rows of the loaded report (head() returns 5 rows by default).
doc.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


#### (1) Dataframe에서 Series 추출하기

In [5]:
# Selecting a single column label with [] yields that column as a Series.
countries = doc['Country_Region']
countries.head() # without head() the whole Series would be printed; head() shows just a sample

0    US
1    US
2    US
3    US
4    US
Name: Country_Region, dtype: object

#### (2) Series로 feature 보다 상세하게 탐색하기
- size           : series의 size 반환
- count()        : 데이터가 없는 경우를 뺀 사이즈 반환
- unique()       : 유일한 값만 반환
- value_counts() : 데이터가 없는 경우를 제외하고, 각 값의 개수를 반환

In [8]:
# size counts every entry; count() counts only the non-missing ones.
# Equal numbers therefore mean the Series has no missing values.
total_entries = countries.size
non_missing_entries = countries.count()
print(total_entries, non_missing_entries)

2483 2483


In [11]:
# Compute the distinct values once and reuse the result for both the listing
# and its length, instead of calling unique() twice.
unique_countries = countries.unique()
print(unique_countries, ', length :', len(unique_countries))

['US' 'Canada' 'United Kingdom' 'China' 'Netherlands' 'Australia'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma' 'Burundi' 'Cabo Verde'
 'Cambodia' 'Cameroon' 'Central African Republic' 'Chad' 'Chile'
 'Colombia' 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini'
 'Ethiopia' 'Fiji' 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana'
 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti'
 'Holy See' 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran'
 'Iraq' 'Ireland' 'Israel' 'It

In [12]:
# Number of rows per distinct country value, most frequent first.
countries.value_counts()

US                2228
China               33
Canada              15
United Kingdom      10
France              10
                  ... 
Mauritius            1
Venezuela            1
Ecuador              1
Panama               1
Namibia              1
Name: Country_Region, Length: 180, dtype: int64

### [ 2.  Dataframe에서 필요 Column만 선택하기 ]

In [20]:
# Show the available column labels before picking a subset.
print(doc.columns)

# Indexing with a list of labels returns a separate DataFrame holding only those columns.
stat_columns = ['Confirmed', 'Deaths', 'Recovered']
covid_stat = doc[stat_columns]
covid_stat.head()

Index(['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update',
       'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active',
       'Combined_Key'],
      dtype='object')


Unnamed: 0,Confirmed,Deaths,Recovered
0,4,0,0
1,47,1,0
2,7,0,0
3,195,3,0
4,1,0,0


### [ 3. 특정 조건에 맞는 row 검색하기 ]

In [23]:
csv_day = '04-01-2020.csv'
doc = pd.read_csv(f'{path}{csv_day}', encoding='utf-8-sig')
# Boolean mask: True for each row whose Country_Region equals 'US'.
is_us = doc['Country_Region'] == 'US'
# Indexing with the mask keeps only the rows where it is True.
doc_us = doc[is_us]
doc_us.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


### [ 4. 없는 데이터(NaN) 처리하기 ]

#### (1) 없는 데이터(결측치) 확인하기
- isnull() : 없는 데이터가 있는지 확인 (반환값은 bool)
- sum() : isnull은 단지 T/F값만 반환하므로, 결측치의 총 개수를 확인하기 위해 사용
- 따라서, isnull().sum()을 통상적으로 사용

In [26]:
# isnull() flags each cell as True/False; sum() then counts the True cells per column.
doc.isnull().sum()

FIPS              312
Admin2            262
Province_State    176
Country_Region      0
Last_Update         0
Lat                 1
Long_               1
Confirmed           0
Deaths              0
Recovered           0
Active              0
Combined_Key        0
dtype: int64

#### (2) 없는 데이터(결측치) 삭제하기
- dropna() : 결측치를 가진 '행'을 모두 삭제

In [31]:
# Remove every row containing at least one missing value
# (axis=0 / how='any' are the dropna() defaults, written out explicitly).
doc = doc.dropna(axis=0, how='any')
# Every per-column missing-value count should now be 0.
doc.isnull().sum()

FIPS              0
Admin2            0
Province_State    0
Country_Region    0
Last_Update       0
Lat               0
Long_             0
Confirmed         0
Deaths            0
Recovered         0
Active            0
Combined_Key      0
dtype: int64

#### (3) 특정 컬럼값이 없는 데이터만 삭제하기
- dropna(subset = [특정컬럼])

In [36]:
csv_day = '01-22-2020.csv'
doc = pd.read_csv(f'{path}{csv_day}', encoding='utf-8-sig')
# Only rows whose 'Confirmed' value is NaN are dropped; NaN in other columns is kept.
required_columns = ['Confirmed']
doc = doc.dropna(subset=required_columns)
doc.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,,
3,Fujian,Mainland China,1/22/2020 17:00,1.0,,
5,Guangdong,Mainland China,1/22/2020 17:00,26.0,,


#### (4) 없는 데이터를 특정값으로 일괄 변경하기
- fillna()

In [39]:
# Reload the report so the missing values are present again.
doc = pd.read_csv(f'{path}{csv_day}', encoding='utf-8-sig')
# Replace every NaN in the frame with 0.
doc = doc.fillna(value=0)
doc.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


#### (5) 없는 데이터를 특정값으로 특정 컬럼만 변경하기
- fillna(딕셔너리 형식 데이터)

In [42]:
# Reload the report so the missing values are present again.
doc = pd.read_csv(f'{path}{csv_day}', encoding='utf-8-sig')
# Column -> replacement mapping: NaN is filled only in the listed columns,
# so NaN values in any other column are left as they are.
nan_data = dict.fromkeys(['Deaths', 'Recovered'], 'No_data')
doc = doc.fillna(value=nan_data)
doc.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,No_data,No_data
1,Beijing,Mainland China,1/22/2020 17:00,14.0,No_data,No_data
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,No_data,No_data
3,Fujian,Mainland China,1/22/2020 17:00,1.0,No_data,No_data
4,Gansu,Mainland China,1/22/2020 17:00,,No_data,No_data


### [ 5. 특정 키 값을 기준으로 데이터 합치기 ]
- groupby(), sum()

In [45]:
# Aggregate the report per country. After groupby() the result is indexed by
# Country_Region instead of the original row numbers.
csv_day = '04-01-2020.csv'
doc = pd.read_csv(path + csv_day, encoding='utf-8-sig')
# numeric_only=True restricts the sum to the numeric columns. Without it the
# handling of text columns (Admin2, Combined_Key, ...) depends on the pandas
# version: older releases dropped them silently, newer ones try to sum them.
doc = doc.groupby('Country_Region').sum(numeric_only=True)
doc.head()

Unnamed: 0_level_0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,0.0,33.93911,67.709953,237,4,5,228
Albania,0.0,41.1533,20.1683,259,15,67,177
Algeria,0.0,28.0339,1.6596,847,58,61,728
Andorra,0.0,42.5063,1.5218,390,14,10,366
Angola,0.0,-11.2027,17.8739,8,2,1,5


In [47]:
# The country names are now the index, so the comparison is made against the index itself.
us_rows = doc.index == 'US'
doc[us_rows]

Unnamed: 0_level_0,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
US,65168934.0,82956.96013,-197553.963757,213372,4757,8474,0
