### pandas
1. 데이터 분석에서 필수 라이브러리 
2. 데이터의 구조는 Series, DataFrame 
3. Series - 1차원 데이터 
4. DataFrame - 2차원 데이터

In [None]:
## 외부의 라이브러리 다운 
!pip install pandas

In [1]:
# 라이브러리 로드 
import pandas as pd

In [2]:
# 스리즈 형태의 데이터를 생성 
pd_series = pd.Series(
    [5000, 6000, 6500, 6500], 
    index = ['아메리카노', '카페라떼', '카페모카', '카푸치노']
)

In [3]:
pd_series

아메리카노    5000
카페라떼     6000
카페모카     6500
카푸치노     6500
dtype: int64

In [4]:
# Series class에는 values, index 변수가 존재
print(pd_series.index)
print(pd_series.values)

Index(['아메리카노', '카페라떼', '카페모카', '카푸치노'], dtype='object')
[5000 6000 6500 6500]


In [5]:
# DataFrame 2차원 데이터 생성 
_values = [
    [1,2,3], 
    [4,5,6], 
    [7,8,9]
]
_index = ['a', 'b', 'c']
_columns = ['A', 'B', 'C']

df = pd.DataFrame(_values, index = _index, columns=_columns)


In [6]:
print(df)

   A  B  C
a  1  2  3
b  4  5  6
c  7  8  9


In [7]:
df

Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6
c,7,8,9


In [8]:
# DataFrame class 안에는 적어도 3개의 독립적인 변수(values, index, columns)
print(df.values)
print(df.index)
print(df.columns)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
Index(['a', 'b', 'c'], dtype='object')
Index(['A', 'B', 'C'], dtype='object')


In [9]:
df.columns = [1,2,3]

In [10]:
df

Unnamed: 0,1,2,3
a,1,2,3
b,4,5,6
c,7,8,9


In [11]:
df.columns = ['A', 'B', 'C']

In [12]:
df

Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6
c,7,8,9


In [13]:
df = df.rename(columns={'C' : 'F'})

In [14]:
df

Unnamed: 0,A,B,F
a,1,2,3
b,4,5,6
c,7,8,9


In [15]:
# dict 형태 데이터를 이용하여 DataFrame 대입 
dict1 = {
    'name' : ['test', 'test2', 'test3'], 
    'age' : [20, 30, 40]
}

In [16]:
df2 = pd.DataFrame(dict1)
df2

Unnamed: 0,name,age
0,test,20
1,test2,30
2,test3,40


In [17]:
dict2 = [
    {
        'name' : 'test', 
        'age' : 20
    },
    {
        'name' : 'test2', 
        'age' : 30
    }, 
    {
        'name' : 'test3', 
        'age' : 40
    }
]

In [18]:
# test2 데이터만 출력하려면
print(dict2)
print(dict2[1])
print(dict2[1]['name'])

[{'name': 'test', 'age': 20}, {'name': 'test2', 'age': 30}, {'name': 'test3', 'age': 40}]
{'name': 'test2', 'age': 30}
test2


In [19]:
df3 = pd.DataFrame(dict2)
df3

Unnamed: 0,name,age
0,test,20
1,test2,30
2,test3,40


### pandas를 이용하여 외부의 파일을 로드 
- read_xxxx({path}) : 해당하는 path에 있는 파일을 로드 

### 경로
- 절대 경로
    - 절대적인 주소를 의미
    - 환경이 변하더라도 같은 위치를 지정 
    - ex) (c:/users/admin/document/a.txt)
    - ex) WEB (https://www.google.com)
- 상대 경로
    - 상대적인 주소를 의미
    - 환경이 변하면 환경에 따라 위치도 변경
    - 현재 작업중인 디렉토리에서 상위로 이동하거나 하위로 이동
    - ./ : 현재 작업중인 디렉토리 
    - ../ : 상위 폴더로 이동
    - 폴더명/ : 하위 폴더로 이동

In [None]:
# 절대경로로 파일을 로드 
corona = pd.read_csv('D:\\python_\\ezen_python\\csv\\corona.csv')

In [20]:
# 상대경로로 파일을 로드 
# 상위 폴더로 이동(../) -> csv 하위폴더 이동(csv/) -> corona.csv
corona2 = pd.read_csv("../csv/corona.csv")

In [21]:
corona2

Unnamed: 0.1,Unnamed: 0,createDt,deathCnt,decideCnt,seq,stateDt,stateTime,updateDt,accExamCnt,accDefRate
0,0,2022-06-08 09:09:05.982,24305,18188200,904,20220608,00:00,,,
1,1,2022-06-07 09:09:00.897,24299,18174842,903,20220607,00:00,2022-06-08 09:10:36.846,,
2,2,2022-06-06 09:00:06.734,24279,18168670,902,20220606,00:00,2022-06-08 09:10:50.441,,
3,3,2022-06-05 08:53:19.426,24258,18163648,901,20220605,00:00,2022-06-08 09:11:04.758,,
4,4,2022-06-04 08:56:49.219,24238,18153814,900,20220604,00:00,2022-06-08 09:11:26.303,,
...,...,...,...,...,...,...,...,...,...,...
815,815,2020-03-14 00:00:00.000,72,8086,55,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000
816,816,2020-03-13 00:00:00.000,67,7979,54,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499
817,817,2020-03-12 00:00:00.000,66,7869,53,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744
818,818,2020-03-11 00:00:00.000,60,7755,52,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175


In [22]:
# 데이프레임의 정보 
corona2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  820 non-null    int64  
 1   createDt    820 non-null    object 
 2   deathCnt    820 non-null    int64  
 3   decideCnt   820 non-null    int64  
 4   seq         820 non-null    int64  
 5   stateDt     820 non-null    int64  
 6   stateTime   820 non-null    object 
 7   updateDt    817 non-null    object 
 8   accExamCnt  692 non-null    float64
 9   accDefRate  623 non-null    float64
dtypes: float64(2), int64(5), object(3)
memory usage: 64.2+ KB


In [23]:
# 데이터프레임의 통계 분석용 요약 정보 출력 
corona2.describe()

Unnamed: 0.1,Unnamed: 0,deathCnt,decideCnt,seq,stateDt,accExamCnt,accDefRate
count,820.0,820.0,820.0,820.0,820.0,692.0,623.0
mean,409.5,3927.832927,1913097.0,472.393902,20208970.0,7545057.0,1.556435
std,236.857904,6164.310693,4857030.0,249.701095,7118.265,6172020.0,0.543014
min,0.0,54.0,7513.0,51.0,20200310.0,210144.0,0.902205
25%,204.75,415.75,23935.25,256.75,20201000.0,1934309.0,1.078089
50%,409.5,1812.5,118564.0,461.5,20210420.0,6368310.0,1.416159
75%,614.25,3120.5,397991.5,698.25,20211120.0,12168900.0,1.816009
max,819.0,24305.0,18188200.0,904.0,20220610.0,21518070.0,3.919308


In [25]:
# 결측치인가 아닌가를 판단하는 함수
# isXXX() : 특정한 데이터가 존재하는가? -> 결과값은 bool
# na : 결측치를 의미
corona2.isna().sum()

Unnamed: 0      0
createDt        0
deathCnt        0
decideCnt       0
seq             0
stateDt         0
stateTime       0
updateDt        3
accExamCnt    128
accDefRate    197
dtype: int64

In [27]:
# 특정 컬럼을 제거 
# drop({조건식}, axis={0,1 | 'rows','columns'}, inplace={bool})
# axis : 행을 지울것인가 열을 지울것인가 선택
# inpalce : 기준이 되는 데이터프레임을 변경할것인가?

corona2.drop('Unnamed: 0', axis=1, inplace=True)
# corona2 = corona2.drop('Unnamed: 0', axis=1)

In [None]:
corona2.head()

In [28]:
df = corona2.copy()

In [30]:
# df라는 데이터에서 seq라는 컬럼을 삭제
df = df.drop('seq', axis='columns')

KeyError: "['seq'] not found in axis"

In [None]:
df.head(1)

In [None]:
# 컬럼의 이름을 변경
df.rename(columns={
    'createDt' : '등록일시', 
    'deathCnt' : '총사망자', 
    'decideCnt' : '총확진자', 
    'stateDt' : '기준일', 
    'stateTime' : '기준시간', 
    'updateDt' : '수정일시', 
    'accExamCnt' : '누적의심자', 
    'accDefRate' : '누적확진율'
})

In [31]:
df.columns = ['등록일시', '총사망자', '총확진자', '기준일', '기준시간', '수정일시',
              '누적의심자', '누적확진율']

In [32]:
df.head(5)

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율
0,2022-06-08 09:09:05.982,24305,18188200,20220608,00:00,,,
1,2022-06-07 09:09:00.897,24299,18174842,20220607,00:00,2022-06-08 09:10:36.846,,
2,2022-06-06 09:00:06.734,24279,18168670,20220606,00:00,2022-06-08 09:10:50.441,,
3,2022-06-05 08:53:19.426,24258,18163648,20220605,00:00,2022-06-08 09:11:04.758,,
4,2022-06-04 08:56:49.219,24238,18153814,20220604,00:00,2022-06-08 09:11:26.303,,


In [33]:
# 특정한 컬럼의 데이터를 기준으로 정렬을 변경
# sort_values( {기준이 컬럼} , ascending = {bool} , inplace = {bool} )
df.sort_values(['등록일시'], ascending=True, inplace=True)

In [34]:
df.head()

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율
819,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308
818,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175
817,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744
816,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499
815,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318


In [35]:
# 인덱스를 재지정
# reset_index(drop = {bool}, inplace= {bool})
# drop : 기존의 인덱스를 제거할것인가?
df.reset_index(drop=True, inplace=True)
# df.reset_index().drop('index', axis=1)

In [36]:
df.tail(10)

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율
810,2022-05-30 09:01:30.847,24167,18086392,20220530,00:00,2022-06-04 08:59:37.203,,
811,2022-05-31 08:55:39.977,24176,18103577,20220531,00:00,2022-06-04 08:59:23.637,,
812,2022-06-01 09:07:30.461,24197,18119345,20220601,00:00,2022-06-08 09:12:05.316,,
813,2022-06-02 08:58:19.746,24212,18129236,20220602,00:00,2022-06-08 09:11:51.985,,
814,2022-06-03 09:08:18.729,24229,18141775,20220603,00:00,2022-06-08 09:11:38.938,,
815,2022-06-04 08:56:49.219,24238,18153814,20220604,00:00,2022-06-08 09:11:26.303,,
816,2022-06-05 08:53:19.426,24258,18163648,20220605,00:00,2022-06-08 09:11:04.758,,
817,2022-06-06 09:00:06.734,24279,18168670,20220606,00:00,2022-06-08 09:10:50.441,,
818,2022-06-07 09:09:00.897,24299,18174842,20220607,00:00,2022-06-08 09:10:36.846,,
819,2022-06-08 09:09:05.982,24305,18188200,20220608,00:00,,,


In [37]:
# 결측치을 특정값으로 대체
# fillna()
# fillna(n) : 결측치를 n으로 대체
# fillna(method = {'ffill' | 'bfill'})

# 특정컬럼의 데이터만 확인 
df['수정일시'] = df['수정일시'].fillna('-')

In [None]:
df.isna().sum()

In [38]:
# fillna(method='ffill') : 결측치 전의 데이터로 결측치를 채워준다. 
df['누적의심자'] = df['누적의심자'].fillna(method='ffill')

In [39]:
# fillna(method='bfill') : 결측치 후의 데이터로 결측치를 채워준다. 
df['누적확진율'] = df['누적확진율'].fillna(method='bfill')

In [40]:
df.isna().sum()

등록일시       0
총사망자       0
총확진자       0
기준일        0
기준시간       0
수정일시       0
누적의심자      0
누적확진율    197
dtype: int64

In [None]:
df.head(5)

### 데이터프레임의 필터
- loc[{행의 조건}, {열의 조건}]
    - 특정한 행의 조건과 열의 조건이 맞는 데이터를 출력
- iloc[{행의 위치}, {열의 위치}]
    - 특정한 행의 위치와 열의 위치가 맞는 데이터를 출력

In [41]:
df2 = df.copy()

In [42]:
df2.drop(0, axis=0, inplace=True)

In [43]:
# 행과 열의 조건식이 모두 존재하는 경우
df2.loc[1:3, '총사망자':'기준일']

Unnamed: 0,총사망자,총확진자,기준일
1,60,7755,20200311
2,66,7869,20200312
3,67,7979,20200313


In [44]:
# 인덱스의 조건식만 존재하는 경우
df2.loc[1:3]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499


In [45]:
# 컬럼의 조건만 존재하는 경우
df2.loc[:, ['총사망자', '기준시간']]

Unnamed: 0,총사망자,기준시간
1,60,00:00
2,66,00:00
3,67,00:00
4,72,00:00
5,75,00:00
...,...,...
815,24238,00:00
816,24258,00:00
817,24279,00:00
818,24299,00:00


In [46]:
df2[ ['총사망자', '기준시간'] ]

Unnamed: 0,총사망자,기준시간
1,60,00:00
2,66,00:00
3,67,00:00
4,72,00:00
5,75,00:00
...,...,...
815,24238,00:00
816,24258,00:00
817,24279,00:00
818,24299,00:00


In [47]:
df2[ 1:3 ][ ['총사망자', '총확진자'] ]

Unnamed: 0,총사망자,총확진자
2,66,7869
3,67,7979


In [48]:
df2.iloc[ 1:3 , 1:3]

Unnamed: 0,총사망자,총확진자
2,66,7869
3,67,7979


In [49]:
len(df)

820

In [50]:
# 일일확진자 = 오늘의 총확진자 - 전날의 총확진자
# 데이터프레임의 길이 - 1 만큼 반복 

# 새로운 리스트 생성
data_list = [0]
for i in range(0, len(df)-1, 1):
    data = df.loc[i+1, '총확진자'] - df.loc[i, '총확진자']
    data_list.append(data)
print(data_list)

[0, 242, 114, 110, 107, 76, 74, 84, 93, 152, 87, 147, 98, 64, 76, 100, 104, 91, 146, 105, 78, 125, 101, 89, 86, 94, 81, 47, 47, 53, 39, 27, 30, 32, 25, 27, 27, 22, 22, 18, 8, 13, 9, 11, 8, 6, 10, 10, 10, 14, 9, 4, 9, 6, 13, 8, 3, 2, 4, 12, 18, 34, 35, 27, 26, 56, 19, 13, 15, 13, 32, 12, 20, 23, 25, 16, 19, 40, 79, 58, 39, 27, 35, 38, 49, 39, 39, 51, 57, 37, 38, 50, 45, 56, 48, 34, 37, 34, 43, 59, 49, 67, 48, 17, 46, 51, 28, 39, 51, 62, 42, 42, 51, 54, 61, 63, 61, 48, 44, 62, 50, 45, 35, 44, 62, 33, 39, 61, 60, 39, 34, 26, 45, 63, 59, 41, 113, 58, 25, 28, 48, 18, 36, 31, 30, 23, 34, 33, 43, 20, 43, 36, 28, 34, 54, 56, 103, 166, 279, 197, 246, 297, 288, 324, 332, 396, 266, 280, 320, 441, 371, 323, 299, 248, 235, 267, 195, 198, 168, 167, 119, 136, 156, 155, 176, 136, 121, 109, 106, 113, 153, 126, 110, 82, 70, 61, 110, 125, 114, 61, 95, 50, 38, 113, 77, 63, 75, 64, 73, 75, 114, 69, 54, 72, 58, 98, 91, 84, 110, 47, 73, 91, 76, 58, 89, 121, 155, 77, 61, 119, 88, 103, 125, 113, 127, 124, 97, 

In [51]:
df['일일확진자'] = data_list

In [52]:
df.head()

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일확진자
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,242
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,114
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,110
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318,107


In [53]:
# shift(n) : n만큼 인덱스가 이동한 데이터를 생성
df['일일확진자2'] = (df['총확진자'] - df['총확진자'].shift(1)).fillna(0)

In [54]:
df.head(5)

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일확진자,일일확진자2
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0,0.0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,242,242.0
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,114,114.0
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,110,110.0
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318,107,107.0


In [55]:
# diff(n) : 현재 위치에서 n만큼 인덱스를 이동한 데이터와의 차이를 출력
df['일일확진자3'] = df['총확진자'].diff().fillna(0)

In [56]:
df.head()

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일확진자,일일확진자2,일일확진자3
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0,0.0,0.0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,242,242.0,242.0
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,114,114.0,114.0
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,110,110.0,110.0
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318,107,107.0,107.0


In [57]:
# 일일확진자가 음수인 데이터가 존재하는가?
# 반복문 + 조건문 
# 필터링 인덱스값에 일일확진자 조건 -> 반복실행 음수인 데이터만 출력
for i in range(0, len(df), 1):
    if( df.loc[i, '일일확진자'] < 0 ):
        print(i)

print('반복문 종료')

446
반복문 종료


In [58]:
df.loc[ df['일일확진자'] < 0 ]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일확진자,일일확진자2,일일확진자3
446,2021-05-30 00:00:00.000,1957,139907,20210530,00:00,2021-10-07 10:30:51.51,9747612.0,1.454166,-430,-430.0,-430.0


In [59]:
df.loc[444:448]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일확진자,일일확진자2,일일확진자3
444,2021-05-29 00:00:00.000,1951,139427,20210529,00:00,2021-10-07 10:30:51.51,9733588.0,1.450884,533,533.0,533.0
445,2021-05-30 00:00:00.0,1959,140337,20210531,00:00,2021-10-14 13:48:56.821,9761156.0,1.456805,910,910.0,910.0
446,2021-05-30 00:00:00.000,1957,139907,20210530,00:00,2021-10-07 10:30:51.51,9747612.0,1.454166,-430,-430.0,-430.0
447,2021-06-01 00:00:00.000,1963,140796,20210601,00:00,2021-10-07 10:30:51.51,9798400.0,1.45589,889,889.0,889.0
448,2021-06-02 00:00:00.000,1965,141473,20210602,00:00,2021-10-07 10:30:51.51,9834348.0,1.457668,677,677.0,677.0


In [None]:
df.loc[445, '등록일시'] = '2021-05-31 00:00:00.000'

In [None]:
df.loc[444:448]

In [None]:
## 일일확진자 잘못되었다. 
# 등록일시를 기준으로 오름차순 정렬
df.sort_values('등록일시', ascending=True, inplace=True)
# 인덱스를 초기화(기존의 인덱스를 제거)
df.reset_index(drop=True, inplace=True)
# ['일일확진자', '일일확진자2', '일일확진자3'] 모두 제거 
df.drop(['일일확진자', '일일확진자2', '일일확진자3'], axis=1, inplace=True)
# 일일확진자, 일일사망자 파생변수를 생성하여 전날의 데이터와 오늘의 데이터의 차이로 데이터를 채워준다. 
df['일일확진자'] = df['총확진자'].diff().fillna(0)
df['일일사망자'] = (df['총사망자'] - df['총사망자'].shift()).fillna(0)
# 일일확진자, 일일사망자의 결측치는 0으로 대체
df.head()

In [None]:
df.loc[ df['일일확진자'] < 0  ]

In [None]:
# 일일사망자를 while문을 이용하여 새로운 파생변수를 생성
# data_list = []

i = 0

while i < len(df):
    if (i == 0):
        df.loc[i, '일일사망자2'] = 0
    else:
        data = df.loc[i, '총사망자'] - df.loc[i-1, '총사망자']
        df.loc[i, '일일사망자2'] = data
    i += 1

In [None]:
df.head()

In [None]:
!pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

In [None]:
# plot() : 라인그래프 
x = df.tail(100).index
y = df.tail(100)['일일확진자']

plt.plot(x, y)
plt.show()

In [None]:
# 일일사망자는 바형 그래프 표시 
x = df.tail(50).index
y = df.tail(50)['일일사망자']

plt.bar(x, y)
plt.show()