# Pandas 란?



Pandas는 데이터 분석용 파이썬 라이브러리로, 데이터를 행과 열로 이루어진 표 형태로 다루기 때문에 대용량 데이터를 처리하는 데 편리하다.


출처: https://doorbw.tistory.com/172 [Tigercow.Door]

In [1]:
# pandas 사용하기
import numpy as np # numpy 도 함께 import
import pandas as pd

##  1. Pandas 자료구조

Pandas에서는 Series와 DataFrame, 두 가지 자료구조를 사용한다.



### 1-1. Series
열(필드) 단위의 데이터로 Index 당 1개의 Value를 갖는 자료 형태

- Series 입력하기

In [2]:
# Build a Series from values only; pandas assigns a default 0, 1, 2, ... index.
data1 = pd.Series(data=[200, 300, -500, 400])
print(data1, end="\n\n")  # the extra newline separates the two outputs

# Inspect the index and the values.
# RangeIndex(start=0, stop=4, step=1) means labels 0 through 3 (stop is exclusive).
print(data1.index)
# The values of the Series, in order.
print(data1.values)

0    200
1    300
2   -500
3    400
dtype: int64

RangeIndex(start=0, stop=4, step=1)
[ 200  300 -500  400]


In [3]:
# Build a Series from values plus explicit index labels (one label per value).
data2 = pd.Series(
    data=[100, 150, -50, 30],
    index=['17년', '18년', '19년', '20년'],
)
print(data2)

17년    100
18년    150
19년    -50
20년     30
dtype: int64


In [4]:
# Build a Series from a dict: the keys become the index, the values become the data.
sales = {
    '17년': 100,
    '18년': 150,
    '19년': -50,
    '20년': 30,
}
data3 = pd.Series(data=sales)
print(data3)

17년    100
18년    150
19년    -50
20년     30
dtype: int64


- 데이터 이름지정

In [6]:
# Name the index and the Series itself (comparable to a field name and a
# table name in Excel). The values have no separate name of their own.
data3.index.name = "years"  # printed as a header line above the index labels
data3.name = 'sales'  # printed as "Name: sales" in the footer
print(data3)

years
17년    100
18년    150
19년    -50
20년     30
Name: sales, dtype: int64


In [7]:
# Replace the index: the new labels are assigned to the rows in order.
# The labels generated here are '1년차' through '4년차'.
data3.index = [f'{n}년차' for n in range(1, 5)]
print(data3)

1년차    100
2년차    150
3년차    -50
4년차     30
Name: sales, dtype: int64


### 1-2. Data Frame

엑셀의 Table(표)에 해당하는 2차원 형식의 데이터로, Index 하나에 열(Column) 개수만큼의 Value가 대응한다.

In [11]:
# Build a DataFrame from a dict: each key becomes a column name and each list
# holds that column's values, one per row.
data1_1 = {
    'name': ['E', 'W', 'S', 'N', 'W'],
    'year': [2017, 2018, 2018, 2019, 2020],
    'sales': [150, 100, -80, 50, 10],
}
df1 = pd.DataFrame(data=data1_1)

print(df1, end="\n\n")  # the extra newline separates the two outputs

# Inspect the row index, the column names, and the values.
print(df1.index)
print(df1.columns)
print(df1.values)

  name  year  sales
0    E  2017    150
1    W  2018    100
2    S  2018    -80
3    N  2019     50
4    W  2020     10

RangeIndex(start=0, stop=5, step=1)
Index(['name', 'year', 'sales'], dtype='object')
[['E' 2017 150]
 ['W' 2018 100]
 ['S' 2018 -80]
 ['N' 2019 50]
 ['W' 2020 10]]


In [28]:
# Give names to both axes of df1; the names appear in the printed header.
df1.columns.name = 'category'  # name of the column axis
df1.index.name = 'raw_db'  # name of the row index
print(df1)

category name  year  sales
raw_db                    
0           E  2017    150
1           W  2018    100
2           S  2018    -80
3           N  2019     50
4           W  2020     10


In [29]:
# A bare expression on the last line of a cell is displayed by the notebook.
df1

category,name,year,sales
raw_db,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,E,2017,150
1,W,2018,100
2,S,2018,-80
3,N,2019,50
4,W,2020,10

In [30]:
# Build df2 from the existing dict, choosing the columns and row labels explicitly.
# Columns that exist in data1_1 keep their values; 'profit' is not a key of
# data1_1, so that column is filled with NaN.
df2 = pd.DataFrame(
    data=data1_1,
    index=['1', '2', '3', '4', '5'],
    columns=['name', 'year', 'sales', 'profit'],
)
print(df2)

  name  year  sales profit
1    E  2017    150    NaN
2    W  2018    100    NaN
3    S  2018    -80    NaN
4    N  2019     50    NaN
5    W  2020     10    NaN


In [31]:
# Add a new column by assigning a list with one value per row.
df2['cost'] = [30,40,30,30,20]
print(df2)

  name  year  sales profit  cost
1    E  2017    150    NaN    30
2    W  2018    100    NaN    40
3    S  2018    -80    NaN    30
4    N  2019     50    NaN    30
5    W  2020     10    NaN    20


In [32]:
# Assigning a Series aligns on the index, so values can be given for only some rows.
# Row labels that are absent from the Series ('1' and '4') receive NaN.
val = pd.Series(data=[20, 30, 30], index=['2', '3', '5'])
df2['expenses'] = val
df2

Unnamed: 0,name,year,sales,profit,cost,expenses
1,E,2017,150,,30,
2,W,2018,100,,40,20.0
3,S,2018,-80,,30,30.0
4,N,2019,50,,30,
5,W,2020,10,,20,30.0


## 2. 계산하기

### 2-1. DataFrame에서 열을 선택하고 조작하기

In [33]:
# describe(): summary statistics (count, mean, std, min, quartiles, max) per column.
# Rounding with no digits argument rounds to whole numbers, not to one decimal place.
df2.describe().round()

Unnamed: 0,year,sales,cost,expenses
count,5.0,5.0,5.0,3.0
mean,2018.0,46.0,30.0,27.0
std,1.0,88.0,7.0,6.0
min,2017.0,-80.0,20.0,20.0
25%,2018.0,10.0,30.0,25.0
50%,2018.0,50.0,30.0,30.0
75%,2019.0,100.0,30.0,30.0
max,2020.0,150.0,40.0,30.0


In [34]:
# Select the 'year' column: bracket access and attribute access print the same Series.
print(df2['year'])
print(df2.year)

1    2017
2    2018
3    2018
4    2019
5    2020
Name: year, dtype: int64
1    2017
2    2018
3    2018
4    2019
5    2020
Name: year, dtype: int64


In [35]:
# Select two or more columns by passing a list of column names.
df2[['year','sales']]

Unnamed: 0,year,sales
1,2017,150
2,2018,100
3,2018,-80
4,2019,50
5,2020,10


In [36]:
# Create a new column from element-wise arithmetic on existing columns.
df2['sales_profit'] = df2['sales'] - df2['cost']
print(df2)

  name  year  sales profit  cost  expenses  sales_profit
1    E  2017    150    NaN    30       NaN           120
2    W  2018    100    NaN    40      20.0            60
3    S  2018    -80    NaN    30      30.0          -110
4    N  2019     50    NaN    30       NaN            20
5    W  2020     10    NaN    20      30.0           -10


In [37]:
# Create a new boolean column from a condition: True where sales is negative.
df2['ErrorData'] = df2['sales'] < 0
print(df2)

  name  year  sales profit  cost  expenses  sales_profit  ErrorData
1    E  2017    150    NaN    30       NaN           120      False
2    W  2018    100    NaN    40      20.0            60      False
3    S  2018    -80    NaN    30      30.0          -110       True
4    N  2019     50    NaN    30       NaN            20      False
5    W  2020     10    NaN    20      30.0           -10      False


In [19]:
# Delete a column with `del`.
# The deletion is demonstrated on a copy because the cells that follow still
# display and select the 'profit' column of df2; deleting it from df2 itself
# would make their shown outputs unreproducible.
df2_without_profit = df2.copy()
del df2_without_profit['profit']

print(df2_without_profit)

  name  year  sales  cost  expenses  sales_profit  ErrorData
1    E  2017    150    30       NaN           120      False
2    W  2018    100    40      20.0            60      False
3    S  2018    -80    30      30.0          -110       True
4    N  2019     50    30       NaN            20      False
5    W  2020     10    20      30.0           -10      False


In [38]:
# Give names to both axes of df2; the names appear in the printed header.
df2.columns.name = 'Info'  # name of the column axis
df2.index.name = 'raw'  # name of the row index
print(df2)

Info name  year  sales profit  cost  expenses  sales_profit  ErrorData
raw                                                                   
1       E  2017    150    NaN    30       NaN           120      False
2       W  2018    100    NaN    40      20.0            60      False
3       S  2018    -80    NaN    30      30.0          -110       True
4       N  2019     50    NaN    30       NaN            20      False
5       W  2020     10    NaN    20      30.0           -10      False


### 2-2. DataFrame에서 행 선택/조작




In [39]:
# Slice rows by position: takes positions 0 through 2.
# The row at the stop position (3) is excluded.

df2[0:3]

Info,name,year,sales,profit,cost,expenses,sales_profit,ErrorData
raw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,E,2017,150,,30,,120,False
2,W,2018,100,,40,20.0,60,False
3,S,2018,-80,,30,30.0,-110,True


In [40]:
# Rows can be selected with the .loc (label-based) or .iloc (position-based) indexers.

df2.loc['2'] # a single row label returns that row as a Series

Info
name                W
year             2018
sales             100
profit            NaN
cost               40
expenses           20
sales_profit       60
ErrorData       False
Name: 2, dtype: object

In [41]:
# Label slices with .loc include both endpoints, so row '5' is part of the result.
df2.loc['2':'5']

Info,name,year,sales,profit,cost,expenses,sales_profit,ErrorData
raw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,W,2018,100,,40,20.0,60,False
3,S,2018,-80,,30,30.0,-110,True
4,N,2019,50,,30,,20,False
5,W,2020,10,,20,30.0,-10,False


In [42]:
df2.loc[:,'year'] # all rows of the 'year' column; same as df2['year']

raw
1    2017
2    2018
3    2018
4    2019
5    2020
Name: year, dtype: int64

In [43]:
# All rows, with the columns returned in the listed order ('year', then 'name').
df2.loc[:,['year','name']]

Info,year,name
raw,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2017,E
2,2018,W
3,2018,S
4,2019,N
5,2020,W


In [44]:
# Rows '2' through '5' and columns 'year' through 'sales'; both label slices
# include their endpoints.
df2.loc['2':'5','year':'sales']

Info,year,sales
raw,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2018,100
3,2018,-80
4,2019,50
5,2020,10


In [45]:
# .iloc selects by integer position (0-based) instead of by label.
df2.iloc[3] # position 3 is the fourth row, i.e. the row labelled '4'

Info
name                N
year             2019
sales              50
profit            NaN
cost               30
expenses          NaN
sales_profit       20
ErrorData       False
Name: 4, dtype: object

In [46]:
# Rows at positions 3-4 and columns at positions 0-1; stop positions are excluded.
df2.iloc[3:5, 0:2]

Info,name,year
raw,Unnamed: 1_level_1,Unnamed: 2_level_1
4,N,2019
5,W,2020


In [47]:
# Pick rows at positions 0, 1, 3 and columns at positions 1, 2 using integer lists.
df2.iloc[[0,1,3], [1,2]]

Info,year,sales
raw,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2017,150
2,2018,100
4,2019,50


In [48]:
# All rows, columns at positions 1 through 3 (the stop position 4 is excluded).
df2.iloc[:,1:4]

Info,year,sales,profit
raw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2017,150,
2,2018,100,
3,2018,-80,
4,2019,50,
5,2020,10,


In [49]:
# Full slices on both axes select every row and every column.
df2.iloc[:,:]

Info,name,year,sales,profit,cost,expenses,sales_profit,ErrorData
raw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,E,2017,150,,30,,120,False
2,W,2018,100,,40,20.0,60,False
3,S,2018,-80,,30,30.0,-110,True
4,N,2019,50,,30,,20,False
5,W,2020,10,,20,30.0,-10,False


## 3. DataFrame에서의 boolean Indexing


In [50]:
# Plain assignment: df becomes another name for the same DataFrame object as df2
# (no copy is made).
df = df2

In [51]:
# Boolean Series: True for the rows whose year is greater than 2018.

df['year'] > 2018


raw
1    False
2    False
3    False
4     True
5     True
Name: year, dtype: bool

In [52]:
# All columns of the rows whose year is greater than 2018.

df.loc[df['year']>2018,:]


Info,name,year,sales,profit,cost,expenses,sales_profit,ErrorData
raw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4,N,2019,50,,30,,20,False
5,W,2020,10,,20,30.0,-10,False


In [55]:
# Rows where name equals 'S', restricted to the 'name' and 'sales_profit' columns.
df.loc[df['name']=='S',['name','sales_profit']]

Info,name,sales_profit
raw,Unnamed: 1_level_1,Unnamed: 2_level_1
3,S,-110


In [54]:
# Conditions can be combined with logical operators as in NumPy; each condition
# needs its own parentheses when joined with &.
# Rows whose expenses is NaN do not satisfy the second condition, so they are excluded.
df.loc[(df['sales']>30)&(df['expenses']<100),:]


Info,name,year,sales,profit,cost,expenses,sales_profit,ErrorData
raw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,W,2018,100,,40,20.0,60,False


In [65]:
# Build a 7x5 DataFrame of random normal values with NumPy, indexed by seven
# consecutive dates starting 2020-01-01.
# The values are unseeded, so they differ every time the cell is run.
df = pd.DataFrame(
    np.random.randn(7, 5),
    index=pd.date_range('20200101', periods=7),
    columns=['A', 'B', 'C', 'D', 'E'],
)
# Add a column containing missing values; the list must supply one value per
# row, otherwise the assignment raises an error.
df['F'] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0, np.nan]

df

Unnamed: 0,A,B,C,D,E,F
2020-01-01,1.742053,-0.788054,-0.839879,-0.211362,0.047091,1.0
2020-01-02,1.173555,0.164195,-0.179552,-0.962066,0.494186,
2020-01-03,0.149286,-0.728721,-0.146526,-0.548742,-0.434494,3.5
2020-01-04,0.0806,0.547438,-0.879781,2.288326,-0.016586,6.1
2020-01-05,0.368398,0.321375,-1.234131,0.39126,1.070239,
2020-01-06,-2.703554,2.084043,-0.886767,0.648173,0.015561,7.0
2020-01-07,-1.307747,0.206029,1.148636,-1.877311,2.361918,


In [62]:
# Drop every row that contains at least one NaN value.
df.dropna(how='any') 

Unnamed: 0,A,B,C,D,E,F
2020-01-01,-0.283601,0.292299,-1.494457,0.903979,1.060669,1.0
2020-01-03,0.310787,0.077306,-0.057392,-1.104253,0.374995,3.5
2020-01-04,-0.317666,0.391633,-0.676106,0.953892,0.627172,6.1
2020-01-06,-1.193013,0.001356,0.058417,-0.35891,-0.8821,7.0


In [64]:
# Fill NaN values with a replacement value (0.5 here).
df.fillna(value=0.5)

Unnamed: 0,A,B,C,D,E,F
2020-01-01,-0.615369,0.068362,0.462835,0.316797,-0.437941,1.0
2020-01-02,0.290338,0.052942,0.329015,-0.599951,0.57444,0.5
2020-01-03,-0.257151,1.0804,1.511118,0.97939,1.114283,3.5
2020-01-04,-0.100681,1.8241,1.746037,1.442105,0.511086,6.1
2020-01-05,0.082111,0.561319,0.401956,-0.603143,-0.678921,0.5
2020-01-06,0.048674,-0.205709,-0.534169,1.548701,-0.176826,7.0
2020-01-07,-1.007683,-2.574694,0.65476,-1.281772,0.673427,0.5


In [68]:
# Element-wise check for missing values: True where the value is NaN.
df.isnull()

Unnamed: 0,A,B,C,D,E,F
2020-01-01,False,False,False,False,False,False
2020-01-02,False,False,False,False,False,True
2020-01-03,False,False,False,False,False,False
2020-01-04,False,False,False,False,False,False
2020-01-05,False,False,False,False,False,True
2020-01-06,False,False,False,False,False,False
2020-01-07,False,False,False,False,False,True


In [75]:
# Dropping rows and columns with drop().
# Each call returns a new DataFrame and leaves df unchanged; the notebook
# displays only the result of the last expression.

df.drop(pd.to_datetime('20200106')) # drop the row labelled 2020-01-06
df.drop('F', axis = 1) # drop a single column
df.drop(['B','D'], axis = 1) # drop several columns

Unnamed: 0,A,C,E,F
2020-01-01,1.742053,-0.839879,0.047091,1.0
2020-01-02,1.173555,-0.179552,0.494186,
2020-01-03,0.149286,-0.146526,-0.434494,3.5
2020-01-04,0.0806,-0.879781,-0.016586,6.1
2020-01-05,0.368398,-1.234131,1.070239,
2020-01-06,-2.703554,-0.886767,0.015561,7.0
2020-01-07,-1.307747,1.148636,2.361918,
