<a href="https://colab.research.google.com/github/hanna-joo/bigdata_edu/blob/master/python_edu/pandas_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# 1. 객체 생성 (Object creation)

In [2]:
# Create a Series from a list; the np.nan entry marks a missing value.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Create a DataFrame from a NumPy array, with a date index and labelled columns.
dates = pd.date_range('20130101', periods=6)
print(dates)
# Draw the values from a seeded generator so that "Restart & Run All" rebuilds
# exactly the same frame every time. Outputs saved from an earlier unseeded
# run will differ until the notebook is re-executed once.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2013-01-01,-1.471857,-0.480625,1.198916,0.021897
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-03,2.432118,0.019141,0.48998,-0.931976
2013-01-04,1.232998,0.751265,1.879437,-1.26287
2013-01-05,-1.459957,1.417531,0.487158,-2.013769
2013-01-06,-0.04757,0.808607,1.607757,0.960547


In [4]:
# Create a DataFrame from a dict: every key becomes a column, and each column
# keeps its own dtype (checked with .dtypes below).
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


# 2. 데이터 확인 (Viewing data)


In [5]:
# Show only the first 2 rows of df.
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-1.471857,-0.480625,1.198916,0.021897
2013-01-02,0.360208,0.552296,0.612337,0.029651


In [6]:
# Row labels of df (the date index it was built with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [8]:
# A NumPy array holds exactly one dtype for the whole array,
# whereas a pandas DataFrame holds one dtype per column.
df.to_numpy()

array([[-1.47185675, -0.48062529,  1.19891619,  0.02189731],
       [ 0.36020827,  0.55229581,  0.61233686,  0.02965101],
       [ 2.43211837,  0.01914117,  0.48997989, -0.93197553],
       [ 1.23299807,  0.75126483,  1.87943654, -1.26287007],
       [-1.459957  ,  1.4175306 ,  0.48715814, -2.01376946],
       [-0.04756987,  0.80860734,  1.60775725,  0.96054732]])

In [9]:
# df2 mixes several dtypes, so converting it to NumPy picks a single dtype
# able to hold every value (object in this case).
# DataFrame.to_numpy(): the index and column labels are not included in the output.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [10]:
# Summary statistics of df (count, mean, std, min, quartiles, max per column).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.174324,0.511369,1.045931,-0.532753
std,1.528594,0.662578,0.60714,1.071423
min,-1.471857,-0.480625,0.487158,-2.013769
25%,-1.10686,0.15243,0.520569,-1.180146
50%,0.156319,0.65178,0.905627,-0.455039
75%,1.014801,0.794272,1.505547,0.027713
max,2.432118,1.417531,1.879437,0.960547


In [11]:
# Transpose df: swap its rows and columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.471857,0.360208,2.432118,1.232998,-1.459957,-0.04757
B,-0.480625,0.552296,0.019141,0.751265,1.417531,0.808607
C,1.198916,0.612337,0.48998,1.879437,0.487158,1.607757
D,0.021897,0.029651,-0.931976,-1.26287,-2.013769,0.960547


In [12]:
# Sort by axis labels (not by cell values): axis=1 orders the columns by
# name, in descending order here.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.021897,1.198916,-0.480625,-1.471857
2013-01-02,0.029651,0.612337,0.552296,0.360208
2013-01-03,-0.931976,0.48998,0.019141,2.432118
2013-01-04,-1.26287,1.879437,0.751265,1.232998
2013-01-05,-2.013769,0.487158,1.417531,-1.459957
2013-01-06,0.960547,1.607757,0.808607,-0.04757


In [13]:
# Sort the rows by index label in descending order (latest date first).
df.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-0.04757,0.808607,1.607757,0.960547
2013-01-05,-1.459957,1.417531,0.487158,-2.013769
2013-01-04,1.232998,0.751265,1.879437,-1.26287
2013-01-03,2.432118,0.019141,0.48998,-0.931976
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-01,-1.471857,-0.480625,1.198916,0.021897


In [14]:
# Sort the rows by the values of column B (ascending by default).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-01,-1.471857,-0.480625,1.198916,0.021897
2013-01-03,2.432118,0.019141,0.48998,-0.931976
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-04,1.232998,0.751265,1.879437,-1.26287
2013-01-06,-0.04757,0.808607,1.607757,0.960547
2013-01-05,-1.459957,1.417531,0.487158,-2.013769


In [15]:
# Sort the rows by the values of column B, largest first.
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-05,-1.459957,1.417531,0.487158,-2.013769
2013-01-06,-0.04757,0.808607,1.607757,0.960547
2013-01-04,1.232998,0.751265,1.879437,-1.26287
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-03,2.432118,0.019141,0.48998,-0.931976
2013-01-01,-1.471857,-0.480625,1.198916,0.021897


# 3. 데이터 선택 (Selection)
- .at / .iat
- .loc / .iloc
- .isin

In [16]:
# Select a single column; the result is a Series.
df['A']  # equivalent to df.A

2013-01-01   -1.471857
2013-01-02    0.360208
2013-01-03    2.432118
2013-01-04    1.232998
2013-01-05   -1.459957
2013-01-06   -0.047570
Freq: D, Name: A, dtype: float64

In [17]:
# Select rows by slicing with []: an integer slice is positional (end excluded).
df[2:4]  # same rows as df['20130103':'20130104']

Unnamed: 0,A,B,C,D
2013-01-03,2.432118,0.019141,0.48998,-0.931976
2013-01-04,1.232998,0.751265,1.879437,-1.26287


## 인덱스 또는 컬럼명으로 가져오기 (.loc)

In [18]:
# Select one row by its index label; the row comes back as a Series.
df.loc['20130101']

A   -1.471857
B   -0.480625
C    1.198916
D    0.021897
Name: 2013-01-01 00:00:00, dtype: float64

In [19]:
# Select by row and column labels: every row (:) for columns A and B.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.471857,-0.480625
2013-01-02,0.360208,0.552296
2013-01-03,2.432118,0.019141
2013-01-04,1.232998,0.751265
2013-01-05,-1.459957,1.417531
2013-01-06,-0.04757,0.808607


In [20]:
# Label slices include both endpoints, for the rows and for the columns.
df.loc['20130102':'20130104', 'A':'C']

Unnamed: 0,A,B,C
2013-01-02,0.360208,0.552296,0.612337
2013-01-03,2.432118,0.019141,0.48998
2013-01-04,1.232998,0.751265,1.879437


In [21]:
# A row label slice combined with an explicit list of column labels.
df.loc['20130102':'20130104', ['A','C']]

Unnamed: 0,A,C
2013-01-02,0.360208,0.612337
2013-01-03,2.432118,0.48998
2013-01-04,1.232998,1.879437


In [22]:
# One row label plus one column label returns a scalar.
df.loc[dates[0],'A']

-1.4718567497391846

In [23]:
# .at: a faster alternative to .loc when fetching a single scalar.
df.at[dates[0],'A']

-1.4718567497391846

## 좌표값으로 가져오기 (.iloc)

In [24]:
# Select the row at integer position 2; it comes back as a Series.
df.iloc[2]

A    2.432118
B    0.019141
C    0.489980
D   -0.931976
Name: 2013-01-03 00:00:00, dtype: float64

In [25]:
# Slice rows by integer position (end position excluded).
df.iloc[3:5]  # same rows as df[3:5] / df['20130104':'20130105']

Unnamed: 0,A,B,C,D
2013-01-04,1.232998,0.751265,1.879437,-1.26287
2013-01-05,-1.459957,1.417531,0.487158,-2.013769


In [26]:
# Positional slices on both axes: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.232998,0.751265
2013-01-05,-1.459957,1.417531


In [27]:
# Lists of integer positions pick specific rows and columns.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.360208,0.612337
2013-01-03,2.432118,0.48998
2013-01-05,-1.459957,0.487158


In [28]:
# Slice rows by position and keep every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-03,2.432118,0.019141,0.48998,-0.931976


In [29]:
# Scalar at row position 1, column position 1.
df.iloc[1,1]

0.5522958085327175

In [30]:
# .iat: a faster alternative to .iloc when fetching a single scalar.
df.iat[1,1]

0.5522958085327175

## 조건에 해당하는 값 가져오기 (불리언 인덱싱 / .isin)

In [31]:
# Boolean mask on one column: keep only the rows where A is positive.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-03,2.432118,0.019141,0.48998,-0.931976
2013-01-04,1.232998,0.751265,1.879437,-1.26287


In [32]:
# Element-wise mask over the whole frame: entries that fail the condition become NaN.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,,1.198916,0.021897
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-03,2.432118,0.019141,0.48998,
2013-01-04,1.232998,0.751265,1.879437,
2013-01-05,,1.417531,0.487158,
2013-01-06,,0.808607,1.607757,0.960547


In [33]:
# NOTE: df2 is rebound here; from this cell on it is a copy of df, not the
# dict-built frame created earlier.
df2 = df.copy()
# Add a string column E to filter on with .isin().
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.471857,-0.480625,1.198916,0.021897,one
2013-01-02,0.360208,0.552296,0.612337,0.029651,one
2013-01-03,2.432118,0.019141,0.48998,-0.931976,two
2013-01-04,1.232998,0.751265,1.879437,-1.26287,three
2013-01-05,-1.459957,1.417531,0.487158,-2.013769,four
2013-01-06,-0.04757,0.808607,1.607757,0.960547,three


In [34]:
# Keep the rows whose E value is one of the listed labels.
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,2.432118,0.019141,0.48998,-0.931976,two
2013-01-05,-1.459957,1.417531,0.487158,-2.013769,four


## 값 설정하기

In [35]:
# Series labelled with six consecutive days starting 2013-01-02, i.e. one day
# later than the index df was built with.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [36]:
# Display the current contents of df.
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471857,-0.480625,1.198916,0.021897
2013-01-02,0.360208,0.552296,0.612337,0.029651
2013-01-03,2.432118,0.019141,0.48998,-0.931976
2013-01-04,1.232998,0.751265,1.879437,-1.26287
2013-01-05,-1.459957,1.417531,0.487158,-2.013769
2013-01-06,-0.04757,0.808607,1.607757,0.960547


In [37]:
# Add a new column. Values are aligned on the index, so only the labels that
# exist in df are used; dates missing from s1 end up as NaN.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.471857,-0.480625,1.198916,0.021897,
2013-01-02,0.360208,0.552296,0.612337,0.029651,1.0
2013-01-03,2.432118,0.019141,0.48998,-0.931976,2.0
2013-01-04,1.232998,0.751265,1.879437,-1.26287,3.0
2013-01-05,-1.459957,1.417531,0.487158,-2.013769,4.0
2013-01-06,-0.04757,0.808607,1.607757,0.960547,5.0


In [38]:
# Overwrite existing values.
df.at[dates[0], 'A'] = 0   # set one cell by label
df.iat[0, 1] = 0           # set one cell by integer position
# Replace column D with an integer array. Plain column assignment swaps the
# whole column (values and dtype) on every pandas version. By contrast,
# `df.loc[:, 'D'] = ...` writes into the existing float64 column on
# pandas >= 2.0, which would leave D displayed as 5.0 rather than 5.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.198916,5,
2013-01-02,0.360208,0.552296,0.612337,5,1.0
2013-01-03,2.432118,0.019141,0.48998,5,2.0
2013-01-04,1.232998,0.751265,1.879437,5,3.0
2013-01-05,-1.459957,1.417531,0.487158,5,4.0
2013-01-06,-0.04757,0.808607,1.607757,5,5.0


In [39]:
# Work on a copy of df and flip the sign of every positive entry, so that no
# value in df2 remains above zero.
df2 = df.copy()
is_positive = df2 > 0
df2[is_positive] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.198916,-5,
2013-01-02,-0.360208,-0.552296,-0.612337,-5,-1.0
2013-01-03,-2.432118,-0.019141,-0.48998,-5,-2.0
2013-01-04,-1.232998,-0.751265,-1.879437,-5,-3.0
2013-01-05,-1.459957,-1.417531,-0.487158,-5,-4.0
2013-01-06,-0.04757,-0.808607,-1.607757,-5,-5.0


# 4. 결측치 처리 (Missing data)

In [42]:
# reindex: change / add / drop labels on the chosen axes.
# Keep the first four dates and append a new column E, then set E for the
# first two dates only; the untouched E cells stay missing.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.198916,5,,1.0
2013-01-02,0.360208,0.552296,0.612337,5,1.0,1.0
2013-01-03,2.432118,0.019141,0.48998,5,2.0,
2013-01-04,1.232998,0.751265,1.879437,5,3.0,


In [43]:
# Drop every row that contains at least one missing value (returns a new frame).
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.360208,0.552296,0.612337,5,1.0,1.0


In [47]:
# Fill every missing value with 5.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.198916,5,5.0,1.0
2013-01-02,0.360208,0.552296,0.612337,5,1.0,1.0
2013-01-03,2.432118,0.019141,0.48998,5,2.0,5.0
2013-01-04,1.232998,0.751265,1.879437,5,3.0,5.0


In [48]:
# Fill the missing values of a single column (F) with 5.
df1.F.fillna(value=5)

2013-01-01    5.0
2013-01-02    1.0
2013-01-03    2.0
2013-01-04    3.0
Freq: D, Name: F, dtype: float64

In [49]:
# Boolean mask showing where values are missing (True = missing).
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# 5. 조작 (Operations)

## 데이터 프레임의 기본 통계 구하기

In [50]:
# Means along each axis; the axis is spelled out so the direction is explicit.
print(df.mean(axis=0))  # mean of each column
print(df.mean(axis=1))  # mean of each row

A    0.419633
B    0.591473
C    1.045931
D    5.000000
F    3.000000
dtype: float64
2013-01-01    1.549729
2013-01-02    1.504968
2013-01-03    1.988248
2013-01-04    2.372740
2013-01-05    1.888946
2013-01-06    2.473759
Freq: D, dtype: float64


In [51]:
# Series on the same dates as df, moved down by two positions: the first two
# entries become NaN and the last two original values fall off the end.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
    index=dates,
).shift(periods=2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [52]:
print(df)
# Subtract s from every column, aligning on the row index; rows where s is
# NaN come out as NaN in every column.
df.sub(s, axis='index')

                   A         B         C  D    F
2013-01-01  0.000000  0.000000  1.198916  5  NaN
2013-01-02  0.360208  0.552296  0.612337  5  1.0
2013-01-03  2.432118  0.019141  0.489980  5  2.0
2013-01-04  1.232998  0.751265  1.879437  5  3.0
2013-01-05 -1.459957  1.417531  0.487158  5  4.0
2013-01-06 -0.047570  0.808607  1.607757  5  5.0


Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,1.432118,-0.980859,-0.51002,4.0,1.0
2013-01-04,-1.767002,-2.248735,-1.120563,2.0,0.0
2013-01-05,-6.459957,-3.582469,-4.512842,0.0,-1.0
2013-01-06,,,,,


## 데이터 프레임에 함수 적용하기: df.apply()

In [54]:
# Display df as the input for the apply() examples.
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.198916,5,
2013-01-02,0.360208,0.552296,0.612337,5,1.0
2013-01-03,2.432118,0.019141,0.48998,5,2.0
2013-01-04,1.232998,0.751265,1.879437,5,3.0
2013-01-05,-1.459957,1.417531,0.487158,5,4.0
2013-01-06,-0.04757,0.808607,1.607757,5,5.0


In [55]:
# Cumulative sum down each column of df.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.198916,5,
2013-01-02,0.360208,0.552296,1.811253,10,1.0
2013-01-03,2.792327,0.571437,2.301233,15,3.0
2013-01-04,4.025325,1.322702,4.180669,20,6.0
2013-01-05,2.565368,2.740232,4.667828,25,10.0
2013-01-06,2.517798,3.54884,6.275585,30,15.0


In [56]:
# Apply a lambda to each column: here the range (max - min) of the column.
df.apply(lambda x:x.max()-x.min())

A    3.892075
B    1.417531
C    1.392278
D    0.000000
F    4.000000
dtype: float64

## 열의 value 개수 세기: s.value_counts()

In [57]:
# Ten random integers in [0, 7). A seeded generator keeps the sample, and so
# the value_counts() result computed from it, identical on every run. Outputs
# saved from an earlier unseeded run will differ until the notebook is
# re-executed once.
s = pd.Series(np.random.default_rng(42).integers(0, 7, size=10))
s

0    4
1    6
2    3
3    3
4    6
5    0
6    2
7    5
8    0
9    6
dtype: int64

In [58]:
# Count how many times each distinct value occurs in s.
s.value_counts()

6    3
3    2
0    2
5    1
4    1
2    1
dtype: int64

## 문자열 조작하기

In [59]:
# Mixed-case strings plus one missing value, used to demonstrate the .str accessor.
s = pd.Series(
    data=[
        'A', 'B', 'C',
        'Aaba', 'Baca',
        np.nan,
        'CABA', 'dog', 'cat',
    ],
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [60]:
# Convert every string to lower case; the missing value stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object