In [1]:
import pandas as pd
import numpy as np

In [4]:
# Create a Series from a plain Python list.
# The np.nan entry marks a missing value; the resulting dtype is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [40]:
# Date-typed data with 'date_range':
# six consecutive daily timestamps beginning on 2019-01-01.
dates = pd.date_range(start='20190101', periods=6)
dates

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# Create the DataFrame: a 6x4 block of standard-normal random values,
# with columns named A, B, C, D and the date-typed `dates` used as the row index.
#
# The samples come from a dedicated, seeded RandomState so that every
# Restart & Run All yields the same numbers, without modifying NumPy's
# global random state. Re-run the notebook to refresh any recorded outputs
# that were produced before the seed was introduced.
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2019-01-01,-0.237972,0.502488,-0.660662,-0.035297
2019-01-02,0.845397,-1.158277,0.242403,0.706923
2019-01-03,-0.537115,1.378783,1.22399,1.281566
2019-01-04,-0.07647,2.049692,1.317591,-1.068869
2019-01-05,0.018789,0.031324,0.074842,2.532782
2019-01-06,-0.4627,-1.379926,-0.320027,1.073467


In [22]:
# Quick inspection of `df`. In the recorded output only df.info() (which
# prints its report) and the final expression (the sorted frame) appear;
# the other bare expressions are evaluated but not displayed.
df.head(3) # first three rows
df.index # row index
df.columns # column labels
df.values # underlying values
df.info() # overview of the frame
df.describe() # statistical overview
df.sort_values(by='B', ascending=False) # sort by column 'B' in descending order

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2019-01-01 to 2019-01-06
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes


Unnamed: 0,A,B,C,D
2019-01-04,-0.07647,2.049692,1.317591,-1.068869
2019-01-03,-0.537115,1.378783,1.22399,1.281566
2019-01-01,-0.237972,0.502488,-0.660662,-0.035297
2019-01-05,0.018789,0.031324,0.074842,2.532782
2019-01-02,0.845397,-1.158277,0.242403,0.706923
2019-01-06,-0.4627,-1.379926,-0.320027,1.073467


In [29]:
# Basic selection with []. Only the last expression's result is displayed.
df['A'] # select the single column 'A'
df[0:3] # positional slice: the first through third rows
df['20190102':'20190104'] # label slice on the date index, to view specific rows (both end dates are included)

Unnamed: 0,A,B,C,D
2019-01-02,0.845397,-1.158277,0.242403,0.706923
2019-01-03,-0.537115,1.378783,1.22399,1.281566
2019-01-04,-0.07647,2.049692,1.317591,-1.068869


In [48]:
# loc: access data using row and column labels
df.loc[dates[0]] # select by label; here only the row for one specific date is returned
df.loc[:,['A','B']] # every row of columns A and B; the form is df.loc[rows, columns]
df.loc['20190102':'20190104',['A','B']] # columns A and B for a range of dates
df.loc['20190102',['A','B']] # columns A and B for a single date
df.loc[dates[0],'A'] # only column 'A' for the date at dates[0] (20190101)

-0.2379722798768409

In [53]:
# iloc: access data using integer row and column positions
# NOTE(review): the recorded output under this cell shows columns B and C for
# all rows, which matches df.iloc[:,1:3] rather than the final expression;
# the output looks stale -- re-run to confirm.
df.iloc[3] # all columns for 20190104
df.iloc[3:5,0:2] # slicing by position
df.iloc[[1,2,4],[0,2]] # pick specific rows and columns via position lists
df.iloc[1:3,:] # a range of rows, all columns
df.iloc[:,1:3] # all rows, a range of columns
df[df.A > 0] # every column of the rows where column 'A' is greater than 0

Unnamed: 0,B,C
2019-01-01,0.502488,-0.660662
2019-01-02,-1.158277,0.242403
2019-01-03,1.378783,1.22399
2019-01-04,2.049692,1.317591
2019-01-05,0.031324,0.074842
2019-01-06,-1.379926,-0.320027


In [64]:
# copy: duplicate the data, then attach a string column 'E' to the duplicate
df2 = df.copy().assign(
    E=['one', 'one', 'two', 'three', 'four', 'three'],
)
df2

Unnamed: 0,A,B,C,D,E
2019-01-01,-0.237972,0.502488,-0.660662,-0.035297,one
2019-01-02,0.845397,-1.158277,0.242403,0.706923,one
2019-01-03,-0.537115,1.378783,1.22399,1.281566,two
2019-01-04,-0.07647,2.049692,1.317591,-1.068869,three
2019-01-05,0.018789,0.031324,0.074842,2.532782,four
2019-01-06,-0.4627,-1.379926,-0.320027,1.073467,three


In [68]:
# Membership filtering with isin. Only the last expression is displayed.
df2['E'].isin(['two','four']) # boolean result: True where column 'E' of df2 is 'two' or 'four'
df2[df2['E'].isin(['two','four'])] # the rows of df2 whose 'E' value is 'two' or 'four'

Unnamed: 0,A,B,C,D,E
2019-01-03,-0.537115,1.378783,1.22399,1.281566,two
2019-01-05,0.018789,0.031324,0.074842,2.532782,four


In [70]:
# apply: run a function over each column of df. Only the last result is displayed.
# NOTE(review): built-ins (e.g. df.cumsum()) are usually preferred over apply
# when they exist; apply is kept here because it is what the cell demonstrates.
df.apply(np.cumsum) # cumulative sum of each column
df.apply(lambda x: x.max() - x.min()) # difference between the maximum and minimum of each column

A    1.382512
B    3.429618
C    1.978253
D    3.601651
dtype: float64