# Object Creation(객체 생성)

In [1]:
import numpy as np
import pandas as pd

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.141644,0.262719,-0.032641,0.61106
2013-01-02,0.907913,0.320297,-2.429864,-0.610992
2013-01-03,1.484077,-2.221066,-0.618392,0.0599
2013-01-04,0.529983,1.205465,-1.253895,-0.464049
2013-01-05,0.166672,0.491989,-0.467951,-1.418305
2013-01-06,2.252028,-1.730232,0.176295,1.082598


In [5]:
df2 = pd.DataFrame({'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1, index=list(range(4)), dtype = 'float32'),
                    'D' : np.array([3] * 4, dtype='int32'),
                    'E' : pd.Categorical(["test", "train", "test", "train"]),
                    'F' : 'foo'
                   })
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data(데이터 보기)

In [7]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.141644,0.262719,-0.032641,0.61106
2013-01-02,0.907913,0.320297,-2.429864,-0.610992
2013-01-03,1.484077,-2.221066,-0.618392,0.0599
2013-01-04,0.529983,1.205465,-1.253895,-0.464049
2013-01-05,0.166672,0.491989,-0.467951,-1.418305


In [8]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.529983,1.205465,-1.253895,-0.464049
2013-01-05,0.166672,0.491989,-0.467951,-1.418305
2013-01-06,2.252028,-1.730232,0.176295,1.082598


In [9]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
df.to_numpy()

array([[-0.14164436,  0.2627187 , -0.03264055,  0.61106025],
       [ 0.90791314,  0.32029723, -2.42986432, -0.61099161],
       [ 1.4840769 , -2.22106567, -0.61839165,  0.05989967],
       [ 0.52998266,  1.20546486, -1.2538952 , -0.46404854],
       [ 0.1666724 ,  0.49198899, -0.46795094, -1.41830534],
       [ 2.25202756, -1.73023234,  0.17629521,  1.08259779]])

In [12]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [13]:
df.describe

<bound method NDFrame.describe of                    A         B         C         D
2013-01-01 -0.141644  0.262719 -0.032641  0.611060
2013-01-02  0.907913  0.320297 -2.429864 -0.610992
2013-01-03  1.484077 -2.221066 -0.618392  0.059900
2013-01-04  0.529983  1.205465 -1.253895 -0.464049
2013-01-05  0.166672  0.491989 -0.467951 -1.418305
2013-01-06  2.252028 -1.730232  0.176295  1.082598>

In [14]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.866505,-0.278471,-0.771075,-0.123298
std,0.885682,1.365897,0.953173,0.900892
min,-0.141644,-2.221066,-2.429864,-1.418305
25%,0.2575,-1.231995,-1.095019,-0.574256
50%,0.718948,0.291508,-0.543171,-0.202074
75%,1.340036,0.449066,-0.141468,0.47327
max,2.252028,1.205465,0.176295,1.082598


In [15]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.141644,0.907913,1.484077,0.529983,0.166672,2.252028
B,0.262719,0.320297,-2.221066,1.205465,0.491989,-1.730232
C,-0.032641,-2.429864,-0.618392,-1.253895,-0.467951,0.176295
D,0.61106,-0.610992,0.0599,-0.464049,-1.418305,1.082598


In [16]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.61106,-0.032641,0.262719,-0.141644
2013-01-02,-0.610992,-2.429864,0.320297,0.907913
2013-01-03,0.0599,-0.618392,-2.221066,1.484077
2013-01-04,-0.464049,-1.253895,1.205465,0.529983
2013-01-05,-1.418305,-0.467951,0.491989,0.166672
2013-01-06,1.082598,0.176295,-1.730232,2.252028


In [17]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,1.484077,-2.221066,-0.618392,0.0599
2013-01-06,2.252028,-1.730232,0.176295,1.082598
2013-01-01,-0.141644,0.262719,-0.032641,0.61106
2013-01-02,0.907913,0.320297,-2.429864,-0.610992
2013-01-05,0.166672,0.491989,-0.467951,-1.418305
2013-01-04,0.529983,1.205465,-1.253895,-0.464049


# Selection
## Getting(얻기)

In [18]:
df['A']

2013-01-01   -0.141644
2013-01-02    0.907913
2013-01-03    1.484077
2013-01-04    0.529983
2013-01-05    0.166672
2013-01-06    2.252028
Freq: D, Name: A, dtype: float64

In [19]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.907913,0.320297,-2.429864,-0.610992
2013-01-03,1.484077,-2.221066,-0.618392,0.0599
2013-01-04,0.529983,1.205465,-1.253895,-0.464049


## Selection by label(라벨로 선택)

In [20]:
df.loc[dates[0]]

A   -0.141644
B    0.262719
C   -0.032641
D    0.611060
Name: 2013-01-01 00:00:00, dtype: float64

In [21]:
df.loc[:, ['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.141644,0.262719
2013-01-02,0.907913,0.320297
2013-01-03,1.484077,-2.221066
2013-01-04,0.529983,1.205465
2013-01-05,0.166672,0.491989
2013-01-06,2.252028,-1.730232


In [22]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.907913,0.320297
2013-01-03,1.484077,-2.221066
2013-01-04,0.529983,1.205465


In [23]:
df.loc['20130102', ['A', 'B']]

A    0.907913
B    0.320297
Name: 2013-01-02 00:00:00, dtype: float64

In [24]:
df.loc[dates[0],'A']

-0.1416443603264537

## Selection by Postion(위치로 얻기)

In [25]:
df.iloc[3]

A    0.529983
B    1.205465
C   -1.253895
D   -0.464049
Name: 2013-01-04 00:00:00, dtype: float64

In [26]:
df.iloc[3:5, 0:3]

Unnamed: 0,A,B,C
2013-01-04,0.529983,1.205465,-1.253895
2013-01-05,0.166672,0.491989,-0.467951


In [27]:
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,0.907913,-2.429864
2013-01-03,1.484077,-0.618392
2013-01-05,0.166672,-0.467951


In [28]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.907913,0.320297,-2.429864,-0.610992
2013-01-03,1.484077,-2.221066,-0.618392,0.0599


In [29]:
df.iat[1,1]

0.32029722563486157

In [30]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.141644,0.262719,-0.032641,0.61106
2013-01-02,0.907913,0.320297,-2.429864,-0.610992
2013-01-03,1.484077,-2.221066,-0.618392,0.0599
2013-01-04,0.529983,1.205465,-1.253895,-0.464049
2013-01-05,0.166672,0.491989,-0.467951,-1.418305
2013-01-06,2.252028,-1.730232,0.176295,1.082598


In [31]:
df.iat[0,0]

-0.1416443603264537

## Boolean indexing

In [32]:
df[df['A']>0]

Unnamed: 0,A,B,C,D
2013-01-02,0.907913,0.320297,-2.429864,-0.610992
2013-01-03,1.484077,-2.221066,-0.618392,0.0599
2013-01-04,0.529983,1.205465,-1.253895,-0.464049
2013-01-05,0.166672,0.491989,-0.467951,-1.418305
2013-01-06,2.252028,-1.730232,0.176295,1.082598


In [33]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.262719,,0.61106
2013-01-02,0.907913,0.320297,,
2013-01-03,1.484077,,,0.0599
2013-01-04,0.529983,1.205465,,
2013-01-05,0.166672,0.491989,,
2013-01-06,2.252028,,0.176295,1.082598


In [34]:
df2 = df.copy()
df2['E']=['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.141644,0.262719,-0.032641,0.61106,one
2013-01-02,0.907913,0.320297,-2.429864,-0.610992,one
2013-01-03,1.484077,-2.221066,-0.618392,0.0599,two
2013-01-04,0.529983,1.205465,-1.253895,-0.464049,three
2013-01-05,0.166672,0.491989,-0.467951,-1.418305,four
2013-01-06,2.252028,-1.730232,0.176295,1.082598,three


In [35]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.484077,-2.221066,-0.618392,0.0599,two
2013-01-05,0.166672,0.491989,-0.467951,-1.418305,four


## Setting

In [36]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [37]:
df['F']=s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.141644,0.262719,-0.032641,0.61106,
2013-01-02,0.907913,0.320297,-2.429864,-0.610992,1.0
2013-01-03,1.484077,-2.221066,-0.618392,0.0599,2.0
2013-01-04,0.529983,1.205465,-1.253895,-0.464049,3.0
2013-01-05,0.166672,0.491989,-0.467951,-1.418305,4.0
2013-01-06,2.252028,-1.730232,0.176295,1.082598,5.0


In [38]:
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.262719,-0.032641,0.61106,
2013-01-02,0.907913,0.320297,-2.429864,-0.610992,1.0
2013-01-03,1.484077,-2.221066,-0.618392,0.0599,2.0
2013-01-04,0.529983,1.205465,-1.253895,-0.464049,3.0
2013-01-05,0.166672,0.491989,-0.467951,-1.418305,4.0
2013-01-06,2.252028,-1.730232,0.176295,1.082598,5.0


In [39]:
df.iat[0,1] = 0
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.032641,5,
2013-01-02,0.907913,0.320297,-2.429864,5,1.0
2013-01-03,1.484077,-2.221066,-0.618392,5,2.0
2013-01-04,0.529983,1.205465,-1.253895,5,3.0
2013-01-05,0.166672,0.491989,-0.467951,5,4.0
2013-01-06,2.252028,-1.730232,0.176295,5,5.0


In [40]:
df2 = df.copy()
df2[df>0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.032641,-5,
2013-01-02,-0.907913,-0.320297,-2.429864,-5,-1.0
2013-01-03,-1.484077,-2.221066,-0.618392,-5,-2.0
2013-01-04,-0.529983,-1.205465,-1.253895,-5,-3.0
2013-01-05,-0.166672,-0.491989,-0.467951,-5,-4.0
2013-01-06,-2.252028,-1.730232,-0.176295,-5,-5.0


In [41]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.032641,5,
2013-01-02,0.907913,0.320297,-2.429864,5,1.0
2013-01-03,1.484077,-2.221066,-0.618392,5,2.0
2013-01-04,0.529983,1.205465,-1.253895,5,3.0
2013-01-05,0.166672,0.491989,-0.467951,5,4.0
2013-01-06,2.252028,-1.730232,0.176295,5,5.0


# Missing Data

In [42]:
df1 = df.reindex(index = dates[0:4], columns = list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.032641,5,,1.0
2013-01-02,0.907913,0.320297,-2.429864,5,1.0,1.0
2013-01-03,1.484077,-2.221066,-0.618392,5,2.0,
2013-01-04,0.529983,1.205465,-1.253895,5,3.0,


In [43]:
df1.dropna(how = 'any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.907913,0.320297,-2.429864,5,1.0,1.0


In [44]:
df1.dropna()

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.907913,0.320297,-2.429864,5,1.0,1.0


In [45]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.032641,5,5.0,1.0
2013-01-02,0.907913,0.320297,-2.429864,5,1.0,1.0
2013-01-03,1.484077,-2.221066,-0.618392,5,2.0,5.0
2013-01-04,0.529983,1.205465,-1.253895,5,3.0,5.0


In [46]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# Operation
## stats(통계)

In [47]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.032641,5,
2013-01-02,0.907913,0.320297,-2.429864,5,1.0
2013-01-03,1.484077,-2.221066,-0.618392,5,2.0
2013-01-04,0.529983,1.205465,-1.253895,5,3.0
2013-01-05,0.166672,0.491989,-0.467951,5,4.0
2013-01-06,2.252028,-1.730232,0.176295,5,5.0


In [48]:
df.mean()

A    0.890112
B   -0.322258
C   -0.771075
D    5.000000
F    3.000000
dtype: float64

In [49]:
df.mean(1)

2013-01-01    1.241840
2013-01-02    0.959669
2013-01-03    1.128924
2013-01-04    1.696310
2013-01-05    1.838142
2013-01-06    2.139618
Freq: D, dtype: float64

In [50]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index = dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [51]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,0.484077,-3.221066,-1.618392,4.0,1.0
2013-01-04,-2.470017,-1.794535,-4.253895,2.0,0.0
2013-01-05,-4.833328,-4.508011,-5.467951,0.0,-1.0
2013-01-06,,,,,


## Apply

In [53]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.032641,5,
2013-01-02,0.907913,0.320297,-2.429864,5,1.0
2013-01-03,1.484077,-2.221066,-0.618392,5,2.0
2013-01-04,0.529983,1.205465,-1.253895,5,3.0
2013-01-05,0.166672,0.491989,-0.467951,5,4.0
2013-01-06,2.252028,-1.730232,0.176295,5,5.0


In [52]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.032641,5,
2013-01-02,0.907913,0.320297,-2.462505,10,1.0
2013-01-03,2.39199,-1.900768,-3.080897,15,3.0
2013-01-04,2.921973,-0.695304,-4.334792,20,6.0
2013-01-05,3.088645,-0.203315,-4.802743,25,10.0
2013-01-06,5.340673,-1.933547,-4.626447,30,15.0


In [54]:
df.apply(lambda x: x.max() - x.min())

A    2.252028
B    3.426531
C    2.606160
D    0.000000
F    4.000000
dtype: float64

## Histogramming(도수 분포도)

In [57]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    0
1    5
2    2
3    4
4    0
5    3
6    2
7    4
8    6
9    1
dtype: int32