In [5]:
import numpy as np
import pandas as pd

In [7]:
data = pd.Series([1, 3, 5, np.nan, 6, 8])
data

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [10]:
dates = pd.date_range('20180129',periods=6)
dates

DatetimeIndex(['2018-01-29', '2018-01-30', '2018-01-31', '2018-02-01',
               '2018-02-02', '2018-02-03'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df = pd.DataFrame(np.random.randint(-5,5,size=(6,4)),index=dates,columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2018-01-29,3,4,-5,4
2018-01-30,4,-2,4,-2
2018-01-31,2,3,1,-1
2018-02-01,2,0,1,-2
2018-02-02,-3,-5,-1,-3
2018-02-03,4,2,3,1


In [27]:
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Timestamp('20180129'),
    'C': pd.Series(1, index=list(range(3))),
    'D': np.array([3] * 3),
    'E': pd.Categorical(['wow', 'how', 'mow']),
    'F': 'foo'
})
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2018-01-29,1,3,wow,foo
1,1,2018-01-29,1,3,how,foo
2,1,2018-01-29,1,3,mow,foo


In [24]:
df2.describe()

Unnamed: 0,A,C,D
count,3.0,3.0,3.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [25]:
df2.T

Unnamed: 0,0,1,2
A,1,1,1
B,2018-01-29 00:00:00,2018-01-29 00:00:00,2018-01-29 00:00:00
C,1,1,1
D,3,3,3
E,wow,how,mow
F,foo,foo,foo


In [29]:
df2['A']

0    1
1    1
2    1
Name: A, dtype: int64

In [30]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2018-01-29,3,4
2018-01-30,4,-2
2018-01-31,2,3
2018-02-01,2,0
2018-02-02,-3,-5
2018-02-03,4,2


In [32]:
df.loc['20180129':'20180204',['A','B']]

Unnamed: 0,A,B
2018-01-29,3,4
2018-01-30,4,-2
2018-01-31,2,3
2018-02-01,2,0
2018-02-02,-3,-5
2018-02-03,4,2


In [33]:
%%timeit
df.at[dates[0], 'A']

13.6 µs ± 177 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [36]:
%%timeit
df.at[dates[0], 'A']

11.2 µs ± 66.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [39]:
df.iloc[3]

A    2
B    0
C    1
D   -2
Name: 2018-02-01 00:00:00, dtype: int32

In [41]:
df2 = df.copy()
df2['E'] = [1,2,3,4,5,6]
df2

Unnamed: 0,A,B,C,D,E
2018-01-29,3,4,-5,4,1
2018-01-30,4,-2,4,-2,2
2018-01-31,2,3,1,-1,3
2018-02-01,2,0,1,-2,4
2018-02-02,-3,-5,-1,-3,5
2018-02-03,4,2,3,1,6


In [42]:
df2[df2['E']%2==0]

Unnamed: 0,A,B,C,D,E
2018-01-30,4,-2,4,-2,2
2018-02-01,2,0,1,-2,4
2018-02-03,4,2,3,1,6


In [43]:
# 인덱싱을 다시 한다.
# index 파라미터로 인덱싱을 하는 데이터의 범위를 나타내고
# columns 파라미터로 컬럼 데이터를 나타낸다.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = [1,2]  # dates 0 ~ 1까지 value를 1,2로 변경한다.
df1

Unnamed: 0,A,B,C,D,E
2018-01-29,3.0,4.0,-5.0,4.0,1.0
2018-01-30,4.0,-2.0,4.0,-2.0,2.0
2018-01-31,2.0,3.0,1.0,-1.0,
2018-02-01,2.0,0.0,1.0,-2.0,


In [45]:
df1[df1.notnull()]

Unnamed: 0,A,B,C,D,E
2018-01-29,3.0,4.0,-5.0,4.0,1.0
2018-01-30,4.0,-2.0,4.0,-2.0,2.0
2018-01-31,2.0,3.0,1.0,-1.0,
2018-02-01,2.0,0.0,1.0,-2.0,


In [46]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2018-01-29,3.0,4.0,-5.0,4.0,1.0
2018-01-30,4.0,-2.0,4.0,-2.0,2.0


In [47]:
df1.loc[dates[0]]=np.nan
df1

Unnamed: 0,A,B,C,D,E
2018-01-29,,,,,
2018-01-30,4.0,-2.0,4.0,-2.0,2.0
2018-01-31,2.0,3.0,1.0,-1.0,
2018-02-01,2.0,0.0,1.0,-2.0,


In [49]:
df1.dropna(how='all')

Unnamed: 0,A,B,C,D,E
2018-01-30,4.0,-2.0,4.0,-2.0,2.0
2018-01-31,2.0,3.0,1.0,-1.0,
2018-02-01,2.0,0.0,1.0,-2.0,


In [50]:
df1['F'] = np.nan
df1

Unnamed: 0,A,B,C,D,E,F
2018-01-29,,,,,,
2018-01-30,4.0,-2.0,4.0,-2.0,2.0,
2018-01-31,2.0,3.0,1.0,-1.0,,
2018-02-01,2.0,0.0,1.0,-2.0,,


In [53]:
df1.dropna(how='all',axis=1)

Unnamed: 0,A,B,C,D,E
2018-01-29,,,,,
2018-01-30,4.0,-2.0,4.0,-2.0,2.0
2018-01-31,2.0,3.0,1.0,-1.0,
2018-02-01,2.0,0.0,1.0,-2.0,


In [54]:
df1.dropna(how='all', axis=[0, 1])

Unnamed: 0,A,B,C,D,E
2018-01-30,4.0,-2.0,4.0,-2.0,2.0
2018-01-31,2.0,3.0,1.0,-1.0,
2018-02-01,2.0,0.0,1.0,-2.0,


In [55]:
df1.fillna(value=3)

Unnamed: 0,A,B,C,D,E,F
2018-01-29,3.0,3.0,3.0,3.0,3.0,3.0
2018-01-30,4.0,-2.0,4.0,-2.0,2.0,3.0
2018-01-31,2.0,3.0,1.0,-1.0,3.0,3.0
2018-02-01,2.0,0.0,1.0,-2.0,3.0,3.0


In [56]:
df.mean()

A    2.000000
B    0.333333
C    0.500000
D   -0.500000
dtype: float64

In [57]:
df.mean(0)

A    2.000000
B    0.333333
C    0.500000
D   -0.500000
dtype: float64

In [58]:
df.mean(1)

2018-01-29    1.50
2018-01-30    1.00
2018-01-31    1.25
2018-02-01    0.25
2018-02-02   -3.00
2018-02-03    2.50
Freq: D, dtype: float64

In [60]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
print(s)
print(s.shift(2))
print(s.shift(-1))

2018-01-29    1.0
2018-01-30    3.0
2018-01-31    5.0
2018-02-01    NaN
2018-02-02    6.0
2018-02-03    8.0
Freq: D, dtype: float64
2018-01-29    NaN
2018-01-30    NaN
2018-01-31    1.0
2018-02-01    3.0
2018-02-02    5.0
2018-02-03    NaN
Freq: D, dtype: float64
2018-01-29    3.0
2018-01-30    5.0
2018-01-31    NaN
2018-02-01    6.0
2018-02-02    8.0
2018-02-03    NaN
Freq: D, dtype: float64


In [61]:
df.sub(s)

Unnamed: 0,2018-01-29 00:00:00,2018-01-30 00:00:00,2018-01-31 00:00:00,2018-02-01 00:00:00,2018-02-02 00:00:00,2018-02-03 00:00:00,A,B,C,D
2018-01-29,,,,,,,,,,
2018-01-30,,,,,,,,,,
2018-01-31,,,,,,,,,,
2018-02-01,,,,,,,,,,
2018-02-02,,,,,,,,,,
2018-02-03,,,,,,,,,,


In [62]:
tuples = list(zip(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [63]:
# MultiIndex : 인덱스데이터를 여러개 가질수 있다.(first와 second 두개의 인덱스를 가진다.)
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [64]:
df = pd.DataFrame(np.random.randint(10, size=(8, 2)), index=index, columns=list('AB'))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,6
bar,two,8,7
baz,one,1,2
baz,two,5,2
foo,one,4,6
foo,two,6,2
qux,one,1,8
qux,two,1,5


In [66]:
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,6
bar,two,8,7
baz,one,1,2
baz,two,5,2


In [68]:
stacked = df2.stack()
stacked

first  second   
bar    one     A    1
               B    6
       two     A    8
               B    7
baz    one     A    1
               B    2
       two     A    5
               B    2
dtype: int32

In [70]:
stacked.index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two'], ['A', 'B']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second', None])

In [71]:
stacked.names=['first','second','third']
stacked

first  second   
bar    one     A    1
               B    6
       two     A    8
               B    7
baz    one     A    1
               B    2
       two     A    5
               B    2
dtype: int32

In [72]:
stacked.index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two'], ['A', 'B']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second', None])

In [73]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,6
bar,two,8,7
baz,one,1,2
baz,two,5,2


In [75]:
stacked.unstack().unstack()

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1,8,6,7
baz,1,5,2,2


In [76]:
stacked.unstack(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,6
bar,two,8,7
baz,one,1,2
baz,two,5,2


In [78]:
rng = pd.date_range('2018-01-01',periods=100,freq='Min')
rng

DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 00:01:00',
               '2018-01-01 00:02:00', '2018-01-01 00:03:00',
               '2018-01-01 00:04:00', '2018-01-01 00:05:00',
               '2018-01-01 00:06:00', '2018-01-01 00:07:00',
               '2018-01-01 00:08:00', '2018-01-01 00:09:00',
               '2018-01-01 00:10:00', '2018-01-01 00:11:00',
               '2018-01-01 00:12:00', '2018-01-01 00:13:00',
               '2018-01-01 00:14:00', '2018-01-01 00:15:00',
               '2018-01-01 00:16:00', '2018-01-01 00:17:00',
               '2018-01-01 00:18:00', '2018-01-01 00:19:00',
               '2018-01-01 00:20:00', '2018-01-01 00:21:00',
               '2018-01-01 00:22:00', '2018-01-01 00:23:00',
               '2018-01-01 00:24:00', '2018-01-01 00:25:00',
               '2018-01-01 00:26:00', '2018-01-01 00:27:00',
               '2018-01-01 00:28:00', '2018-01-01 00:29:00',
               '2018-01-01 00:30:00', '2018-01-01 00:31:00',
               '2018-01-

In [79]:
ts = pd.Series(np.random.randint(0,500,len(rng)), index=rng)
ts

2018-01-01 00:00:00    108
2018-01-01 00:01:00    438
2018-01-01 00:02:00    148
2018-01-01 00:03:00     60
2018-01-01 00:04:00     22
2018-01-01 00:05:00    443
2018-01-01 00:06:00    437
2018-01-01 00:07:00    427
2018-01-01 00:08:00    308
2018-01-01 00:09:00    497
2018-01-01 00:10:00     16
2018-01-01 00:11:00    215
2018-01-01 00:12:00    428
2018-01-01 00:13:00    473
2018-01-01 00:14:00    252
2018-01-01 00:15:00    444
2018-01-01 00:16:00     85
2018-01-01 00:17:00    158
2018-01-01 00:18:00    448
2018-01-01 00:19:00    183
2018-01-01 00:20:00    282
2018-01-01 00:21:00     34
2018-01-01 00:22:00    145
2018-01-01 00:23:00    278
2018-01-01 00:24:00     75
2018-01-01 00:25:00    111
2018-01-01 00:26:00    334
2018-01-01 00:27:00    379
2018-01-01 00:28:00    307
2018-01-01 00:29:00    194
                      ... 
2018-01-01 01:10:00    253
2018-01-01 01:11:00    110
2018-01-01 01:12:00    351
2018-01-01 01:13:00    447
2018-01-01 01:14:00    234
2018-01-01 01:15:00    475
2

In [83]:
ts.resample('10Min').sum()

2018-01-01 00:00:00    2888
2018-01-01 00:10:00    2702
2018-01-01 00:20:00    2139
2018-01-01 00:30:00    2558
2018-01-01 00:40:00    2545
2018-01-01 00:50:00    2732
2018-01-01 01:00:00    2342
2018-01-01 01:10:00    3213
2018-01-01 01:20:00    2599
2018-01-01 01:30:00    3112
Freq: 10T, dtype: int32

In [84]:
ts.resample('Min').sum()

2018-01-01 00:00:00    108
2018-01-01 00:01:00    438
2018-01-01 00:02:00    148
2018-01-01 00:03:00     60
2018-01-01 00:04:00     22
2018-01-01 00:05:00    443
2018-01-01 00:06:00    437
2018-01-01 00:07:00    427
2018-01-01 00:08:00    308
2018-01-01 00:09:00    497
2018-01-01 00:10:00     16
2018-01-01 00:11:00    215
2018-01-01 00:12:00    428
2018-01-01 00:13:00    473
2018-01-01 00:14:00    252
2018-01-01 00:15:00    444
2018-01-01 00:16:00     85
2018-01-01 00:17:00    158
2018-01-01 00:18:00    448
2018-01-01 00:19:00    183
2018-01-01 00:20:00    282
2018-01-01 00:21:00     34
2018-01-01 00:22:00    145
2018-01-01 00:23:00    278
2018-01-01 00:24:00     75
2018-01-01 00:25:00    111
2018-01-01 00:26:00    334
2018-01-01 00:27:00    379
2018-01-01 00:28:00    307
2018-01-01 00:29:00    194
                      ... 
2018-01-01 01:10:00    253
2018-01-01 01:11:00    110
2018-01-01 01:12:00    351
2018-01-01 01:13:00    447
2018-01-01 01:14:00    234
2018-01-01 01:15:00    475
2

In [95]:
# 월에 대한 date range의 날짜를 마지막날에서 1일로 바꾸기
# 날짜 데이터를 삭제한후에 다시 생성
rng = pd.date_range('1/1/2018', periods=5, freq='M')
rng

DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
               '2018-05-31'],
              dtype='datetime64[ns]', freq='M')

In [96]:
ts = pd.Series(np.random.randint(0,10,len(rng)),index=rng)
ts

2018-01-31    2
2018-02-28    3
2018-03-31    8
2018-04-30    6
2018-05-31    6
Freq: M, dtype: int32

In [97]:
ps = ts.to_period()
ps

2018-01    2
2018-02    3
2018-03    8
2018-04    6
2018-05    6
Freq: M, dtype: int32

In [98]:
ps.to_timestamp()

2018-01-01    2
2018-02-01    3
2018-03-01    8
2018-04-01    6
2018-05-01    6
Freq: MS, dtype: int32

In [100]:
titanic = pd.read_csv('train.csv')
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [106]:
print(titanic.index, titanic.shape)
titanic.columns

RangeIndex(start=0, stop=891, step=1) (891, 12)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [111]:
titanic_f1 = pd.DataFrame(titanic, columns=['Pclass','Sex'])
titanic_f1 = titanic.groupby(["Sex","Pclass"]).size().reset_index(name='Counts')
titanic_f1

Unnamed: 0,Sex,Pclass,Counts
0,female,1,94
1,female,2,76
2,female,3,144
3,male,1,122
4,male,2,108
5,male,3,347


In [110]:
titanic_f1.pivot('Sex','Pclass','Counts')

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,94,76,144
male,122,108,347


In [112]:
titanic_f1.pivot('Sex','Counts','Pclass')

Counts,76,94,108,122,144,347
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,2.0,1.0,,,3.0,
male,,,2.0,1.0,,3.0


In [115]:
titanic_f1.groupby('Sex').agg('count').reset_index()

Unnamed: 0,Sex,Pclass,Counts
0,female,3,3
1,male,3,3


In [116]:
titanic.groupby('Sex').size().reset_index()

Unnamed: 0,Sex,0
0,female,314
1,male,577


In [117]:
titanic.groupby('Pclass').size().reset_index()

Unnamed: 0,Pclass,0
0,1,216
1,2,184
2,3,491


In [118]:
titanic.groupby(['Pclass','Sex']).size().reset_index()

Unnamed: 0,Pclass,Sex,0
0,1,female,94
1,1,male,122
2,2,female,76
3,2,male,108
4,3,female,144
5,3,male,347


In [123]:
titanic_f2 = pd.DataFrame(titanic)
titanic_f2 = titanic.groupby(["Pclass","Survived"]).size().reset_index(name='Counts')
titanic_f2.tail()

Unnamed: 0,Pclass,Survived,Counts
1,1,1,136
2,2,0,97
3,2,1,87
4,3,0,372
5,3,1,119


In [124]:
titanic_f2.pivot('Pclass','Survived','Counts')

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


In [125]:
titanic_f2.pivot('Survived','Pclass','Counts')

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,80,97,372
1,136,87,119


In [144]:
titanic_f3 = titanic.groupby(['Survived']).agg('sum').reset_index()

In [156]:
titanic_f3

Unnamed: 0,Survived,PassengerId,Pclass,Age,SibSp,Parch,Fare,Count
0,0,245412,1390,12985.5,304,181,12142.7199,549
1,1,151974,667,8219.67,162,159,16551.2294,342


In [160]:
# titanic_f3[['Count','Survived']]
titanic_f3.loc[:,['Count','Survived']]

Unnamed: 0,Count,Survived
0,549,0
1,342,1


In [131]:
titanic_f3.pivot('Survived','Sex')

Unnamed: 0_level_0,Counts,Counts
Sex,female,male
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2
0,81,468
1,233,109


In [143]:
titanic_f3 = pd.DataFrame(titanic)
titanic_f3["Count"] = 1
titanic_f3.pivot_table(values='Count',index=['Sex'],columns=['Pclass'],aggfunc=np.sum)


Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,94,76,144
male,122,108,347


In [162]:
tmp = titanic.groupby(['Sex','Survived']).size().reset_index(name='Counts')
tmp

Unnamed: 0,Sex,Survived,Counts
0,female,0,81
1,female,1,233
2,male,0,468
3,male,1,109


In [165]:
df = tmp.pivot('Survived','Sex','Counts')
df

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [167]:
# 열추가
df['total'] = df['female'] + df['male']
df

Sex,female,male,total
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,81,468,549
1,233,109,342


In [168]:
# 행추가
df.loc['TOTAL'] = df.loc[0] + df.loc[1]
df

Sex,female,male,total
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,81,468,549
1,233,109,342
TOTAL,314,577,891
