# Series

In [2]:
import pandas as pd
from pandas import Series

In [3]:
obj = Series([3,6,9,12])
obj

0     3
1     6
2     9
3    12
dtype: int64

Seriesの特徴は、インデックスが振られていること（↑の0,1,2,3）

In [5]:
obj.values
# array型で取り出せる

array([ 3,  6,  9, 12])

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
ww2_cas = Series([870000, 43000, 30000, 2100, 40000], index=['USA','Germany','China','Japan','USA'])
ww2_cas

USA        870000
Germany     43000
China       30000
Japan        2100
USA         40000
dtype: int64

In [10]:
ww2_cas['USA']

USA    870000
USA     40000
dtype: int64

In [11]:
ww2_cas[ww2_cas > 400000]

USA    870000
dtype: int64

In [13]:
# これでTrue/Falseが帰るので、それに基づいて↑している
ww2_cas>400000

USA         True
Germany    False
China      False
Japan      False
USA        False
dtype: bool

In [14]:
'USSR' in ww2_cas

False

In [15]:
'USA' in ww2_cas

True

In [16]:
# 辞書型を作る
ww2_dict = ww2_cas.to_dict()

In [17]:
ww2_dict

{'China': 30000, 'Germany': 43000, 'Japan': 2100, 'USA': 40000}

In [18]:
ww2_Series = Series(ww2_dict)
ww2_Series

China      30000
Germany    43000
Japan       2100
USA        40000
dtype: int64

In [19]:
countries = ['China', 'Germany', 'Japan', 'USA', 'USSR', 'Argentina']
countries

['China', 'Germany', 'Japan', 'USA', 'USSR', 'Argentina']

In [20]:
obj2 = Series(ww2_dict, index=countries)
obj2

China        30000.0
Germany      43000.0
Japan         2100.0
USA          40000.0
USSR             NaN
Argentina        NaN
dtype: float64

In [21]:
pd.isnull(obj2)

China        False
Germany      False
Japan        False
USA          False
USSR          True
Argentina     True
dtype: bool

In [22]:
pd.notnull(obj2)

China         True
Germany       True
Japan         True
USA           True
USSR         False
Argentina    False
dtype: bool

In [23]:
ww2_Series

China      30000
Germany    43000
Japan       2100
USA        40000
dtype: int64

In [24]:
obj2

China        30000.0
Germany      43000.0
Japan         2100.0
USA          40000.0
USSR             NaN
Argentina        NaN
dtype: float64

In [25]:
ww2_Series + obj2

Argentina        NaN
China        60000.0
Germany      86000.0
Japan         4200.0
USA          80000.0
USSR             NaN
dtype: float64

足し合わせると、インデックスが揃っていると足し合わせてくれる

In [26]:
# Seriesに名前をつける
obj2.name = '第二次世界大戦の死傷者'
obj2

China        30000.0
Germany      43000.0
Japan         2100.0
USA          40000.0
USSR             NaN
Argentina        NaN
Name: 第二次世界大戦の死傷者, dtype: float64

In [27]:
# インデックスに名前をつける
obj2.index.name = 'Countries'
obj2

Countries
China        30000.0
Germany      43000.0
Japan         2100.0
USA          40000.0
USSR             NaN
Argentina        NaN
Name: 第二次世界大戦の死傷者, dtype: float64

# データフレーム

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# データフレームの作り方
# コピペで作る (https://en.wikipedia.org/wiki/NFL_win%E2%80%93loss_records)

nfl_frame = pd.read_clipboard()

In [4]:
tmp = nfl_frame.copy()

In [5]:
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First,NFL,Season,Total,Games,Division
0,1,Dallas,Cowboys,493,367,6,0.573,1960.0,866,NFC,East,
1,2,Green,Bay,Packers,730,553,37.0,0.567,1921,1320,NFC,North
2,3,Chicago,Bears,744,568,42,0.565,1920.0,1354,NFC,North,
3,4,Miami,Dolphins,439,341,4,0.563,1966.0,784,AFC,East,
4,5,New,England,Patriots[b],476,383,9.0,0.554,1960,868,AFC,East
5,6,New,York,Giants,684,572,33.0,0.543,1925,1289,NFC,East


In [8]:
# 列名を取得
nfl_frame.columns

Index(['Rank', 'Team', 'Won', 'Lost', 'Tied', 'Pct.', 'First', 'NFL', 'Season',
       'Total', 'Games', 'Division'],
      dtype='object')

In [11]:
# ある列の値を取得
nfl_frame['Season']

0      866
1     1921
2    1,354
3      784
4     1960
5     1925
Name: Season, dtype: object

In [13]:
# ある列の値を取得2
nfl_frame.Season

0      866
1     1921
2    1,354
3      784
4     1960
5     1925
Name: Season, dtype: object

In [17]:
# 複数列の値を取得(参照)
nfl_frame[['Team', 'Season']]

Unnamed: 0,Team,Season
0,Dallas,866
1,Green,1921
2,Chicago,1354
3,Miami,784
4,New,1960
5,New,1925


In [18]:
# 新しいデータフレームを作成
DataFrame(nfl_frame, columns=['Team', 'Season'])

Unnamed: 0,Team,Season
0,Dallas,866
1,Green,1921
2,Chicago,1354
3,Miami,784
4,New,1960
5,New,1925


In [20]:
# 存在しない列名を指定した場合の挙動
DataFrame(nfl_frame, columns=['Team', 'Season', 'Stadium'])

Unnamed: 0,Team,Season,Stadium
0,Dallas,866,
1,Green,1921,
2,Chicago,1354,
3,Miami,784,
4,New,1960,
5,New,1925,


In [23]:
# 先頭のn行を取り出す（デフォルト5行）
nfl_frame.head(2)

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First,NFL,Season,Total,Games,Division
0,1,Dallas,Cowboys,493,367,6,0.573,1960.0,866,NFC,East,
1,2,Green,Bay,Packers,730,553,37.0,0.567,1921,1320,NFC,North


In [26]:
# 最後のn行を取得
nfl_frame.tail()

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First,NFL,Season,Total,Games,Division
1,2,Green,Bay,Packers,730,553,37.0,0.567,1921,1320,NFC,North
2,3,Chicago,Bears,744,568,42,0.565,1920.0,1354,NFC,North,
3,4,Miami,Dolphins,439,341,4,0.563,1966.0,784,AFC,East,
4,5,New,England,Patriots[b],476,383,9.0,0.554,1960,868,AFC,East
5,6,New,York,Giants,684,572,33.0,0.543,1925,1289,NFC,East


In [29]:
# indexをつかってデータにアクセスする
nfl_frame.ix[3]

Rank               4
Team           Miami
Won         Dolphins
Lost             439
Tied             341
Pct.               4
First          0.563
NFL             1966
Season           784
Total            AFC
Games           East
Division         NaN
Name: 3, dtype: object

In [30]:
# 列全体に値を代入
nfl_frame['Stadium'] = "Levi's stadium"

In [31]:
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First,NFL,Season,Total,Games,Division,Stadium
0,1,Dallas,Cowboys,493,367,6,0.573,1960.0,866,NFC,East,,Levi's stadium
1,2,Green,Bay,Packers,730,553,37.0,0.567,1921,1320,NFC,North,Levi's stadium
2,3,Chicago,Bears,744,568,42,0.565,1920.0,1354,NFC,North,,Levi's stadium
3,4,Miami,Dolphins,439,341,4,0.563,1966.0,784,AFC,East,,Levi's stadium
4,5,New,England,Patriots[b],476,383,9.0,0.554,1960,868,AFC,East,Levi's stadium
5,6,New,York,Giants,684,572,33.0,0.543,1925,1289,NFC,East,Levi's stadium


In [35]:
nfl_frame['Stadium'] = np.arange(6)

In [33]:
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First,NFL,Season,Total,Games,Division,Stadium
0,1,Dallas,Cowboys,493,367,6,0.573,1960.0,866,NFC,East,,0
1,2,Green,Bay,Packers,730,553,37.0,0.567,1921,1320,NFC,North,1
2,3,Chicago,Bears,744,568,42,0.565,1920.0,1354,NFC,North,,2
3,4,Miami,Dolphins,439,341,4,0.563,1966.0,784,AFC,East,,3
4,5,New,England,Patriots[b],476,383,9.0,0.554,1960,868,AFC,East,4
5,6,New,York,Giants,684,572,33.0,0.543,1925,1289,NFC,East,5


In [40]:
# Seriesを作ってDataframeに追加
stadiums = Series(["Levi's stadium", "AT&T Stadium"], index=[4,0])
stadiums

4    Levi's stadium
0      AT&T Stadium
dtype: object

In [56]:
nfl_frame['Stadium'] = stadiums

In [57]:
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First,NFL,Season,Total,Games,Division,Stadium
0,1,Dallas,Cowboys,493,367,6,0.573,1960.0,866,NFC,East,,AT&T Stadium
1,2,Green,Bay,Packers,730,553,37.0,0.567,1921,1320,NFC,North,
2,3,Chicago,Bears,744,568,42,0.565,1920.0,1354,NFC,North,,
3,4,Miami,Dolphins,439,341,4,0.563,1966.0,784,AFC,East,,
4,5,New,England,Patriots[b],476,383,9.0,0.554,1960,868,AFC,East,Levi's stadium
5,6,New,York,Giants,684,572,33.0,0.543,1925,1289,NFC,East,


In [58]:
# 列を削除
del nfl_frame['Stadium']

In [59]:
nfl_frame

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First,NFL,Season,Total,Games,Division
0,1,Dallas,Cowboys,493,367,6,0.573,1960.0,866,NFC,East,
1,2,Green,Bay,Packers,730,553,37.0,0.567,1921,1320,NFC,North
2,3,Chicago,Bears,744,568,42,0.565,1920.0,1354,NFC,North,
3,4,Miami,Dolphins,439,341,4,0.563,1966.0,784,AFC,East,
4,5,New,England,Patriots[b],476,383,9.0,0.554,1960,868,AFC,East
5,6,New,York,Giants,684,572,33.0,0.543,1925,1289,NFC,East


In [60]:
# DataFrameを辞書から作成
data = {'City':['SF', 'LA', 'NYC'], 'Population':[837000, 38800, 89234]}

In [61]:
data

{'City': ['SF', 'LA', 'NYC'], 'Population': [837000, 38800, 89234]}

In [62]:
city_frame = DataFrame(data)
city_frame

Unnamed: 0,City,Population
0,SF,837000
1,LA,38800
2,NYC,89234


# indexの基本


In [63]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [65]:
my_ser = Series([1, 2, 3, 4],  index=['A','B','C','D'])
my_ser

A    1
B    2
C    3
D    4
dtype: int64

In [67]:
# インデックスを取り出す
my_index = my_ser.index
my_index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [68]:
my_index[0]

'A'

In [69]:
my_index[2]

'C'

In [70]:
my_index[1:3]

Index(['B', 'C'], dtype='object')

In [72]:
# インデックスは書き換え不可
my_index[0] = 'Z'

TypeError: Index does not support mutable operations

# indexを変える

In [77]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from numpy.random import randn

In [78]:
ser1 = Series([1,2,3,4], index=['A','B','C','D'])
ser1

A    1
B    2
C    3
D    4
dtype: int64

In [80]:
# indexの付け替え
ser2 = ser1.reindex(['A', 'B', 'C', 'D', 'E', 'F'])
ser2

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
dtype: float64

In [81]:
ser1

A    1
B    2
C    3
D    4
dtype: int64

In [84]:
# 新たに追加されたindexにnullなら0を埋める
ser2.reindex(['A', 'B', 'C', 'D', 'E', 'F', 'G'], fill_value=0)

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
G    0.0
dtype: float64

In [85]:
ser2

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
dtype: float64

In [86]:
ser3 = Series(['USA', 'Mexico', 'Canada'], index=[0, 5, 10])
ser3

0        USA
5     Mexico
10    Canada
dtype: object

In [90]:
# 他の埋め方
ser3.reindex(range(15), method='ffill') #ffill = forward fill

0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

In [104]:
# DataFrameのindex
dframe = DataFrame(randn(25).reshape((5,5)), index=['A', 'B', 'D', 'E', 'F'], columns=['col1', 'col2', 'col3', 'col4', 'col5'])
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,0.027091,0.548306,-1.126959,-0.947024,0.485461
B,1.856645,0.528307,-1.425594,-2.482745,0.827969
D,0.671427,-0.479438,-1.417219,-1.36958,-0.768597
E,-0.351547,1.351554,-0.678834,-0.864964,0.755994
F,-0.065292,0.093153,-0.386728,0.665143,-0.445112


In [105]:
new_index = ['A', 'B', 'C', 'D', 'E', 'F']
dframe2 = dframe.reindex(new_index, fill_value=0)
dframe2

Unnamed: 0,col1,col2,col3,col4,col5
A,0.027091,0.548306,-1.126959,-0.947024,0.485461
B,1.856645,0.528307,-1.425594,-2.482745,0.827969
C,0.0,0.0,0.0,0.0,0.0
D,0.671427,-0.479438,-1.417219,-1.36958,-0.768597
E,-0.351547,1.351554,-0.678834,-0.864964,0.755994
F,-0.065292,0.093153,-0.386728,0.665143,-0.445112


In [107]:
new_columns = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6']
dframe2.reindex(columns=new_columns)

Unnamed: 0,col1,col2,col3,col4,col5,col6
A,0.027091,0.548306,-1.126959,-0.947024,0.485461,
B,1.856645,0.528307,-1.425594,-2.482745,0.827969,
C,0.0,0.0,0.0,0.0,0.0,
D,0.671427,-0.479438,-1.417219,-1.36958,-0.768597,
E,-0.351547,1.351554,-0.678834,-0.864964,0.755994,
F,-0.065292,0.093153,-0.386728,0.665143,-0.445112,


In [110]:
# 素早く行と列のindexを変更する方法
dframe.ix(new_index, new_columns)

TypeError: __call__() takes from 1 to 2 positional arguments but 3 were given

In [109]:
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,0.027091,0.548306,-1.126959,-0.947024,0.485461
B,1.856645,0.528307,-1.425594,-2.482745,0.827969
D,0.671427,-0.479438,-1.417219,-1.36958,-0.768597
E,-0.351547,1.351554,-0.678834,-0.864964,0.755994
F,-0.065292,0.093153,-0.386728,0.665143,-0.445112


# 行や列を削除する

In [111]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [112]:
ser1 = Series(np.arange(3), index=['a', 'b', 'c'])
ser1

a    0
b    1
c    2
dtype: int64

In [114]:
ser1.drop('b')

a    0
c    2
dtype: int64

In [115]:
ser1

a    0
b    1
c    2
dtype: int64

In [116]:
dframe1 = DataFrame(np.arange(9).reshape((3,3)), index=['SF', 'LA', 'NYC'], columns=['pop', 'year', 'size'])
dframe1

Unnamed: 0,pop,year,size
SF,0,1,2
LA,3,4,5
NYC,6,7,8


In [119]:
# 行の削除
dframe1.drop('LA')

Unnamed: 0,pop,year,size
SF,0,1,2
NYC,6,7,8


In [123]:
# 列の削除
dframe1.drop('year', axis=1) # axisはデフォルト0(行)

Unnamed: 0,pop,size
SF,0,2
LA,3,5
NYC,6,8


# データを取り出す

In [124]:
ser1 = Series(np.arange(3), index=['A', 'B', 'C'])
ser1

A    0
B    1
C    2
dtype: int64

In [125]:
ser1 = 2 * ser1
ser1

A    0
B    2
C    4
dtype: int64

In [127]:
# インデックスを使ってデータにアクセスできる
ser1['B']

2

In [128]:
# 添え字でアクセスすることも可能
ser1[1]

2

In [130]:
ser1[1:3]

B    2
C    4
dtype: int64

In [132]:
ser1[[1,2]]

B    2
C    4
dtype: int64

In [134]:
# 条件式で絞ることも可能
ser1[ser1 > 3]

C    4
dtype: int64

In [137]:
# 条件式であった部分の値を変更することもできる
ser1[ser1>3] = 10
ser1

A     0
B     2
C    10
dtype: int64

In [141]:
dframe = DataFrame(np.arange(25).reshape((5,5)), 
                  index=['NYC', 'SF', 'WDC', 'LA', 'Chi'], columns=['A','B','C','D','E'])

In [142]:
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
SF,5,6,7,8,9
WDC,10,11,12,13,14
LA,15,16,17,18,19
Chi,20,21,22,23,24


In [143]:
dframe['B']

NYC     1
SF      6
WDC    11
LA     16
Chi    21
Name: B, dtype: int64

In [144]:
dframe[['B', 'C']]

Unnamed: 0,B,C
NYC,1,2
SF,6,7
WDC,11,12
LA,16,17
Chi,21,22


In [148]:
dframe.drop(['B', 'C'], axis=1)

Unnamed: 0,A,D,E
NYC,0,3,4
SF,5,8,9
WDC,10,13,14
LA,15,18,19
Chi,20,23,24


In [150]:
# 論理式を与えることも可能
dframe[dframe['C'] > 8]

Unnamed: 0,A,B,C,D,E
WDC,10,11,12,13,14
LA,15,16,17,18,19
Chi,20,21,22,23,24


In [152]:
dframe>10

Unnamed: 0,A,B,C,D,E
NYC,False,False,False,False,False
SF,False,False,False,False,False
WDC,False,True,True,True,True
LA,True,True,True,True,True
Chi,True,True,True,True,True


In [153]:
dframe.ix['LA']

A    15
B    16
C    17
D    18
E    19
Name: LA, dtype: int64

In [155]:
dframe.ix['LA', 'A']

15

In [156]:
dframe.ix[1]

A    5
B    6
C    7
D    8
E    9
Name: SF, dtype: int64

# 形の違うデータの計算

In [157]:
ser1 = Series([0, 1, 2], index=['A', 'B', 'C'])
ser1

A    0
B    1
C    2
dtype: int64

In [160]:
ser2 = Series([3,4,5,6], index=['A', 'B', 'C', 'D'])
ser2

A    3
B    4
C    5
D    6
dtype: int64

In [161]:
ser1 + ser2

A    3.0
B    5.0
C    7.0
D    NaN
dtype: float64

In [165]:
# DataFrameでも試してみる
dframe1 = DataFrame(np.arange(4).reshape((2,2)), columns=list('AB'), index=['NYC', 'LA'])
dframe1

Unnamed: 0,A,B
NYC,0,1
LA,2,3


In [164]:
list('AB')

['A', 'B']

In [168]:
dframe2 = DataFrame(np.arange(9).reshape((3,3)), columns=list('ADC'), index=['NYC', 'LA', 'SF'])
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
LA,3,4,5
SF,6,7,8


In [169]:
dframe1 + dframe2

Unnamed: 0,A,B,C,D
LA,5.0,,,
NYC,0.0,,,
SF,,,,


In [171]:
# 片方がnullだと計算できていない。これをNaNになることを避けるには
dframe1.add(dframe2, fill_value=0)

Unnamed: 0,A,B,C,D
LA,5.0,3.0,5.0,4.0
NYC,0.0,1.0,2.0,1.0
SF,6.0,,8.0,7.0


B列のSFはどこにも存在しないのでnullのまま

In [172]:
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
LA,3,4,5
SF,6,7,8


In [174]:
# DataFrameからSeriesを取り出す
ser3 = dframe2.ix[0]
ser3

A    0
D    1
C    2
Name: NYC, dtype: int64

In [176]:
dframe2.ix[:,1]

NYC    1
LA     4
SF     7
Name: D, dtype: int64

In [177]:
# DataFrameとSeriesの計算
dframe2 - ser3

Unnamed: 0,A,D,C
NYC,0,0,0
LA,3,3,3
SF,6,6,6


# データの並べ替えと順番

In [178]:
ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1

C    0
A    1
B    2
dtype: int64

In [180]:
# indexを使ったソート
ser1.sort_index()

A    1
B    2
C    0
dtype: int64

In [182]:
# valueを使ったソート
ser1.sort_values()

C    0
A    1
B    2
dtype: int64

In [183]:
ser2 = Series(np.random.randn(10))
ser2

0    0.238515
1    1.033618
2   -1.298732
3   -0.125731
4    1.231447
5    0.466724
6    0.198739
7    1.218610
8   -1.525546
9   -1.050708
dtype: float64

In [184]:
ser2.rank()

0     6.0
1     8.0
2     2.0
3     4.0
4    10.0
5     7.0
6     5.0
7     9.0
8     1.0
9     3.0
dtype: float64

In [185]:
ser2.sort_values()

8   -1.525546
2   -1.298732
9   -1.050708
3   -0.125731
6    0.198739
0    0.238515
5    0.466724
1    1.033618
7    1.218610
4    1.231447
dtype: float64

# データと統計量

In [6]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [8]:
arr = np.array([[1,2,np.nan], [np.nan, 3,4]])
arr

array([[  1.,   2.,  nan],
       [ nan,   3.,   4.]])

In [11]:
dframe = DataFrame(arr, index=['A', 'B'], columns=['one', 'two', 'three'])
dframe

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,3.0,4.0


In [12]:
# 列ごとの合計値をnullを無視して算出
dframe.sum()

one      1.0
two      5.0
three    4.0
dtype: float64

In [13]:
# 行方向に計算
dframe.sum(axis=1)

A    3.0
B    7.0
dtype: float64

In [14]:
dframe.min()

one      1.0
two      2.0
three    4.0
dtype: float64

In [18]:
# どのindex
dframe.idxmin()

one      A
two      A
three    B
dtype: object

In [19]:
dframe.idxmax()

one      A
two      B
three    B
dtype: object

In [20]:
dframe

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,3.0,4.0


In [21]:
# 累積の算出
dframe.cumsum()

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,5.0,4.0


In [27]:
dframe['test'] = dframe['two'].cumsum()

In [28]:
dframe

Unnamed: 0,one,two,three,test
A,1.0,2.0,,2.0
B,,3.0,4.0,5.0


In [30]:
del dframe['test']

In [31]:
dframe

Unnamed: 0,one,two,three
A,1.0,2.0,
B,,3.0,4.0


In [32]:
dframe.describe()



Unnamed: 0,one,two,three
count,1.0,2.0,1.0
mean,1.0,2.5,4.0
std,,0.707107,
min,1.0,2.0,4.0
25%,,2.25,
50%,,2.5,
75%,,2.75,
max,1.0,3.0,4.0


In [34]:
# 株価のデータを使う
import pandas.io.data as pdweb

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


In [35]:
import datetime

In [46]:
price = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                             start=datetime.datetime(2010,1,1),
                             end=datetime.datetime(2013,1,1))['Adj Close']



RemoteDataError: No data fetched using '_get_hist_yahoo'

In [47]:
# 株価の変化
rets = price.pct_change()

NameError: name 'prices' is not defined

In [48]:
import seaborn as sns
import matplotlib.pyplot as plt

In [50]:
sns.heatmap(rets.corr())

NameError: name 'rets' is not defined

In [51]:
# 重複データのカウント／削除
ser1 = Series(['w', 'w', 'x', 'y', 'z', 'a', 'z'])
ser1

0    w
1    w
2    x
3    y
4    z
5    a
6    z
dtype: object

In [52]:
ser1.unique()

array(['w', 'x', 'y', 'z', 'a'], dtype=object)

In [53]:
ser1.value_counts()

z    2
w    2
y    1
x    1
a    1
dtype: int64

# 欠損値の扱い

In [1]:
import numpy as np
from numpy import nan
import pandas as pd
from pandas import Series, DataFrame

In [2]:
data = Series(['one', 'two', nan, 'four'])
data

0     one
1     two
2     NaN
3    four
dtype: object

In [4]:
# どこに欠損値があるかわかる
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# 欠損値を取り除く
data.dropna()

0     one
1     two
3    four
dtype: object

In [7]:
dframe = DataFrame([[1,2,3], [nan,5,6], [7,nan,9], [nan,nan,nan]])
dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [8]:
# 一つでもnanが含まれている行は削除される
dframe.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [10]:
# すべてが欠損値の行だけが消える
dframe.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [13]:
# 列方向に1つ以上Nanがあれば削除
dframe.dropna(axis=1)

0
1
2
3


In [29]:
dframe2 = DataFrame([[1,2,3,nan], [2,nan,5,6], [nan,7,nan,9], [1,nan,nan,nan]])
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [17]:
# 欠損値がないデータが2個以上あれば残す
dframe2.dropna(thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [30]:
dframe2.dropna(thresh=3)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0


In [31]:
# 欠損値を別のデータで埋める
dframe2.fillna(1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,1.0
1,2.0,1.0,5.0,6.0
2,1.0,7.0,1.0,9.0
3,1.0,1.0,1.0,1.0


In [21]:
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [32]:
# 列ごとに欠損値を何で埋めるかを指定できる
dframe2.fillna({0:0, 1:1, 2:2, 3:3})

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,2.0,1.0,5.0,6.0
2,0.0,7.0,2.0,9.0
3,1.0,1.0,2.0,3.0


In [33]:
dframe2.fillna(0, inplace=True)

In [34]:
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,0.0
1,2.0,0.0,5.0,6.0
2,0.0,7.0,0.0,9.0
3,1.0,0.0,0.0,0.0


# indexの階層構造

In [35]:
from numpy.random import randn

In [36]:
ser = Series(np.random.randn(6), index=[[1,1,1,2,2,2], ['a','b','c','a','b','c']])

In [37]:
ser

1  a   -0.348236
   b    0.834571
   c    0.076451
2  a    0.081690
   b   -0.662194
   c    0.650114
dtype: float64

In [38]:
ser.index

MultiIndex(levels=[[1, 2], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [39]:
ser[1]

a   -0.348236
b    0.834571
c    0.076451
dtype: float64

In [40]:
ser[1]['a']

-0.34823605953620645

In [42]:
ser[:,'a']

1   -0.348236
2    0.081690
dtype: float64

In [43]:
dframe = ser.unstack()

In [44]:
dframe

Unnamed: 0,a,b,c
1,-0.348236,0.834571,0.076451
2,0.08169,-0.662194,0.650114


In [45]:
dframe.unstack()

a  1   -0.348236
   2    0.081690
b  1    0.834571
   2   -0.662194
c  1    0.076451
   2    0.650114
dtype: float64

In [46]:
dframe.T.unstack()

1  a   -0.348236
   b    0.834571
   c    0.076451
2  a    0.081690
   b   -0.662194
   c    0.650114
dtype: float64

In [47]:
dframe2 = DataFrame(np.arange(16).reshape((4,4)),
                   index=[['a','a','b','b'], [1,2,1,2]],
                   columns=[['NY','NY', 'LA', 'SF'],['cold','hot','hot','cold']])

In [48]:
dframe2

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [49]:
dframe2.index.names = ['INDEX_1', 'INDEX_2']

In [50]:
dframe2

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [51]:
dframe2.columns.names = ['Cities', 'Temp']

In [52]:
dframe2

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [53]:
dframe2.swaplevel('Cities', 'Temp', axis=1)

Unnamed: 0_level_0,Temp,cold,hot,hot,cold
Unnamed: 0_level_1,Cities,NY,NY,LA,SF
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [54]:
# 行方向についてレベル1の列をソート数
dframe2.sortlevel(1)

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
b,1,8,9,10,11
a,2,4,5,6,7
b,2,12,13,14,15


In [55]:
dframe2.sortlevel(0)

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [56]:
dframe2.sum(level='Temp', axis=1)

Unnamed: 0_level_0,Temp,cold,hot
INDEX_1,INDEX_2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3,3
a,2,11,11
b,1,19,19
b,2,27,27
