# Chapter 5. Getting Started with pandas 

In [3]:
import pandas as pd
import numpy as np

In [18]:
# Series
se=pd.Series(np.random.randn(10),index=np.arange(10))
print(se)
print(type(se.values))
print(type(se.index))

0   -0.940579
1   -1.785186
2   -0.016452
3    0.920159
4   -1.524652
5    0.090834
6    1.364760
7   -0.615303
8   -1.162480
9   -0.738631
dtype: float64
<class 'numpy.ndarray'>
<class 'pandas.core.indexes.numeric.Int64Index'>


In [21]:
# Dataframe
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame=pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [38]:
# 列の取得
year=frame.year
print(year==frame['year'])
year

# 行の取得
frame.loc[0]

# 列の追加
frame['debt']=np.arange(len(year))
frame

0    True
1    True
2    True
3    True
4    True
5    True
Name: year, dtype: bool


Unnamed: 0,pop,state,year,debt
0,1.5,Ohio,2000,0
1,1.7,Ohio,2001,1
2,3.6,Ohio,2002,2
3,2.4,Nevada,2001,3
4,2.9,Nevada,2002,4
5,3.2,Nevada,2003,5


In [39]:
# 列の削除
del frame['debt']
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [75]:
# ネストしたdictを設定すると、外のkeyが列、内側のkeyが行となる
pop = {'Nevada': {2001: 2.4, 2002: 2.9},  'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
pop=pd.DataFrame(pop)

In [79]:
# locは名前で指定、ilocは番号で指定

#  番号指定でnumpyのようにとりだせる
pop.iloc[:2,:2]
# 名前指定で取り出す
pop.loc[:2002,'Ohio']

2000    1.5
2001    1.7
2002    3.6
Name: Ohio, dtype: float64

In [98]:
# 関数適用
frame = pd.DataFrame(np.random.randn(12).reshape((4, 3)),  columns=list('bde'),  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

# apply(デフォルトは縦軸)
print(frame.apply(lambda x: max(x)-min(x)))

# scalarだけでなく、以下のようにSeriseを返すことも可能
def f(x):
    return pd.Series([x.min(),x.max()],index=['min','max'])

frame.apply(f)

# 要素全てに関数を適用
formata=lambda x: '%.2f' %x

frame.applymap(formata)

# 列単位での適用
frame['b'].map(formata)

b    2.86589
d    3.32413
e    1.13952
dtype: float64


Utah       1.11
Ohio       0.33
Texas      1.20
Oregon    -1.67
Name: b, dtype: object

In [99]:
# 統計データの一覧
frame.describe()

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,0.24228,-0.73812,-0.444606
std,1.332802,1.444434,0.516351
min,-1.670587,-2.455005,-1.19414
25%,-0.167012,-1.576924,-0.574605
50%,0.722203,-0.683301,-0.264831
75%,1.131494,0.155503,-0.134832
max,1.195302,0.869125,-0.054621


In [136]:
# テストデータを取得 TODO!
import pandas_datareader.data as web

all_data={ticker: web.get_data_stooq(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

for key,data in  all_data.items():
    print(data)
    

Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []


In [55]:
rand=np.random.randn(3,3)
rand2=np.random.randn(3,3)
%time (rand @ rand2 for  i in range(10000) )
%time (np.dot(rand2,rand2) for i  in range(10000))

CPU times: user 16 µs, sys: 2 µs, total: 18 µs
Wall time: 28.4 µs
CPU times: user 16 µs, sys: 2 µs, total: 18 µs
Wall time: 27.9 µs


<generator object <genexpr> at 0x10e011780>