## Chap2 科学計算、データ加工、グラフ描画ライブラリの使い方の基礎

* Numpy
* Scipy
* Pandas
* Matplotlib

In [2]:
# Show IPython's quick reference card, which lists the available magic commands.
%quickref


IPython -- An enhanced Interactive Python - Quick Reference Card

obj?, obj??      : Get help, or more help for object (also works as
                   ?obj, ??obj).
?foo.*abc*       : List names in 'foo' containing 'abc' in them.
%magic           : Information about IPython's 'magic' % functions.

Magic functions are prefixed by % or %%, and typically take their arguments
without parentheses, quotes or even commas for convenience.  Line magics take a
single % and cell magics are prefixed with two %%.

Example magic function calls:

%alias d ls -F   : 'd' is now an alias for 'ls -F'
alias d ls -F    : Works if 'alias' not a python name
alist = %alias   : Get list of aliases to 'alist'
cd /usr/share    : Obvious. cd -<tab> to choose from visited dirs.
%cd??            : See help AND source for magic %cd
%timeit x=10     : time the 'x=10' statement with high precision.
%%timeit x=2**100
x**100           : time 'x**100' with a setup of 'x=2**100'; setup code is not
                   co

In [2]:
# ライブラリのインポート
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame
# 可視化ライブラリ
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Magic commands
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Set the number of digits shown after the decimal point (applies to NumPy output too).
%precision 3

'%.3f'

### 2-2 Numpy

In [20]:
# Ten unsorted integers used as the running example for the NumPy section.
sample_values = [9, 2, 3, 4, 10, 6, 7, 8, 1, 5]
data = np.array(sample_values)
data

array([ 9,  2,  3,  4, 10,  6,  7,  8,  1,  5])

In [8]:
# Inspect the element type NumPy chose for the array.
data.dtype

dtype('int64')

In [9]:
# Report the number of dimensions and the total number of elements of `data`.
for label, value in (("次元数：", data.ndim), ("要素数：", data.size)):
    print(label, value)

次元数： 1
要素数： 10


In [10]:
# Multiply every element by a scalar (the scalar is broadcast over the array).
np.multiply(data, 2)

array([18,  4,  6,  8, 20, 12, 14, 16,  2, 10])

In [17]:
# Arithmetic between same-shaped arrays is applied element by element.
left_operand = np.array([1, 2, 3])
right_operand = np.array([3, 4, 5])
print(left_operand * right_operand)
print(left_operand ** right_operand)
print(left_operand / right_operand)

[ 3  8 15]
[  1  16 243]
[0.333 0.5   0.6  ]


In [21]:
# Show the array before and after sorting.
print(data)
# ndarray.sort() sorts in place (ascending); `data` itself is modified.
data.sort()
print(data)

[ 9  2  3  4 10  6  7  8  1  5]
[ 1  2  3  4  5  6  7  8  9 10]


In [23]:
# The .sort() method modified `data` itself, so it is still sorted here.
data

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [31]:
# Sorting in descending order.
# data[::-1] is a reversed view of `data`; sorting that view in place (ascending)
# leaves the underlying `data` in descending order.
data[::-1].sort()
print(data)

[10  9  8  7  6  5  4  3  2  1]


In [32]:
data

array([10,  9,  8,  7,  6,  5,  4,  3,  2,  1])

In [35]:
# Aggregations computed directly on the array.
smallest, largest = data.min(), data.max()
total = data.sum()
running_total = data.cumsum()
print(smallest)
print(largest)
print(total)
print(running_total)
# Cumulative share of the grand total reached at each position.
print("Ratio =", running_total / total)

1
10
55
[10 19 27 34 40 45 49 52 54 55]
Ratio = [0.182 0.345 0.491 0.618 0.727 0.818 0.891 0.945 0.982 1.   ]


In [44]:
# Random numbers
import numpy.random as random
# Fix the seed of NumPy's global random generator before the draws below.
random.seed(0)

In [45]:
# Normal distribution
# Draw 10 samples from the standard normal distribution (mean 0, variance 1).
n_samples = 10
rnd_data = random.standard_normal(n_samples)
print(rnd_data)

[ 1.764  0.4    0.979  2.241  1.868 -0.977  0.95  -0.151 -0.103  0.411]


In [46]:
# Random sampling from an existing array
data = np.array([9, 2, 3, 4, 10, 6, 7, 8, 1, 5])
sample_size = 10
# Draw 10 elements with replacement (duplicates allowed).
print(random.choice(data, size=sample_size))
# Draw 10 elements without replacement (no duplicates).
print(random.choice(data, size=sample_size, replace=False))

[ 7  8  8  1  2  6  5  1  5 10]
[10  2  7  8  3  1  6  5  9  4]


In [51]:
# Speed comparison (Python list vs NumPy array)
N = 10**7
# Build N random floats as a plain Python list, then copy them into an ndarray.
normal_data = [random.random() for _ in range(N)]
numpy_data = np.array(normal_data)
# Time summing the same values with the built-in sum and with np.sum.
%timeit sum(normal_data)
%timeit np.sum(numpy_data)

37.3 ms ± 913 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
4.15 ms ± 25.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [52]:
# Matrices
# np.arange(9) produces the integers 0 through 8 as a 1-D array.
np.arange(9)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [55]:
# Arrange the integers 0..8 into a 3x3 matrix (filled row by row).
array1 = np.reshape(np.arange(9), (3, 3))
print(array1)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [56]:
# Select the first row (row index 0, all columns).
array1[0,:]

array([0, 1, 2])

In [57]:
# Select the first column (all rows, column index 0).
array1[:,0]

array([0, 3, 6])

In [58]:
# Arrange the integers 9..17 into a second 3x3 matrix.
array2 = np.reshape(np.arange(9, 18), (3, 3))
print(array2)

[[ 9 10 11]
 [12 13 14]
 [15 16 17]]


In [59]:
# Matrix product (rows of array1 times columns of array2)
array1 @ array2

array([[ 42,  45,  48],
       [150, 162, 174],
       [258, 279, 300]])

In [60]:
# Element-wise product (not the matrix product)
np.multiply(array1, array2)

array([[  0,  10,  22],
       [ 36,  52,  70],
       [ 90, 112, 136]])

In [61]:
# Build matrices whose elements are all 0 or all 1.
fill_shape = (2, 3)
print(np.zeros(fill_shape, dtype=np.int64))
print(np.ones(fill_shape, dtype=np.float64))

[[0 0 0]
 [0 0 0]]
[[1. 1. 1.]
 [1. 1. 1.]]


### 2-3 Scipy

In [3]:
import scipy.linalg as linalg
from scipy.optimize import minimize_scalar

In [4]:
# 3x3 example matrix used throughout the SciPy linear-algebra cells.
matrix = np.array([
    [1, -1, -1],
    [-1, 1, -1],
    [-1, -1, 1],
])
print("行列式")
# Determinant of the matrix
determinant = linalg.det(matrix)
print(determinant)

行列式
-4.0


In [5]:
print("逆行列")
# Inverse of `matrix`
inverse_matrix = linalg.inv(matrix)
print(inverse_matrix)

逆行列
[[ 0.  -0.5 -0.5]
 [-0.5 -0.  -0.5]
 [-0.5 -0.5  0. ]]


In [7]:
# A matrix multiplied by its inverse should print as the identity matrix.
print(matrix @ linalg.inv(matrix))

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [8]:
# Eigenvalues
# linalg.eig returns the eigenvalues and a matrix whose columns are the eigenvectors.
# NOTE(review): `matrix` is symmetric; linalg.eigh may be the better fit here
# (it is documented to return real eigenvalues for symmetric input) — confirm before switching.
eig_value,eig_vector = linalg.eig(matrix)

print("固有値")
print(eig_value)
# NOTE(review): the label below reads "eigenvalue vector"; it presumably should be
# "固有ベクトル" (eigenvector) — confirm before changing the printed text.
print("固有値ベクトル")
print(eig_vector)

固有値
[-1.+0.j  2.+0.j  2.+0.j]
固有値ベクトル
[[ 0.577 -0.816  0.428]
 [ 0.577  0.408 -0.816]
 [ 0.577  0.408  0.389]]


In [10]:
# Multiply the matrix by the eigenvector matrix; each resulting column should be
# the corresponding eigenvector scaled by its eigenvalue.
print(matrix.dot(eig_vector))

[[-0.577 -1.633  0.855]
 [-0.577  0.816 -1.632]
 [-0.577  0.816  0.777]]


In [11]:
# Newton's method

# Definition of the function used in the root-finding and minimisation cells
def my_function(x):
    """Evaluate the quadratic x**2 + 2*x + 1 at ``x`` and return the result."""
    squared_term = x**2
    linear_term = 2*x
    return squared_term + linear_term + 1

In [24]:
# Import SciPy's `newton` root finder
from scipy.optimize import newton

print(newton(my_function,0))
# First argument: the function f(x); second argument: the initial guess.
# NOTE(review): no derivative (fprime) is passed; per the SciPy documentation
# `newton` then uses the secant method rather than Newton-Raphson — confirm this is intended.

-0.9999999852953906


In [14]:
# Same quadratic written inline as a lambda, starting the search from x = 3.
print(newton(lambda x:x**2+2*x + 1,3))

-0.9999999851107327


In [25]:
# Minimum value
print(minimize_scalar(my_function,method = "Brent"))
# Uses Brent's method for the scalar minimisation.

     fun: 0.0
    nfev: 9
     nit: 4
 success: True
       x: -1.0000000000000002


In [27]:
# Exercise 2-6: find a root of x**3 + 2*x + 1, starting the search from x = 0.
print(newton(lambda x:x**3+2*x+1,0))

-0.45339765151640365


### 2-4 Pandas

In [28]:
from pandas import Series,DataFrame

In [29]:
# Series
# Ten values 0, 10, ..., 90 with the default integer index.
series_values = list(range(0, 100, 10))
sample_pandas_data = pd.Series(series_values)
print(sample_pandas_data)

0     0
1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
9    90
dtype: int64


In [31]:
# Same ten values, this time labelled with the letters 'a'..'j' as the index.
index_labels = list('abcdefghij')
sample_pandas_index_data = pd.Series(list(range(0, 100, 10)), index=index_labels)
print(sample_pandas_index_data)

a     0
b    10
c    20
d    30
e    40
f    50
g    60
h    70
i    80
j    90
dtype: int64


In [33]:
# Print the underlying values of the default-indexed Series and the index labels
# of the letter-indexed Series.
print("データ値：",sample_pandas_data.values)
print("インデックスの値：",sample_pandas_index_data.index)

データ値： [ 0 10 20 30 40 50 60 70 80 90]
インデックスの値： Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')


In [35]:
# Column-oriented attribute records: each key becomes a DataFrame column.
attri_data1 = {
    'ID': ['100', '101', '102', '103', '104'],
    'City': ['Tokyo', 'Osaka', 'Kyoto', 'Hokkaido', 'Tokyo'],
    'Birth_year': [1990, 1989, 1992, 1997, 1982],
    'Name': ['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steve'],
}
attri_data_frame1 = DataFrame(data=attri_data1)
print(attri_data_frame1)

    ID      City  Birth_year     Name
0  100     Tokyo        1990  Hiroshi
1  101     Osaka        1989    Akiko
2  102     Kyoto        1992     Yuki
3  103  Hokkaido        1997   Satoru
4  104     Tokyo        1982    Steve


In [36]:
# Build the same frame again, labelling the rows 'a'..'e' instead of 0..4.
row_labels = list('abcde')
attri_data_frame1_index1 = DataFrame(data=attri_data1, index=row_labels)
print(attri_data_frame1_index1)

    ID      City  Birth_year     Name
a  100     Tokyo        1990  Hiroshi
b  101     Osaka        1989    Akiko
c  102     Kyoto        1992     Yuki
d  103  Hokkaido        1997   Satoru
e  104     Tokyo        1982    Steve


In [37]:
# A bare expression on the last line is rendered by the notebook as a table.
attri_data_frame1_index1

Unnamed: 0,ID,City,Birth_year,Name
a,100,Tokyo,1990,Hiroshi
b,101,Osaka,1989,Akiko
c,102,Kyoto,1992,Yuki
d,103,Hokkaido,1997,Satoru
e,104,Tokyo,1982,Steve


In [38]:
# Transpose: rows become columns and columns become rows.
attri_data_frame1.transpose()

Unnamed: 0,0,1,2,3,4
ID,100,101,102,103,104
City,Tokyo,Osaka,Kyoto,Hokkaido,Tokyo
Birth_year,1990,1989,1992,1997,1982
Name,Hiroshi,Akiko,Yuki,Satoru,Steve


In [39]:
# Select a single column by name using attribute access.
attri_data_frame1.Birth_year

0    1990
1    1989
2    1992
3    1997
4    1982
Name: Birth_year, dtype: int64

In [40]:
attri_data_frame1[['ID','Birth_year']]
# Multiple columns are selected by passing a list of column names.

Unnamed: 0,ID,Birth_year
0,100,1990
1,101,1989
2,102,1992
3,103,1997
4,104,1982


In [42]:
# Select a single column by name using bracket access.
attri_data_frame1['Birth_year']

0    1990
1    1989
2    1992
3    1997
4    1982
Name: Birth_year, dtype: int64

In [43]:
# Filtering: keep only the rows whose City is 'Tokyo'.
lives_in_tokyo = attri_data_frame1['City'] == 'Tokyo'
attri_data_frame1[lives_in_tokyo]

Unnamed: 0,ID,City,Birth_year,Name
0,100,Tokyo,1990,Hiroshi
4,104,Tokyo,1982,Steve


In [44]:
attri_data_frame1['City'] == 'Tokyo'
# The comparison yields a Series object whose dtype is bool.

0     True
1    False
2    False
3    False
4     True
Name: City, dtype: bool

In [45]:
# Keep the rows whose City is one of the listed values.
target_cities = ['Tokyo', 'Osaka']
attri_data_frame1[attri_data_frame1['City'].isin(target_cities)]

Unnamed: 0,ID,City,Birth_year,Name
0,100,Tokyo,1990,Hiroshi
1,101,Osaka,1989,Akiko
4,104,Tokyo,1982,Steve


In [46]:
# Return a copy of the frame without the Birth_year column (the original is unchanged).
attri_data_frame1.drop(columns=['Birth_year'])

Unnamed: 0,ID,City,Name
0,100,Tokyo,Hiroshi
1,101,Osaka,Akiko
2,102,Kyoto,Yuki
3,103,Hokkaido,Satoru
4,104,Tokyo,Steve


In [47]:
# Second table: test scores and sex, keyed by ID.
attri_data2 = {
    'ID': ['100', '101', '102', '105', '107'],
    'Math': [50, 43, 33, 76, 98],
    'English': [90, 30, 20, 50, 30],
    'Sex': ['M', 'F', 'F', 'M', 'M'],
}
attri_data_frame2 = DataFrame(data=attri_data2)
attri_data_frame2

Unnamed: 0,ID,Math,English,Sex
0,100,50,90,M
1,101,43,30,F
2,102,33,20,F
3,105,76,50,M
4,107,98,30,M


In [48]:
# Inner-join the two frames on their shared key column, ID.
# The key is named explicitly: without `on`, merge joins on every column name the
# two frames have in common, so the result would silently change if the frames
# ever came to share more columns.
pd.merge(attri_data_frame1, attri_data_frame2, on='ID')

Unnamed: 0,ID,City,Birth_year,Name,Math,English,Sex
0,100,Tokyo,1990,Hiroshi,50,90,M
1,101,Osaka,1989,Akiko,43,30,F
2,102,Kyoto,1992,Yuki,33,20,F


In [49]:
# Group aggregation: mean Math score for each value of Sex.
attri_data_frame2.groupby('Sex')['Math'].mean()

Sex
F    38.000000
M    74.666667
Name: Math, dtype: float64

In [52]:
# NOTE(review): this rebinds attri_data2 / attri_data_frame2, which earlier cells
# defined with Math/English/Sex columns. After this cell runs, re-running the
# earlier groupby('Sex') cell would no longer find a 'Sex' column — consider
# distinct names.
attri_data2 = {'ID':['100','101','102','103','104'],
              'City':['Tokyo','Osaka','Kyoto','Hokkaido','Tokyo'],
              'Birth_year':[1990,1989,1992,1997,1982],
              'Name':['Hiroshi','Akiko','Yuki','Satoru','Steve']}
attri_data_frame2 = DataFrame(attri_data2)
# Same data with a deliberately unordered index, used by the sorting examples below.
attri_data_frame_index2 = DataFrame(attri_data2,index=['e','b','a','d','c'])
attri_data_frame_index2

Unnamed: 0,ID,City,Birth_year,Name
e,100,Tokyo,1990,Hiroshi
b,101,Osaka,1989,Akiko
a,102,Kyoto,1992,Yuki
d,103,Hokkaido,1997,Satoru
c,104,Tokyo,1982,Steve


In [53]:
# Return the rows ordered by index label; the frame itself is not reassigned here.
attri_data_frame_index2.sort_index()

Unnamed: 0,ID,City,Birth_year,Name
a,102,Kyoto,1992,Yuki
b,101,Osaka,1989,Akiko
c,104,Tokyo,1982,Steve
d,103,Hokkaido,1997,Satoru
e,100,Tokyo,1990,Hiroshi


In [54]:
# Return the Birth_year column ordered by value (ascending).
attri_data_frame_index2.Birth_year.sort_values()

c    1982
b    1989
e    1990
a    1992
d    1997
Name: Birth_year, dtype: int64

In [55]:
# Check, cell by cell, whether each value is one of the listed values.
attri_data_frame_index2.isin(['Tokyo'])

Unnamed: 0,ID,City,Birth_year,Name
e,False,True,False,False
b,False,False,False,False
a,False,False,False,False
d,False,False,False,False
c,False,True,False,False


In [56]:
# Handling missing values
# Overwrite every value in the Name column with NaN.
# NOTE(review): this mutates attri_data_frame_index2 in place, so the tables
# displayed by earlier cells no longer match the frame's current contents.
attri_data_frame_index2["Name"]=np.nan
# Boolean frame marking which cells are missing.
attri_data_frame_index2.isnull()

Unnamed: 0,ID,City,Birth_year,Name
e,False,False,False,True
b,False,False,False,True
a,False,False,False,True
d,False,False,False,True
c,False,False,False,True


In [57]:
# Count the missing values in each column.
attri_data_frame_index2.isnull().sum()

ID            0
City          0
Birth_year    0
Name          5
dtype: int64

### 2-5 Matplotlib