<a href="https://colab.research.google.com/github/hongo-daisuke/study-python/blob/master/data_analysis/python_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# pandas

*   pythonのデータ分析ライブラリの1つ、表形式のデータや行列を扱うことが出来る。



In [1]:
import numpy as np
import pandas as pd

## Series
pandasで扱えるデータの一種、1次元の配列で単一の列からなる表とみなす。

```
pd.Series(データ配列, index=インデックスのリスト)
```



In [None]:
s = pd.Series([1, 2, np.nan])
s

0    1.0
1    2.0
2    NaN
dtype: float64

In [None]:
# データにアクセス
s[1]

2.0

In [None]:
# sum関数による足し算
s.sum()

3.0

In [18]:
s = pd.Series([1, 2, np.nan], index=['A', 'B', 'C'])
s

A    1.0
B    2.0
C    NaN
dtype: float64

## DataFrame
行と配列からなる表形式の配列データ


```
# リストから生成
pd.DataFrame(2次元のデータ配列, columns, index)

# 辞書から生成
data = {'column1': 'データ配列1', 'column2': 'データ配列2'}
pd.DataFrame(data, index)
```



In [None]:
data = {'A': [1, 2], 'B': [3, 4]}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B
0,1,3
1,2,4


In [None]:
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,0.574786,0.239585,-0.939871,-0.848235
1,1.692302,-0.567214,-0.186334,0.925544
2,0.472251,0.649851,-0.303607,0.48533
3,1.232212,-0.876889,0.964393,-0.941811
4,-2.634621,-0.124148,-1.13269,-0.960561
5,-1.697359,-1.390254,0.689124,-0.909557


In [None]:
# インデックス名の設定
df = pd.DataFrame(np.random.randn(6, 4), index=pd.date_range('20220906', periods=6))
df

Unnamed: 0,0,1,2,3
2022-09-06,-0.164322,0.438152,-0.564134,-0.553272
2022-09-07,1.414395,-0.687984,0.85027,0.502112
2022-09-08,0.005705,-0.925636,-2.774242,0.227373
2022-09-09,-1.986748,1.570878,-0.839629,0.393962
2022-09-10,-1.662871,1.986123,-1.08023,-1.694869
2022-09-11,1.258426,2.81682,1.008002,-0.262373


In [None]:
# カラムの設定
df = pd.DataFrame(np.random.randn(6, 4), index=pd.date_range('20220906', periods=6), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2022-09-06,1.789819,1.588179,0.819663,0.234145
2022-09-07,-2.056649,0.584958,-0.097939,1.383745
2022-09-08,0.716233,-0.077171,0.201672,-0.197706
2022-09-09,-0.030896,0.500765,0.246067,-0.320968
2022-09-10,-0.113084,-0.474563,1.12354,-1.897694
2022-09-11,-1.054213,0.311134,1.613455,-0.733419


## DataFrameの様々なデータ取得


*   先頭からn件取得
*   末尾からn件取得
*   インデックスの取得
*   カラム名の取得
*   データの中身を取得
*   基本統計量を一括で取得
*   転置





In [None]:
# 先頭からn件取得
df.head(1)

Unnamed: 0,A,B,C,D
2022-09-06,1.789819,1.588179,0.819663,0.234145


In [None]:
# 末尾からn件取得
df.tail(2)

Unnamed: 0,A,B,C,D
2022-09-10,-0.113084,-0.474563,1.12354,-1.897694
2022-09-11,-1.054213,0.311134,1.613455,-0.733419


In [None]:
# インデックスの取得
df.index

DatetimeIndex(['2022-09-06', '2022-09-07', '2022-09-08', '2022-09-09',
               '2022-09-10', '2022-09-11'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# カラムの取得
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
# データの中身を取得
df.values

array([[ 1.78981863,  1.58817889,  0.81966328,  0.23414536],
       [-2.05664913,  0.58495831, -0.0979393 ,  1.38374533],
       [ 0.71623268, -0.07717142,  0.2016718 , -0.19770588],
       [-0.03089634,  0.50076536,  0.24606655, -0.32096763],
       [-0.11308427, -0.474563  ,  1.12353959, -1.89769382],
       [-1.05421269,  0.31113423,  1.61345464, -0.7334189 ]])

In [None]:
# 基本統計量を一括で取得
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.124799,0.40555,0.651076,-0.255316
std,1.339993,0.700913,0.64882,1.082563
min,-2.056649,-0.474563,-0.097939,-1.897694
25%,-0.818931,0.019905,0.21277,-0.630306
50%,-0.07199,0.40595,0.532865,-0.259337
75%,0.52945,0.56391,1.047571,0.126183
max,1.789819,1.588179,1.613455,1.383745


In [None]:
# 転置
df.T

Unnamed: 0,2022-09-06,2022-09-07,2022-09-08,2022-09-09,2022-09-10,2022-09-11
A,1.789819,-2.056649,0.716233,-0.030896,-0.113084,-1.054213
B,1.588179,0.584958,-0.077171,0.500765,-0.474563,0.311134
C,0.819663,-0.097939,0.201672,0.246067,1.12354,1.613455
D,0.234145,1.383745,-0.197706,-0.320968,-1.897694,-0.733419


In [None]:
# DataFrameをソート
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2022-09-10,-0.113084,-0.474563,1.12354,-1.897694
2022-09-08,0.716233,-0.077171,0.201672,-0.197706
2022-09-11,-1.054213,0.311134,1.613455,-0.733419
2022-09-09,-0.030896,0.500765,0.246067,-0.320968
2022-09-07,-2.056649,0.584958,-0.097939,1.383745
2022-09-06,1.789819,1.588179,0.819663,0.234145


In [None]:
# スライス構文で抽出
df[0:3]

Unnamed: 0,A,B,C,D
2022-09-06,1.789819,1.588179,0.819663,0.234145
2022-09-07,-2.056649,0.584958,-0.097939,1.383745
2022-09-08,0.716233,-0.077171,0.201672,-0.197706


In [None]:
# インデックスで指定することも可能
df['2022-09-07':'2022-09-10']

Unnamed: 0,A,B,C,D
2022-09-07,-2.056649,0.584958,-0.097939,1.383745
2022-09-08,0.716233,-0.077171,0.201672,-0.197706
2022-09-09,-0.030896,0.500765,0.246067,-0.320968
2022-09-10,-0.113084,-0.474563,1.12354,-1.897694


## DataFrameの列データを取得

```
df['カラム名']

df.カラム名
```



In [3]:
name = ['James', 'Anthony', 'Michael', 'Howard', 'Russell']
data = {'height': [206, 201, 198, 208, 191], 'weight': [113, 108, 98, 120, 91]}
df = pd.DataFrame(data, index=name)
df

Unnamed: 0,height,weight
James,206,113
Anthony,201,108
Michael,198,98
Howard,208,120
Russell,191,91


In [4]:
# heightを取得
df['height']

James      206
Anthony    201
Michael    198
Howard     208
Russell    191
Name: height, dtype: int64

In [5]:
# weightを取得
df.weight

James      113
Anthony    108
Michael     98
Howard     120
Russell     91
Name: weight, dtype: int64

In [6]:
# インデックスを指定することで個別データを取得可能
df.weight.Russell

91

In [7]:
# インデックスがDataFrameと一致している場合、列データの更新が可能
new_height = [207, 202, 199, 209, 192]
df['height'] = new_height
df

Unnamed: 0,height,weight
James,207,113
Anthony,202,108
Michael,199,98
Howard,209,120
Russell,192,91


## DataFrameの行データを取得

```
# indexで指定した行のSeries
df.loc[index名]

# integer-locationで指定した行のSeries
df.iloc[index名]
```




In [8]:
df = pd.DataFrame(np.random.randn(6, 4), index=pd.date_range('20220906', periods=6), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2022-09-06,-0.256666,-0.282374,0.005225,0.396512
2022-09-07,0.602124,1.343904,0.031283,-0.713169
2022-09-08,1.659802,-1.23673,1.649086,-0.125421
2022-09-09,0.349772,2.195021,1.096174,0.974692
2022-09-10,-0.756491,1.022863,-0.450788,-0.871504
2022-09-11,-1.879364,0.268656,1.322916,0.723208


In [9]:
# indexで指定した行のSeriesを取得
df.loc['2022-09-06']

A   -0.256666
B   -0.282374
C    0.005225
D    0.396512
Name: 2022-09-06 00:00:00, dtype: float64

In [10]:
# 2022-09-06のAとBを取得
df.loc['2022-09-06', ['A', 'B']]

A   -0.256666
B   -0.282374
Name: 2022-09-06 00:00:00, dtype: float64

In [11]:
# '2022-09-06' ~ '2022-09-08'のAとBを取得
df.loc['2022-09-06':'2022-09-08', ['A', 'B']]

Unnamed: 0,A,B
2022-09-06,-0.256666,-0.282374
2022-09-07,0.602124,1.343904
2022-09-08,1.659802,-1.23673


In [12]:
# 全てのAとBを取得
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2022-09-06,-0.256666,-0.282374
2022-09-07,0.602124,1.343904
2022-09-08,1.659802,-1.23673
2022-09-09,0.349772,2.195021
2022-09-10,-0.756491,1.022863
2022-09-11,-1.879364,0.268656


In [13]:
# integer-locationで指定した行のSeries
df.iloc[0, 0]

-0.2566658692486384

In [17]:
# インデックスとカラムを0番目から1番目まで取得
df.iloc[0:2, 0:2]

Unnamed: 0,A,B
2022-09-06,-0.256666,-0.282374
2022-09-07,0.602124,1.343904


In [28]:
# データの更新
df.loc['2022-09-06'] = pd.Series(np.random.randn(4), index=list(df.columns))
df

Unnamed: 0,A,B,C,D
2022-09-06,-2.165963,1.586601,0.446755,0.40102
2022-09-07,0.602124,1.343904,0.031283,-0.713169
2022-09-08,1.659802,-1.23673,1.649086,-0.125421
2022-09-09,0.349772,2.195021,1.096174,0.974692
2022-09-10,-0.756491,1.022863,-0.450788,-0.871504
2022-09-11,-1.879364,0.268656,1.322916,0.723208


## DataFrameの行列を指定してデータを取得

```
df.at[インデックス, カラム名]

# 番号は0から数える
df.iat[インデックスの番号, カラムの番号]
```



In [29]:
name = ['James', 'Anthony', 'Michael', 'Howard', 'Russell']
data = {'height': [206, 201, 198, 208, 191], 'weight': [113, 108, 98, 120, 91]}
df = pd.DataFrame(data, index=name)
df

Unnamed: 0,height,weight
James,206,113
Anthony,201,108
Michael,198,98
Howard,208,120
Russell,191,91


In [31]:
# atによるデータの取得
df.at['James', 'height'] # Jamesのheightを取得

206

In [34]:
# iatによるデータの取得
df.iat[4, 1] # Russellのweightを取得

91

In [35]:
# データの更新
df.at['James', 'weight'] = 105 # Jamesのweightを更新
df

Unnamed: 0,height,weight
James,206,105
Anthony,201,108
Michael,198,98
Howard,208,120
Russell,191,91


## DataFrameのフィルタリング

```
df[フィルタリング条件]
```



In [None]:
# 大小比較
df[df.A > 0] # dfのカラムAで0より大きいものを抽出

Unnamed: 0,A,B,C,D
2022-09-06,1.789819,1.588179,0.819663,0.234145
2022-09-08,0.716233,-0.077171,0.201672,-0.197706


In [None]:
# 大小比較
df[df > 0] # dfで0より大きいものを抽出、0以下はNaN

Unnamed: 0,A,B,C,D
2022-09-06,1.789819,1.588179,0.819663,0.234145
2022-09-07,,0.584958,,1.383745
2022-09-08,0.716233,,0.201672,
2022-09-09,,0.500765,0.246067,
2022-09-10,,,1.12354,
2022-09-11,,0.311134,1.613455,


In [None]:
data2 = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [None]:
# 完全一致
df2[df2.A == 3] # df2.Aがあれば取得

Unnamed: 0,A,B
2,3,6


In [None]:
# bool値でのフィルタリング
filter = [True, False, True]
df2[filter]

Unnamed: 0,A,B
0,1,4
2,3,6


## データを行または列方向にずらす

* ずらし幅を指定: periods
* ずらす方向を指定: axis

```
df.shift(n)
```



In [None]:
# 1行ずらす
df.shift(1)

Unnamed: 0,A,B,C,D
2022-09-06,,,,
2022-09-07,1.789819,1.588179,0.819663,0.234145
2022-09-08,-2.056649,0.584958,-0.097939,1.383745
2022-09-09,0.716233,-0.077171,0.201672,-0.197706
2022-09-10,-0.030896,0.500765,0.246067,-0.320968
2022-09-11,-0.113084,-0.474563,1.12354,-1.897694


## DataFrameやSeriesを結合させる

```
pd.concat()
```



In [None]:
df = pd.DataFrame(np.random.randn(2, 2))
df

Unnamed: 0,0,1
0,1.061436,-0.355503
1,0.531045,-0.513653


In [None]:
pd.concat([df, df])

Unnamed: 0,0,1
0,1.061436,-0.355503
1,0.531045,-0.513653
0,1.061436,-0.355503
1,0.531045,-0.513653


## データの追加


```
.append()
```



In [None]:
df = pd.DataFrame(np.random.randn(3, 3), index=pd.date_range('20220906', periods=3))
df

Unnamed: 0,0,1,2
2022-09-06,-0.16928,-0.052335,0.835214
2022-09-07,0.151625,1.332229,0.371602
2022-09-08,1.525265,-0.334217,-0.718232


In [None]:
df2 = pd.DataFrame(np.random.randn(2, 2))
df2

Unnamed: 0,0,1
0,0.795982,-0.94185
1,1.838852,0.025164


In [None]:
df2.append(df.iloc[0:2, 0:2])

Unnamed: 0,0,1
0,0.795982,-0.94185
1,1.838852,0.025164
2022-09-06 00:00:00,-0.16928,-0.052335
2022-09-07 00:00:00,0.151625,1.332229


In [None]:
# ignore_index=Trueを設定することでインデックス名を追加先に合わせる
df2.append(df.iloc[0:2, 0:2], ignore_index=True)

Unnamed: 0,0,1
0,0.795982,-0.94185
1,1.838852,0.025164
2,-0.16928,-0.052335
3,0.151625,1.332229


## GroupByでの集計


```
# 最小値
df.groupby(カラム名).min()

# 最大値
df.groupby(カラム名).max()

# 合計値
df.groupby(カラム名).sum()

# 平均値
df.groupby(カラム名).mean()

# 分散
df.groupby(カラム名).var()

# 標準偏差
df.groupby(カラム名).std()
```



In [None]:
df = pd.DataFrame({'A': ['aaa', 'bbb', 'aaa', 'bbb'], 'B': np.random.randn(4)})
df

Unnamed: 0,A,B
0,aaa,-0.519857
1,bbb,2.058434
2,aaa,-0.326114
3,bbb,-0.667413


In [None]:
# Aのインデックスでグルーピングして取得
df.groupby('A').sum()

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
aaa,-0.845971
bbb,1.391022


## CSVファイルへの書き込みと読み込み

### CSVファイルへの書き出し
`df.to_csv(ファイルパス, sep, index, index_label)`
* sep : 区切り文字、デフォルトはカンマ
* index : indexの出力要否をbool型で指定
* index_label : indexを出力する場合のカラム名


In [None]:
df = pd.DataFrame({'A': ['aaa', 'bbb', 'ccc', 'ddd'], 'B': np.random.randn(4)})
df

Unnamed: 0,A,B
0,aaa,0.007967
1,bbb,1.884291
2,ccc,-0.972009
3,ddd,0.311584


In [None]:
df.to_csv('df.csv')

### CSVファイルへの読み込み
`pd.read_csv(ファイルパス, sep, header, dtype)`
* sep : 区切り文字、デフォルトはカンマ
* header : ヘッダ行番号(デフォルトは0)
* dtype : 列ごとの型を辞書で指定

In [None]:
pd.read_csv('df.csv')

Unnamed: 0.1,Unnamed: 0,A,B
0,0,aaa,0.007967
1,1,bbb,1.884291
2,2,ccc,-0.972009
3,3,ddd,0.311584


## データベースに対しての読み書き
### データベースへの書き込み
`df.to_sql('テーブル名', コネクション)`

* index : DataFrameのインデックスの登録を行うか
* index_label : インデックスを出力する場合のカラム名
* if_exists : 挿入データが既に存在していた場合の挙動指定(fail:例外、append:追加、replace:drop&insert)





In [None]:
data = {'name': ['Anthony', 'Sophia'], 'age': [20, 22]}
df = pd.DataFrame(data, index=[1, 2])
df

Unnamed: 0,name,age
1,Anthony,20
2,Sophia,22


In [None]:
import sqlite3
import pandas as pd
import pandas.io.sql as psql

# オンメモリで動作
conn = sqlite3.connect(':memory:')

# sqliteを操作するカーソルを準備
curs = conn.cursor()

# テーブルの作成
curs.execute(
    'CREATE TABLE persons(id INTEGER, name STRING, age INTEGER)'
    )

# データの登録
curs.execute(
    "INSERT INTO persons(id, name, age) values(1, 'Bob', 20)"
    )
curs.execute(
    "INSERT INTO persons(id, name, age) values(2, 'James', 22)"
    )
curs.execute(
    "INSERT INTO persons(id, name, age) values(3, 'Anne', 21)"
    )
curs.execute(
    "INSERT INTO persons(id, name, age) values(4, 'Nancy', 20)"
    )

# 登録するDataFrame
data = {'name': ['Anthony', 'Sophia'], 'age': [20, 22]}
df = pd.DataFrame(data, index=[1, 2])

# DataFrameをDBに登録
df.to_sql('persons', conn, index='id', if_exists='append')


# SELECT文からDataFrameを作成
df2 = psql.read_sql('SELECT * FROM persons;', conn, index_col='id')

# カーソルもクローズする
curs.close()

# sqliteへのコネクションをクローズする
conn.close()

df2

Unnamed: 0_level_0,name,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Bob,20
2,James,22
3,Anne,21
4,Nancy,20
1,Anthony,20
2,Sophia,22


### データベースからの読み込み
`psql.read_sql('SELECT文', コネクション)`

* index_col : DataFrameのインデックスにするカラム


In [None]:
import sqlite3
import pandas as pd
import pandas.io.sql as psql

# オンメモリで動作
conn = sqlite3.connect(':memory:')

# sqliteを操作するカーソルを準備
curs = conn.cursor()

# テーブルの作成
curs.execute(
    'CREATE TABLE persons(id INTEGER, name STRING, age INTEGER)'
    )

# データの登録
curs.execute(
    "INSERT INTO persons(id, name, age) values(1, 'Bob', 20)"
    )
curs.execute(
    "INSERT INTO persons(id, name, age) values(2, 'James', 22)"
    )
curs.execute(
    "INSERT INTO persons(id, name, age) values(3, 'Anne', 21)"
    )
curs.execute(
    "INSERT INTO persons(id, name, age) values(4, 'Nancy', 20)"
    )

# SELECT文からDataFrameを作成
df = psql.read_sql('SELECT * FROM persons;', conn, index_col='id')

# カーソルもクローズする
curs.close()

# sqliteへのコネクションをクローズする
conn.close()

df

Unnamed: 0_level_0,name,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Bob,20
2,James,22
3,Anne,21
4,Nancy,20


## DataFrameの欠損値について

```
# NaN判定
pd.isnull(df).any()

# 欠損値の削除
df.dropna()
```



In [None]:
# NaN判定
data = {'height': [100, 200, 300], 'weight': [10, 20, None]}
df = pd.DataFrame(data)
df

Unnamed: 0,height,weight
0,100,10.0
1,200,20.0
2,300,


In [None]:
pd.isnull(df.height).any()

False

In [None]:
pd.isnull(df.weight).any()

True

In [None]:
# 欠損値の削除
new_height_series = df.weight.dropna()
new_height_series

0    10.0
1    20.0
Name: weight, dtype: float64

In [None]:
# 全体の行ごと削除
df.dropna()

Unnamed: 0,height,weight
0,100,10.0
1,200,20.0


## DataFrameの値を置換


```
df.replace(置換前, 置換後)
```



In [None]:
data = {'name': ['Anthony', 'Sophia'], 'age': [20, 22]}
df = pd.DataFrame(data, index=[1, 2])
df

Unnamed: 0,name,age
1,Anthony,20
2,Sophia,22


In [None]:
# AnthonyをHowardに置換
df.replace('Anthony', 'Howard')

Unnamed: 0,name,age
1,Howard,20
2,Sophia,22


In [None]:
data = {'name': ['Anthony', 'Sophia', 'Kevin', 'Olivia'], 'age': [20, 22, None, 21]}
df = pd.DataFrame(data, index=[1, 2, 3, 4])
df

Unnamed: 0,name,age
1,Anthony,20.0
2,Sophia,22.0
3,Kevin,
4,Olivia,21.0


In [None]:
# 欠損値の除去
import numpy as np
import pandas as pd

df2 = df.replace(np.nan, 0)
df2

Unnamed: 0,name,age
1,Anthony,20.0
2,Sophia,22.0
3,Kevin,0.0
4,Olivia,21.0
