# pandas

データ解析とかでよく使うライブラリ群。詳細はグーグル

## 一次元のデータ列

Seriesを用いる。

In [3]:
import pandas as pd 
from pandas import Series

In [2]:
obj = Series([3,5,6,7])
obj

0    3
1    5
2    6
3    7
dtype: int64

In [3]:
obj.values

array([3, 5, 6, 7])

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# indexに名前も使用可能
m2_cos = Series([2,3,4,5],index=['A','B','C','D'])

In [6]:
m2_cos

A    2
B    3
C    4
D    5
dtype: int64

In [7]:
m2_cos['A']

2

In [8]:
m2_cos[m2_cos >= 4] #条件式を使えばデータ抜粋も可能

C    4
D    5
dtype: int64

In [9]:
m2_cos >= 4

A    False
B    False
C     True
D     True
dtype: bool

In [10]:
# データを含むかの確認
'B' in m2_cos

True

In [12]:
# 辞書化も可能
m2_dict = m2_cos.to_dict()

In [13]:
m2_dict

{'A': 2, 'B': 3, 'C': 4, 'D': 5}

In [14]:
categoriy = ['A','B','C','D','E']

In [17]:
m3_cos = Series(m2_dict , index=categoriy)

In [18]:
m3_cos

A    2.0
B    3.0
C    4.0
D    5.0
E    NaN
dtype: float64

In [19]:
# NaN判定も可能
pd.isnull(m3_cos)

A    False
B    False
C    False
D    False
E     True
dtype: bool

In [20]:
pd.notnull(m3_cos)

A     True
B     True
C     True
D     True
E    False
dtype: bool

In [21]:
# Indexが同じであれば総和も可能
m2_cos + m3_cos

A     4.0
B     6.0
C     8.0
D    10.0
E     NaN
dtype: float64

In [22]:
# 名前の追加
m2_cos.name = 'ほげほげ'

In [23]:
m2_cos

A    2
B    3
C    4
D    5
Name: ほげほげ, dtype: int64

In [24]:
# Indexにも名前を付けられる
m2_cos.index.name = "ふがふが"

In [25]:
m2_cos

ふがふが
A    2
B    3
C    4
D    5
Name: ほげほげ, dtype: int64

# DataFrame

2次元のテーブル型のデータ。こちらもいろいろ使用可能

## 使用データ
https://en.wikipedia.org/wiki/List_of_all-time_NFL_win%E2%80%93loss_records

In [2]:
from pandas import Series, DataFrame

In [8]:
#クリップボードからデータ取得
# 次のURLにある表からコピーしておくのが元だが、docker+jupyter上では不可能 普通にcsvを作ったので、そちらを読み込ませる

# nfl_frame = pd.read_clipboard() #大本
nfl_frame = pd.read_csv("./NFLdata.csv")

In [9]:
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL season,Division
0,1,Dallas Cowboys,964,550,408,6,0.574,1960,NFC East
1,2,Green Bay Packers,1418,790,590,38,0.571,1921,NFC North
2,3,Baltimore Ravens,434,243,190,1,0.561,1996,AFC North
3,4,New England Patriots,966,537,420,9,0.561,1960,AFC East
4,5,Chicago Bears,1452,786,624,42,0.556,1920,NFC North
5,6,Miami Dolphins,882,485,393,4,0.552,1966,AFC East
6,7,Minnesota Vikings,952,516,425,11,0.548,1961,NFC North
7,8,Kansas City Chiefs,966,521,433,12,0.546,1960,AFC West
8,9,San Francisco 49ers[b],1084,574,496,14,0.536,1950,NFC West
9,10,Pittsburgh Steelers,1254,661,571,22,0.536,1933,AFC North


In [10]:
nfl_frame.columns #列表示

Index(['Rank', 'Team', 'GP', 'Won', 'Lost', 'Tied', 'Pct.', 'First NFL season',
       'Division'],
      dtype='object')

In [13]:
#特定の列取得
nfl_frame[['Team','First NFL season']]

Unnamed: 0,Team,First NFL season
0,Dallas Cowboys,1960
1,Green Bay Packers,1921
2,Baltimore Ravens,1996
3,New England Patriots,1960
4,Chicago Bears,1920
5,Miami Dolphins,1966
6,Minnesota Vikings,1961
7,Kansas City Chiefs,1960
8,San Francisco 49ers[b],1950
9,Pittsburgh Steelers,1933


In [20]:
# 新しいデータの取得
new_frame = DataFrame(nfl_frame,columns=['Team','First NFL season', 'Sad'])

In [21]:
# 先頭/末尾取得
nfl_frame.head(5)
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL season,Division
0,1,Dallas Cowboys,964,550,408,6,0.574,1960,NFC East
1,2,Green Bay Packers,1418,790,590,38,0.571,1921,NFC North
2,3,Baltimore Ravens,434,243,190,1,0.561,1996,AFC North
3,4,New England Patriots,966,537,420,9,0.561,1960,AFC East
4,5,Chicago Bears,1452,786,624,42,0.556,1920,NFC North
5,6,Miami Dolphins,882,485,393,4,0.552,1966,AFC East
6,7,Minnesota Vikings,952,516,425,11,0.548,1961,NFC North
7,8,Kansas City Chiefs,966,521,433,12,0.546,1960,AFC West
8,9,San Francisco 49ers[b],1084,574,496,14,0.536,1950,NFC West
9,10,Pittsburgh Steelers,1254,661,571,22,0.536,1933,AFC North


In [22]:
# 特定のIndex取得 ixは古いのでloc/ilocを使う
# nfl_frame.ix(5) #非推奨
nfl_frame.iloc[5]

Rank                             6
Team                Miami Dolphins
GP                             882
Won                            485
Lost                           393
Tied                             4
Pct.                         0.552
First NFL season              1966
Division                  AFC East
Name: 5, dtype: object

In [23]:
# 列削除
del new_frame['Sad']

In [24]:
new_frame

Unnamed: 0,Team,First NFL season
0,Dallas Cowboys,1960
1,Green Bay Packers,1921
2,Baltimore Ravens,1996
3,New England Patriots,1960
4,Chicago Bears,1920
5,Miami Dolphins,1966
6,Minnesota Vikings,1961
7,Kansas City Chiefs,1960
8,San Francisco 49ers[b],1950
9,Pittsburgh Steelers,1933


# Index操作

Indexの基本的な操作部分のまとめ

In [25]:
my_ser = Series([1,2,3,4],index=['A','B','C','D'])

In [26]:
my_ser

A    1
B    2
C    3
D    4
dtype: int64

In [27]:
#indexの取得
my_index = my_ser.index

In [28]:
my_index[0]

'A'

In [29]:
my_index[2:]

Index(['C', 'D'], dtype='object')

In [30]:
#pandasではindex置き換えは不可能 これはデータの信頼性向上のためのpandaの仕様によるもの
my_index[0] = 'Z'

TypeError: Index does not support mutable operations

# Indexの変更

pandasではIndexを直接変更できないため、その場合の変更方法

変更したい対象に対してメソッド「reindex」を使用するだけでよい

In [33]:
# テーブルの作成
from numpy.random import randn

serl = Series([1,2,3,4],index=['A','B','C','D'])

serl

A    1
B    2
C    3
D    4
dtype: int64

In [35]:
# 新しいIndexのテーブルをもとのserlから”新規作成"する。元のテーブルに影響はない
serl2 = serl.reindex(['A','B','C','D','E','F'],fill_value=0)
serl2

A    1
B    2
C    3
D    4
E    0
F    0
dtype: int64

In [36]:
serl

A    1
B    2
C    3
D    4
dtype: int64

In [39]:
ser3 = Series(['USA','Mexico','Canada'], index=[0,5,10])
ser3

0        USA
5     Mexico
10    Canada
dtype: object

In [40]:
# indexを0-15にする。値は直前の値で埋めてよいのであるなら、以下で自動付け替えが可能
ser3.reindex(range(15), method='ffill')　#ffill : forward filling

0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

# 行列の削除

dropなどで可能

In [6]:
import numpy as np
import pandas as pd 
from pandas import Series

serl =  Series(np.arange(3),index=['A','B','C'])
serl

A    0
B    1
C    2
dtype: int64

In [10]:
# 列を削除した新しいテーブルを作成する 削除したい列がある場合はaxisで指定
serl.drop("A", axis=0)

B    1
C    2
dtype: int64

In [11]:
#元データは問題ない
serl

A    0
B    1
C    2
dtype: int64