In [9]:
# データフレームを作る
import pandas as pd
import numpy as np

# 二次元配列から作成
display(pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]])))

#  ディクショナルから作成
display(pd.DataFrame({1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}))

#  data, index, columnsを指定して作成
display(pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A']))

# Seriesから作成
my_series = pd.Series({"Tokyo":"Shinjuku", "Osaka":"Osaka", "Kyoto":"Kyoto", "Aichi":"Nagoya"})
display(pd.DataFrame(my_series, columns=['City']))

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


Unnamed: 0,1,2,3
0,1,1,2
1,3,2,4


Unnamed: 0,A
0,4
1,5
2,6
3,7


Unnamed: 0,City
Aichi,Nagoya
Kyoto,Kyoto
Osaka,Osaka
Tokyo,Shinjuku


In [20]:
# データフレームの行・列数を取得
df = pd.DataFrame(np.array([[4, 5, 6, 7], [4, 5, 6, 7], [4, 5, 6, 7]]),\
                          columns=np.array(["a", "b", "c", "d"]))

# Use the `shape` property
display(df)
print(df.shape)
print('行数:', df.shape[0])
print('列数:', df.shape[1])

## Or use the `len()` function with the `index` property
#for i in df.index:
#   print(df.iloc[i])
#   print("-----")
    
print('行数:', len(df.index))
print('列数:', len(df.columns))

Unnamed: 0,a,b,c,d
0,4,5,6,7
1,4,5,6,7
2,4,5,6,7


(3, 4)
行数: 3
列数: 4
行数: 3
列数: 4


In [23]:
# 行を選択（loc, iloc）

df = pd.DataFrame(data=["apple","orange","grape","pineapple"], columns=['name'], index=range(4))

# Using `loc[]`
print(df.loc[0]['name'])

# Using `iloc[]`
print(df.iloc[1][0])

# Using `at[]`
print(df.at[2,'name'])

# Using `iat[]`
print(df.iat[3,0])

apple
orange
grape
pineapple


In [64]:
# Fetching data from DataFrame with ":"

df = pd.DataFrame(data={"name":["apple","orange","grape","pineapple"], "price":[100,200,140,120]})
display(df)

# `loc[]`を使って特定の列のデータを取得する
display(df.loc[2:, ])

# `loc[]`を使って特定の行・列のデータを取得する
display(df.loc[:,'price'].to_frame())

Unnamed: 0,name,price
0,apple,100
1,orange,200
2,grape,140
3,pineapple,120


Unnamed: 0,name,price
2,grape,140
3,pineapple,120


Unnamed: 0,price
0,100
1,200
2,140
3,120


In [72]:
# set_index
df = pd.DataFrame(data={"name":["apple","orange","grape","pineapple"], "price":[100,200,140,120]})
display(df)
df = df.set_index('name')
print("price of grape:", df.loc['grape', 'price'])

Unnamed: 0,name,price
0,apple,100
1,orange,200
2,grape,140
3,pineapple,120


price of grape: 140


In [45]:
# set_index, loc, iloc

df = pd.DataFrame(data=np.array([["apple",3,"orange",5], ['r', 's', 'b', 'x']]), columns=['A', 'B', 'C', 'D'])
df = df.set_index('C')
print(df.loc['b'])
print("---")
print(df.iloc[0])

A    r
B    s
D    x
Name: b, dtype: object
---
A    apple
B        3
D        5
Name: orange, dtype: object


In [86]:
# データを追加する
df = pd.DataFrame(data={"name":["apple","orange"], "price":[100,200]})

# インデックスを指定する
df.set_index('name', inplace=True)
display(df)

#　データを追加する
df.loc["strawbelly"] = [500]
display(df)

Unnamed: 0_level_0,price
name,Unnamed: 1_level_1
apple,100
orange,200


Unnamed: 0_level_0,price
name,Unnamed: 1_level_1
apple,100
orange,200
strawbelly,500


In [88]:
# 列を追加する
df = pd.DataFrame(data={"name":["apple","orange"], "price":[100,200]})
df['location'] = ["Aomori", "Ehime"]
display(df)

Unnamed: 0,name,price,location
0,apple,100,Aomori
1,orange,200,Ehime


In [90]:
# データを書き換える
df = pd.DataFrame(data={"name":["apple","orange"], "price":[100,200]})
df.loc[1, 'price'] = 300
display(df)

Unnamed: 0,name,price
0,apple,100
1,orange,300


In [64]:
# Resetting index
df = pd.DataFrame(np.array([[1,2,3,4], [5,6,7,8]]), columns=['A','B','C','D'], index=[1.2, 2.5])

# Check out the weird index of your dataframe
print(df)
print("---")
# Use `reset_index()` to reset the values. 
print(df.reset_index(level=0, drop=True))
print("---")
print(df.reset_index(level=0, drop=False))


     A  B  C  D
1.2  1  2  3  4
2.5  5  6  7  8
---
   A  B  C  D
0  1  2  3  4
1  5  6  7  8
---
   index  A  B  C  D
0    1.2  1  2  3  4
1    2.5  5  6  7  8


In [100]:
# Dropping column
df = pd.DataFrame(np.array([[1,2,3,4], [5,6,7,8]]), columns=['A','B','C','D'])

# Check out the DataFrame `df`
print(df)
print("---")

# Drop the column with label 'A'                  
df.drop('A', axis=1, inplace=True) # 行の場合はaxis = 0, inplaceにTrueをセットすることで対象自体を変更できる
print(df)
print("---")

# Drop the column at position 1
print(df.drop(df.columns[[1]], axis=1))
print("---")
print(df)

df = pd.DataFrame(data={"name":["apple","orange"], "price":[100,200]})
# 列の場合はaxis = 1
display(df.drop('price', axis=1))
# 行の場合はaxis = 0
display(df.drop(0, axis=0))
df.drop('price', axis=1, inplace=True) # inplaceにTrueをセットすることで対象自体を変更できる
display(df)

   A  B  C  D
0  1  2  3  4
1  5  6  7  8
---
   B  C  D
0  2  3  4
1  6  7  8
---
   B  D
0  2  4
1  6  8
---
   B  C  D
0  2  3  4
1  6  7  8


Unnamed: 0,name
0,apple
1,orange


Unnamed: 0,name,price
1,orange,200


Unnamed: 0,name
0,apple
1,orange


In [106]:
# Removing duplicates

df = pd.DataFrame(np.array([[1,2,3,4], [1,2,3,4], [2,2,3,4]]), columns=['A','B','C','D'])

# Check out your DataFrame `df`
print(df)

# Drop the duplicates in `df`
df.drop_duplicates(['A'], keep='last') #keep = 'last'の場合、最後のものをのこす


# 重複を削除する
df = pd.DataFrame(data={"name":["apple","orange","apple"], "price":[100,200,105]})
display(df)
df.drop_duplicates(['name'], keep='last', inplace=True)
df

   A  B  C  D
0  1  2  3  4
1  1  2  3  4
2  2  2  3  4


Unnamed: 0,name,price
0,apple,100
1,orange,200
2,apple,105


Unnamed: 0,name,price
1,orange,200
2,apple,105


In [108]:
# Renaming columns or index

df = pd.DataFrame(np.array([[1,2,3], [2,3,4], [2,2,3]]), columns=['A','B','C'])

# Check out your DataFrame `df`
print(df)

# Define the new names of your columns
newcols = {
    'A': 'new_column_1', 
    'B': 'new_column_2', 
    'C': 'new_column_3'
}

# Use `rename()` to rename your columns
df.rename(columns=newcols, inplace=True)
display(df)

# Use `rename()` to your index
display(df.rename(index={1: 'a'}))


# カラム名を変更する
df = pd.DataFrame(data={"name":["apple","orange"], "price":[100,200]})
df.rename(columns={'price': 'quantity'})

   A  B  C
0  1  2  3
1  2  3  4
2  2  2  3


Unnamed: 0,new_column_1,new_column_2,new_column_3
0,1,2,3
1,2,3,4
2,2,2,3


Unnamed: 0,new_column_1,new_column_2,new_column_3
0,1,2,3
a,2,3,4
2,2,2,3


Unnamed: 0,name,quantity
0,apple,100
1,orange,200


In [83]:
# Replacing

df = pd.DataFrame(np.array([[1,2,3], [2,3,4], [2,2,3]]), columns=['A','B','C'])

# Check out your DataFrame `df`
display(df)
display(df.replace([1, 2],['x', 'y']))


Unnamed: 0,A,B,C
0,1,2,3
1,2,3,4
2,2,2,3


Unnamed: 0,A,B,C
0,x,y,3
1,y,3,4
2,y,y,3


In [119]:
# 正規表現で置換する
df = pd.DataFrame({'Region': ['Osaka', 'Tokyo', 'AIchi'], \
                                   'Speciality': ['Okonomiyaki\nI like it.', 'Monja\r\nI like it.', 'Misokatsu\r\nI hate it.']})
display(df)
df.replace({'\n|\r\n':'<br />'}, regex=True)

Unnamed: 0,Region,Speciality
0,Osaka,Okonomiyaki\nI like it.
1,Tokyo,Monja\r\nI like it.
2,AIchi,Misokatsu\r\nI hate it.


Unnamed: 0,Region,Speciality
0,Osaka,Okonomiyaki<br />I like it.
1,Tokyo,Monja<br />I like it.
2,AIchi,Misokatsu<br />I hate it.


In [111]:
# Removing unwanted parts

df = pd.DataFrame(np.array([["-1a",2,3], ["-2b",3,4], ["+3c","q",3]]), columns=['A','B','C'])
display(df)
df['A'] = df['A'].map(lambda x: x.lstrip('+-'))
display(df)

Unnamed: 0,A,B,C
0,-1a,2,3
1,-2b,3,4
2,+3c,q,3


Unnamed: 0,A,B,C
0,1a,2,3
1,2b,3,4
2,3c,q,3


In [123]:
# Splitting data

df = pd.DataFrame(np.array([["- 1a",2,3], ["- 2b",3,4], ["+ 3c","q",3]]), columns=['A','B','C'])

# Inspect your DataFrame `df`
display(df)

# Split out the two values in the third row
# Make it a Series
# Stack the values
display(df['A'].str)
a_series = df['A'].str.split(' ').apply(pd.Series)
a_series.rename(columns={0:"E", 1:"F"}, inplace=True)

# Get rid of the stack:
# Drop the level to line up with the DataFrame
#a_series.index = a_series.index.droplevel(-1)
display(a_series)

# Delete the `Ticket` column from your DataFrame
del df['A']

# Join the `ticketdf` DataFrame to `df`
df = df.join(pd.DataFrame(data=a_series))

# Check out the new `df`
display(df)

Unnamed: 0,A,B,C
0,- 1a,2,3
1,- 2b,3,4
2,+ 3c,q,3


<pandas.core.strings.StringMethods at 0x10868c940>

Unnamed: 0,E,F
0,-,1a
1,-,2b
2,+,3c


Unnamed: 0,B,C,E,F
0,2,3,-,1a
1,3,4,-,2b
2,q,3,+,3c


In [170]:
# apply function to specified row and columns

doubler = lambda x : x*2

df = pd.DataFrame(data=[[1,2,3], [4,5,6], [7,8,9]], columns=['A', 'B', 'C'])
display(df)
display(df.applymap(doubler))

display(pd.DataFrame(df['B'].apply(doubler)))
display(pd.DataFrame(df.loc[1].apply(doubler)))
display()
df.loc[1] = df.loc[1].apply(doubler)
display(df)
df.loc[:, 'A'] = df.A.apply(doubler)
display(df)

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


Unnamed: 0,A,B,C
0,2,4,6
1,8,10,12
2,14,16,18


Unnamed: 0,B
0,4
1,10
2,16


Unnamed: 0,1
A,8
B,10
C,12


Unnamed: 0,A,B,C
0,1,2,3
1,8,10,12
2,7,8,9


Unnamed: 0,A,B,C
0,2,2,3
1,16,10,12
2,14,8,9


In [175]:
# empty dataframe
display(pd.DataFrame(np.nan, columns=['A'], index=range(4)))

Unnamed: 0,A
0,
1,
2,
3,


In [201]:
# create new derived table out of original one
df = pd.DataFrame([["a","foo",3, 100],["a","bar",6,200],["b","baz",9,300],["b", "bar", 10,400]], columns=['A', 'B', 'C', 'D'])
display(df)
display(df.pivot(columns='B', index='A'))
display(df.pivot(columns='B', index='A', values='D'))

Unnamed: 0,A,B,C,D
0,a,foo,3,100
1,a,bar,6,200
2,b,baz,9,300
3,b,bar,10,400


Unnamed: 0_level_0,C,C,C,D,D,D
B,bar,baz,foo,bar,baz,foo
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,6.0,,3.0,200.0,,100.0
b,10.0,9.0,,400.0,300.0,


B,bar,baz,foo
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,200.0,,100.0
b,400.0,300.0,


In [132]:
# if you can't ensure uniquness of data, you can use pivot_table
# aggfunc argument is used to aggregate values.
# default aggfunc is 'mean'
df = pd.DataFrame([["a","foo",3, 100],["a","bar",6,200],["b","bar",9,300],["b", "bar", 10,400]], columns=['A', 'B', 'C', 'D'])
display(df)
display(df.pivot_table(index=['A', 'B']))
display(df.pivot_table(index=['A', 'B'], aggfunc='sum'))

df = pd.DataFrame(data={"name":["yamada","sato","takahashi","takeda","kishi"], \
                                              "score":[100,80,40,70,90],\
                                              "sex":['Male', 'Male', 'Female', 'Female', 'Male'],\
                                              "class":['A', 'A', 'B', 'B', 'A']})
display(df)
df.pivot_table(index=['class', 'sex'], values='score', aggfunc='mean')

Unnamed: 0,A,B,C,D
0,a,foo,3,100
1,a,bar,6,200
2,b,bar,9,300
3,b,bar,10,400


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
a,bar,6.0,200.0
a,foo,3.0,100.0
b,bar,9.5,350.0


Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
a,bar,6,200
a,foo,3,100
b,bar,19,700


Unnamed: 0,class,name,score,sex
0,A,yamada,100,Male
1,A,sato,80,Male
2,B,takahashi,40,Female
3,B,takeda,70,Female
4,A,kishi,90,Male


Unnamed: 0_level_0,Unnamed: 1_level_0,score
class,sex,Unnamed: 2_level_1
A,Male,90
B,Female,55


In [210]:
# Reshape Your DataFrame With melt()
# The `people` DataFrame
people = pd.DataFrame({'FirstName' : ['John', 'Jane'],
                       'LastName' : ['Doe', 'Austen'],
                       'BloodType' : ['A-', 'B+'],
                       'Weight' : [90, 64]})

display(people)
# Use `melt()` on the `people` DataFrame
display(pd.melt(people, id_vars=['FirstName', 'LastName'], var_name='measurements'))

Unnamed: 0,BloodType,FirstName,LastName,Weight
0,A-,John,Doe,90
1,B+,Jane,Austen,64


Unnamed: 0,FirstName,LastName,measurements,value
0,John,Doe,BloodType,A-
1,Jane,Austen,BloodType,B+
2,John,Doe,Weight,90
3,Jane,Austen,Weight,64


In [134]:
# Iterating data

df = pd.DataFrame([["a","foo",3, 100],["a","bar",6,200],["b","bar",9,300],["b", "bar", 10,400]], columns=['A', 'B', 'C', 'D'])

for index, row in df.iterrows():
    display(row['B'])
    
df = pd.DataFrame(data={"name":["yamada","sato","takahashi","takeda","kishi"], \
                                              "score":[100,80,40,70,90],\
                                              "sex":['Male', 'Male', 'Female', 'Female', 'Male'],\
                                              "class":['A', 'A', 'B', 'B', 'A']})
for index, row in df.iterrows():
    print("Score of", df.loc[index, 'name'], "=", df.loc[index, 'score'])

'foo'

'bar'

'bar'

'bar'

Score of yamada = 100
Score of sato = 80
Score of takahashi = 40
Score of takeda = 70
Score of kishi = 90
