In [2]:
# Numerical and tabular libraries used throughout the cells below.
import numpy as np
import pandas as pd
# Remember the current row-display limit, then cap displayed frames at 20 rows.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random draws are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default matplotlib figure size of (10, 6).
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

In [3]:
# Demo frame: two label columns (key1, key2) used as grouping keys and two
# columns of random normal values (data1 is drawn before data2).
df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randn(5),
        "data2": np.random.randn(5),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.204708,1.393406
1,a,two,0.478943,0.092908
2,b,one,-0.519439,0.281746
3,b,two,-0.55573,0.769023
4,a,one,1.965781,1.246435


In [6]:
# Group the data1 column by the values of key1.
grouped = df['data1'].groupby(df['key1'])
grouped # displays the GroupBy object itself, not any aggregated values

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x11100a780>

In [7]:
# Mean of the grouped values, one entry per group key.
grouped.mean()

key1
a    0.746672
b   -0.537585
Name: data1, dtype: float64

In [8]:
# Mean of data1 for every (key1, key2) combination. The result is a Series
# whose index has two levels, key1 and key2.
# Fix: the aggregation is the method call .mean(); the GroupBy object has no
# attribute named `means`.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.880536
      two     0.478943
b     one    -0.519439
      two    -0.555730
Name: data1, dtype: float64

In [15]:
# The printed message (Japanese) says: data1 was simply grouped by the two
# keys key1 (a or b) and key2 (one or two).
print('data1に関してkey1(aかb)とkey2(oneかtwoか)の二つのキーでグループ化しただけ ')
# Pivot the inner index level (key2) into columns: rows are key1, columns are key2.
means.unstack()

data1に関してkey1(aかb)とkey2(oneかtwoか)の二つのキーでグループ化しただけ 


key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.880536,0.478943
b,-0.519439,-0.55573


In [17]:
# Grouping keys do not have to be columns of df: any arrays with one entry
# per row can be used.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
# The printed message (Japanese) says: group by the two keys states and
# years, and assign data1 to the groups.
print('二つのキーstatesとyearsに関してグループ化し、data1を割り当てる')
df['data1'].groupby([states, years]).mean()

二つのキーstatesとyearsに関してグループ化し、data1を割り当てる


California  2005    0.478943
            2006   -0.519439
Ohio        2005   -0.380219
            2006    1.965781
Name: data1, dtype: float64

In [18]:
# Per-key1 mean of the numeric columns (data1, data2).
# numeric_only=True explicitly leaves out the string column key2, which
# cannot be averaged; recent pandas raises a TypeError instead of silently
# dropping such columns.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.746672,0.910916
b,-0.537585,0.525384


In [19]:
# Group by both key columns; the remaining columns (data1, data2) are averaged.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.880536,1.31992
a,two,0.478943,0.092908
b,one,-0.519439,0.281746
b,two,-0.55573,0.769023


In [20]:

df.groupby(['key1', 'key2']).size() # number of rows in each (key1, key2) group

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [24]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name) # the group key: 'a', then 'b'
    print(group) # the rows of df belonging to that key

a
  key1 key2     data1     data2
0    a  one -0.204708  1.393406
1    a  two  0.478943  0.092908
4    a  one  1.965781  1.246435
b
  key1 key2     data1     data2
2    b  one -0.519439  0.281746
3    b  two -0.555730  0.769023


In [27]:
# With two grouping keys, each group key is a (key1, key2) tuple.
for (k1,k2) , group in df.groupby(['key1', 'key2']):
    print(k1, k2) # k1 is 'a' or 'b'; k2 is 'one' or 'two'
    print(group)

a one
  key1 key2     data1     data2
0    a  one -0.204708  1.393406
4    a  one  1.965781  1.246435
a two
  key1 key2     data1     data2
1    a  two  0.478943  0.092908
b one
  key1 key2     data1     data2
2    b  one -0.519439  0.281746
b two
  key1 key2    data1     data2
3    b  two -0.55573  0.769023


In [28]:
# Map each key1 value to the sub-frame holding its rows.
pieces = {key: rows for key, rows in df.groupby('key1')}

In [31]:
pieces # {'a': rows with key1 == 'a', 'b': rows with key1 == 'b'}

{'a':   key1 key2     data1     data2
 0    a  one -0.204708  1.393406
 1    a  two  0.478943  0.092908
 4    a  one  1.965781  1.246435, 'b':   key1 key2     data1     data2
 2    b  one -0.519439  0.281746
 3    b  two -0.555730  0.769023}

In [33]:
# Look up the sub-frame stored under group key 'b'.
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.519439,0.281746
3,b,two,-0.55573,0.769023


In [34]:
# dtype of each column: the key columns are object, the data columns float64.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [35]:
# Group the COLUMNS (axis=1) of df by their dtype.
# NOTE(review): grouping with axis=1 is reported as deprecated in newer
# pandas releases — confirm against the installed version.
grouped = df.groupby(df.dtypes,axis=1)

In [37]:
# Each iteration yields a dtype and the sub-frame of columns having that dtype.
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.204708  1.393406
1  0.478943  0.092908
2 -0.519439  0.281746
3 -0.555730  0.769023
4  1.965781  1.246435
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [47]:
# Single brackets select one column, giving a Series result. This is not the
# last expression in the cell, so its value is not displayed.
df.groupby('key1')['data1'].mean()
df.groupby('key1')[['data2']].mean() # double brackets [[...]] give a DataFrame result

Unnamed: 0_level_0,data2
key1,Unnamed: 1_level_1
a,0.910916
b,0.525384


In [50]:
# Mean of data2 only, for each (key1, key2) group, returned as a DataFrame.
df.groupby(['key1','key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,1.31992
a,two,0.092908
b,one,0.281746
b,two,0.769023


In [67]:
# 5x5 frame of random normal values: one row per person, columns a..e.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)
# Blank out columns b and c in the third row (Wes) to introduce missing values.
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,1.270025,-0.974378,-0.634709,-0.395701,-0.289436
Steve,-0.734297,-0.728505,0.838775,0.266893,0.721194
Wes,0.910983,,,1.296608,0.252275
Jim,1.127481,-0.568363,0.309362,-0.577385,-1.168634
Travis,-0.82502,-2.644409,-0.152986,-0.751921,-0.132609


In [72]:
# Column label -> group label. people has no column 'f', so the 'orange'
# entry matches nothing and does not appear in the result.
mapping = {'a' : 'red','b': 'red','c':'blue','d':'blue','e': 'red','f' : 'orange'}
# Group the columns (axis=1) by their mapped label, then sum within each label.
by_column = people.groupby(mapping,axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,-1.03041,0.006211
Steve,1.105668,-0.741607
Wes,1.296608,1.163258
Jim,-0.268023,-0.609516
Travis,-0.904907,-3.602039


In [73]:
# The same column -> label mapping, held as a Series instead of a dict.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [76]:
# A Series works as a column mapping too; count() tallies non-missing values
# per label, so Wes's row is lower because of its NaN entries.
people.groupby(map_series,axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [77]:
# Show people again for reference.
people

Unnamed: 0,a,b,c,d,e
Joe,1.270025,-0.974378,-0.634709,-0.395701,-0.289436
Steve,-0.734297,-0.728505,0.838775,0.266893,0.721194
Wes,0.910983,,,1.296608,0.252275
Jim,1.127481,-0.568363,0.309362,-0.577385,-1.168634
Travis,-0.82502,-2.644409,-0.152986,-0.751921,-0.132609


In [79]:
people.groupby(len).sum() # a function can be the key: len is applied to each index label, grouping by name length

Unnamed: 0,a,b,c,d,e
3,3.308488,-1.542742,-0.325347,0.323522,-1.205795
5,-0.734297,-0.728505,0.838775,0.266893,0.721194
6,-0.82502,-2.644409,-0.152986,-0.751921,-0.132609


In [82]:
key_list = ['one', 'one', 'one', 'two', 'two'] # one extra grouping label per row of people
# Functions and lists can be mixed as keys: group by (name length, key_list entry).
people.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.910983,-0.974378,-0.634709,-0.395701,-0.289436
3,two,1.127481,-0.568363,0.309362,-0.577385,-1.168634
5,one,-0.734297,-0.728505,0.838775,0.266893,0.721194
6,two,-0.82502,-2.644409,-0.152986,-0.751921,-0.132609


In [88]:
# Two parallel arrays that define a two-level column index.
arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
# Fix: the right-hand side of this assignment was missing (a syntax error).
# Build the MultiIndex from `arrays`; the level names 'blue' and 'red' are
# the ones shown in the rendered table for this cell.
columns = pd.MultiIndex.from_arrays(arrays, names=['blue', 'red'])
# 4x4 frame of random normal values under the hierarchical columns.
hier = pd.DataFrame(np.random.randn(4, 4), columns=columns)
hier

blue,1,1,2,2
red,red,blue,red,blue
0,0.940489,-0.642437,1.040179,-1.082922
1,0.429214,-0.236224,0.641818,-0.331661
2,1.394072,-1.076742,-0.192466,-0.871188
3,0.420852,-1.211411,-0.258867,-0.581647


In [91]:
# Inspect the MultiIndex built from `arrays`, with its levels named 'blue' and 'red'.
pd.MultiIndex.from_arrays(arrays,names=['blue','red'])

MultiIndex(levels=[[1, 2], ['blue', 'red']],
           labels=[[0, 0, 1, 1], [1, 0, 1, 0]],
           names=['blue', 'red'])

In [94]:
# Two-level column index: outer level 'cty' (country), inner level 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)

In [97]:
# 4x5 frame of random normal values under the (cty, tenor) column index.
hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns)
hier_df # a MultiIndex (an index with two or more levels) is used here as the columns

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.170506,-1.158087,1.104599,0.634238,1.259683
1,0.964931,-0.434446,-0.879603,-0.694838,1.226374
2,0.457279,0.115699,1.014042,-1.135008,-0.263371
3,1.306425,-1.610841,-1.026621,1.241573,-0.15676


In [98]:
# Group the columns by the 'cty' level of the column index and count the
# values per row in each country.
hier_df.groupby(level='cty',axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [102]:
# Bare expression that is not last in the cell, so nothing is displayed for it.
df
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9) # the 0.9 quantile (90th percentile) of data1 in each group

key1
a    1.668413
b   -0.523068
Name: data1, dtype: float64

In [105]:
def peak_to_peak(arr):
    """Return the range of ``arr``: its maximum minus its minimum.

    ``arr`` must provide ``.max()`` and ``.min()`` methods whose results can
    be subtracted (for example a NumPy array or a pandas Series).
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [107]:
# agg accepts a user-defined aggregation function.
# Restrict it to the numeric columns: peak_to_peak subtracts values, which is
# not defined for the strings in key2, and recent pandas raises instead of
# silently dropping such columns.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.170488,1.300498
b,0.036292,0.487276


In [108]:
# Summary statistics (count, mean, std, min, quartiles, max) per group.
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.746672,1.109736,-0.204708,0.137118,0.478943,1.222362,1.965781,3.0,0.910916,0.712217,0.092908,0.669671,1.246435,1.31992,1.393406
b,2.0,-0.537585,0.025662,-0.55573,-0.546657,-0.537585,-0.528512,-0.519439,2.0,0.525384,0.344556,0.281746,0.403565,0.525384,0.647203,0.769023


In [111]:
# Load the tipping data set from tips.csv in the working directory.
tips = pd.read_csv('tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill'] # tip as a fraction of the total bill
# Show the first six rows.
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [114]:
# Group the tips by day and smoker status, then pick out the tip_pct column.
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg(['mean','std',peak_to_peak]) # agg accepts a list to apply several aggregations at once

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [116]:
grouped_pct.agg([('foo','mean'),('bar','std')]) # (name, function) tuples set the output column names

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [118]:
# Apply the same list of aggregations to two columns at once; the result has
# hierarchical columns (source column, aggregation name).
functions = ['count', 'mean', 'max']
# Fix: several columns must be selected with a list (double brackets).
# Indexing a GroupBy with a bare tuple of names is the deprecated form and is
# rejected by recent pandas.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [119]:
# (output name, aggregation) pairs: 'Durchschnitt' is the mean, 'Abweichung'
# the variance.
# The string 'var' replaces np.var so the sample variance shown in the saved
# output is requested explicitly, rather than through the deprecated
# NumPy-callable alias.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# Fix: select the two columns with a list, not a bare tuple of names.
grouped[['tip_pct', 'total_bill']].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [120]:
# To apply different functions to different columns, pass a dict that maps
# column name -> aggregation.
# The string 'max' replaces np.max: passing NumPy callables to agg relies on
# a deprecated alias in recent pandas, while the string names the built-in
# group-wise maximum directly.
grouped.agg({'tip': 'max', 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [121]:
# A dict value may itself be a list: four statistics for tip_pct, one for size.
grouped.agg({'tip_pct' : ['min','max','mean','std'],'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


In [122]:
# as_index=False keeps day and smoker as ordinary columns instead of moving
# them into the index.
# numeric_only=True restricts the mean to the numeric columns; tips also has
# the string column time, which cannot be averaged and makes recent pandas
# raise a TypeError.
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


In [123]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : name of the column to rank by (default ``'tip_pct'``).

    Returns
    -------
    The selected rows sorted ascending by ``column``, so the largest value
    comes last.
    """
    # tail(n) rather than [-n:]: with n == 0 the slice [-0:] is [0:] and would
    # return every row, whereas tail(0) correctly returns no rows.
    return df.sort_values(by=column).tail(n)
# The six rows of tips with the highest tip_pct.
top(tips,n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [125]:
# Call top on each smoker group separately; the results are combined with the
# group key as the outer index level.
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [135]:
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill') # apply runs a function per group; extra arguments are passed on to it

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [130]:
# Summary statistics of tip_pct for smokers and non-smokers.
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [131]:
# Reshape into a Series indexed by (statistic, smoker).
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [134]:
# Per-group describe(), written out as an explicit function.
f = lambda x: x.describe()
grouped.apply(f) # a lambda can be passed to apply as well

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fri,No,count,4.000000,4.000000,4.000000,4.000000
Fri,No,mean,18.420000,2.812500,2.250000,0.151650
Fri,No,std,5.059282,0.898494,0.500000,0.028123
Fri,No,min,12.460000,1.500000,2.000000,0.120385
Fri,No,25%,15.100000,2.625000,2.000000,0.137239
Fri,No,50%,19.235000,3.125000,2.000000,0.149241
Fri,No,75%,22.555000,3.312500,2.250000,0.163652
Fri,No,max,22.750000,3.500000,3.000000,0.187735
Fri,Yes,count,15.000000,15.000000,15.000000,15.000000
Fri,Yes,mean,16.813333,2.714000,2.066667,0.174783


In [139]:
tips.groupby('smoker', group_keys=False).apply(top) # group_keys=False leaves the smoker key out of the result's index

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [141]:
# 1000 rows of random normal values in two columns (data1 is drawn first).
frame = pd.DataFrame(
    {
        'data1': np.random.randn(1000),
        'data2': np.random.randn(1000),
    }
)
frame

Unnamed: 0,data1,data2
0,0.258614,-0.156241
1,-0.087471,1.027193
2,0.590119,-0.083484
3,-1.219580,-2.909373
4,0.272788,0.728884
5,-2.691584,-0.319469
6,1.567780,-0.420456
7,0.265030,-1.116827
8,-0.929412,0.303514
9,0.885587,-0.004398


In [148]:
# Despite the name, these are four equal-WIDTH bins over the range of data1
# (see the interval edges in the output), not equal-count quartiles.
quartiles = pd.cut(frame.data1,4) # cut produces categorical data
# Show the bin assigned to each of the first ten rows.
quartiles[:10]

0     (-0.467, 1.074]
1     (-0.467, 1.074]
2     (-0.467, 1.074]
3    (-2.008, -0.467]
4     (-0.467, 1.074]
5    (-3.555, -2.008]
6      (1.074, 2.615]
7     (-0.467, 1.074]
8    (-2.008, -0.467]
9     (-0.467, 1.074]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.555, -2.008] < (-2.008, -0.467] < (-0.467, 1.074] < (1.074, 2.615]]

In [154]:
def get_stats(group):
    """Summarise ``group`` as a dict with keys 'min', 'max', 'count', 'mean'.

    Each value is the result of calling the method of the same name on
    ``group``.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}
# Group data2 by the data1 bin that each row falls into.
grouped = frame.data2.groupby(quartiles)
# Compute the stats for every bin, then pivot the statistic names into columns.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.555, -2.008]",23.0,1.388465,0.057203,-1.460094
"(-2.008, -0.467]",303.0,2.531127,-0.026138,-2.909373
"(-0.467, 1.074]",516.0,3.366626,0.01262,-2.969411
"(1.074, 2.615]",158.0,3.525865,0.04487,-3.64586


In [150]:
# Quantile-based binning of data1 into 10 bins; labels=False returns the bin
# numbers instead of interval labels.
grouping = pd.qcut(frame.data1,10,labels=False)
# NOTE(review): this only builds the GroupBy object; no aggregation is applied.
# The saved output below looks like a plain data2 Series, so it was presumably
# produced by a different version of this cell — confirm and re-run.
frame.data2.groupby(grouping)


0     -0.156241
1      1.027193
2     -0.083484
3     -2.909373
4      0.728884
5     -0.319469
6     -0.420456
7     -1.116827
8      0.303514
9     -0.004398
         ...   
990    0.417915
991    0.205153
992    1.582765
993   -0.163192
994    0.618774
995    0.395566
996   -1.229259
997    0.750435
998    0.199873
999   -0.634886
Name: data2, Length: 1000, dtype: float64