In [1]:
import numpy as np
import pandas as pd

In [13]:
# Read the table (first CSV column becomes the row index) and pin the
# column order in a single chained expression.
d1 = (
    pd.read_csv('temp/p07_d1.txt', index_col=0)
      .reindex(columns=['State', 'City', 'Views', 'Likes'])
)
d1

Unnamed: 0,State,City,Views,Likes
0,NE,Page,10,4
1,KY,Stone,9,3
2,CO,Rye,3,0
3,CO,Rye,7,2
4,KY,Dema,4,1
5,KY,Keavy,2,1
6,CO,Rye,1,0
7,NE,Cairo,8,3
8,CO,Dumont,12,7


## Groupby

In [4]:
# Split the rows of d1 by their 'State' value; .groups maps every key to
# the row labels that belong to it.
g1 = d1.groupby(by='State')
g1.groups

{'CO': Int64Index([2, 3, 6, 8], dtype='int64'),
 'KY': Int64Index([1, 4, 5], dtype='int64'),
 'NE': Int64Index([0, 7], dtype='int64')}

In [5]:
# Iterating a GroupBy yields (key, sub-frame) pairs; show each sub-frame
# followed by its column totals.
for name, group in g1:
    total_views = group['Views'].sum()
    total_likes = group['Likes'].sum()
    print(name)
    print(group)
    print('Total Views: %d - Total Likes: %d\n\n' % (total_views, total_likes))

CO
  State    City  Views  Likes
2    CO     Rye      3      0
3    CO     Rye      7      2
6    CO     Rye      1      0
8    CO  Dumont     12      7
Total Views: 23 - Total Likes: 9


KY
  State   City  Views  Likes
1    KY  Stone      9      3
4    KY   Dema      4      1
5    KY  Keavy      2      1
Total Views: 15 - Total Likes: 5


NE
  State   City  Views  Likes
0    NE   Page     10      4
7    NE  Cairo      8      3
Total Views: 18 - Total Likes: 7




In [6]:
# Move State and City into a two-level row index, leaving Views and Likes
# as the only data columns.
index_columns = ['State', 'City']
d2 = d1.set_index(index_columns)
d2

Unnamed: 0_level_0,Unnamed: 1_level_0,Views,Likes
State,City,Unnamed: 2_level_1,Unnamed: 3_level_1
NE,Page,10,4
KY,Stone,9,3
CO,Rye,3,0
CO,Rye,7,2
KY,Dema,4,1
KY,Keavy,2,1
CO,Rye,1,0
NE,Cairo,8,3
CO,Dumont,12,7


## Aggregate

In [12]:
# Group on the outer index level (State) and total every column.
# The reduction is requested by name: passing NumPy callables such as
# np.sum to aggregate() is deprecated in recent pandas releases.
g2 = d2.groupby(level=[0])
print (g2.groups)
g2.aggregate('sum')

{'CO': MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[0, 0, 0, 0], [5, 5, 5, 2]],
           names=['State', 'City']), 'NE': MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[2, 2], [4, 0]],
           names=['State', 'City']), 'KY': MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[1, 1, 1], [6, 1, 3]],
           names=['State', 'City'])}


Unnamed: 0_level_0,Views,Likes
State,Unnamed: 1_level_1,Unnamed: 2_level_1
CO,23,9
KY,15,5
NE,18,7


In [14]:
# Group on both index levels (State, City) and total every column.
# The reduction is requested by name: passing NumPy callables such as
# np.sum to aggregate() is deprecated in recent pandas releases.
g3 = d2.groupby(level=[0, 1])
print (g3.groups)
g3.aggregate('sum')

{('CO', 'Dumont'): MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[0], [2]],
           names=['State', 'City']), ('KY', 'Dema'): MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[1], [1]],
           names=['State', 'City']), ('CO', 'Rye'): MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[0, 0, 0], [5, 5, 5]],
           names=['State', 'City']), ('KY', 'Keavy'): MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[1], [3]],
           names=['State', 'City']), ('NE', 'Cairo'): MultiIndex(levels=[['CO', 'KY', 'NE'], ['Cairo', 'Dema', 'Dumont', 'Keavy', 'Page', 'Rye', 'Stone']],
           labels=[[2], [0]],
           names=['State', 'City']), ('KY', 'Stone'): MultiIndex(levels=[['CO', 'KY', 'NE'], ['C

Unnamed: 0_level_0,Unnamed: 1_level_0,Views,Likes
State,City,Unnamed: 2_level_1,Unnamed: 3_level_1
CO,Dumont,12,7
CO,Rye,11,2
KY,Dema,4,1
KY,Keavy,2,1
KY,Stone,9,3
NE,Cairo,8,3
NE,Page,10,4


In [15]:
# Same grouping as above, but as_index=False asks for a result that is not
# indexed by the group keys.
# The reduction is requested by name: passing NumPy callables such as
# np.sum to aggregate() is deprecated in recent pandas releases.
g4 = d2.groupby(level=[0, 1], as_index=False)
g4.aggregate('sum')

Unnamed: 0,Views,Likes
0,12,7
1,11,2
2,4,1
3,2,1
4,9,3
5,8,3
6,10,4


In [16]:
# Several reductions at once give one output column per reduction.
# String names are used instead of np.sum / np.mean / np.std: handing NumPy
# callables to aggregate() is deprecated, and 'std' explicitly selects the
# pandas sample standard deviation (ddof=1) rather than NumPy's default.
d1[['State', 'Views']].groupby('State').aggregate(['sum', 'mean', 'std'])

Unnamed: 0_level_0,Views,Views,Views
Unnamed: 0_level_1,sum,mean,std
State,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
CO,23,5.75,4.856267
KY,15,5.0,3.605551
NE,18,9.0,1.414214


## Apply

In [17]:
# Render every float with a single decimal place from here on.
pd.set_option('display.float_format', '{:.1f}'.format)

def add_field(group):
    """Return a copy of ``group`` extended with two derived columns.

    Parameters
    ----------
    group : DataFrame
        Must contain the columns 'Views' and 'Likes'.

    Returns
    -------
    DataFrame
        A new frame with the original columns plus:
        'Tot.Views' - the sum of 'Views' over the group, repeated on every row;
        'Likes[%]'  - each row's 'Likes' as a percentage of the group total.
    """
    # Work on a copy: modifying the object handed in by groupby.apply is not
    # supported by pandas and would also alter the caller's data.
    result = group.copy()
    result['Tot.Views'] = group['Views'].sum()
    result['Likes[%]'] = 100.0 * group['Likes'] / group['Likes'].sum()
    return result

# add_field returns a frame with the same rows as its input, so the result
# should stay aligned with d1. group_keys=False stops pandas from prepending
# the State key as an extra index level (the default for apply in pandas >= 2.0).
d1.groupby('State', group_keys=False).apply(add_field)

Unnamed: 0,State,City,Views,Likes,Tot.Views,Likes[%]
0,NE,Page,10,4,18,57.1
1,KY,Stone,9,3,15,60.0
2,CO,Rye,3,0,23,0.0
3,CO,Rye,7,2,23,22.2
4,KY,Dema,4,1,15,20.0
5,KY,Keavy,2,1,15,20.0
6,CO,Rye,1,0,23,0.0
7,NE,Cairo,8,3,18,42.9
8,CO,Dumont,12,7,23,77.8


## Example

In [19]:
# Build a synthetic business-day series and smooth it with a 10-sample
# rolling mean. A seeded generator is used so that Restart & Run All
# reproduces the same numbers every time.
idx = pd.date_range('1999/5/28', periods=1500, freq='1B')
rng = np.random.RandomState(42)
s1 = pd.Series(rng.normal(5.5, 2, 1500), idx)
# min_periods equals the window, so the first 9 positions are NaN and are dropped.
s1 = s1.rolling(window=10, min_periods=10).mean().dropna()
s1

1999-06-10   6.4
1999-06-11   6.0
1999-06-14   5.8
1999-06-15   6.1
1999-06-16   6.3
1999-06-17   6.4
1999-06-18   6.4
1999-06-21   6.3
1999-06-22   6.5
1999-06-23   5.7
1999-06-24   5.7
1999-06-25   5.8
1999-06-28   5.6
1999-06-29   5.0
1999-06-30   5.0
1999-07-01   5.0
1999-07-02   4.8
1999-07-05   4.9
1999-07-06   4.7
1999-07-07   5.2
1999-07-08   5.1
1999-07-09   5.0
1999-07-12   4.8
1999-07-13   4.7
1999-07-14   4.4
1999-07-15   4.3
1999-07-16   4.4
1999-07-19   4.6
1999-07-20   4.6
1999-07-21   4.4
              ..
2005-01-14   5.8
2005-01-17   5.9
2005-01-18   5.5
2005-01-19   5.6
2005-01-20   5.9
2005-01-21   5.9
2005-01-24   5.3
2005-01-25   5.6
2005-01-26   5.8
2005-01-27   5.5
2005-01-28   5.4
2005-01-31   5.5
2005-02-01   5.3
2005-02-02   5.0
2005-02-03   4.8
2005-02-04   5.2
2005-02-07   5.6
2005-02-08   5.5
2005-02-09   5.6
2005-02-10   5.9
2005-02-11   6.1
2005-02-14   6.0
2005-02-15   6.5
2005-02-16   6.5
2005-02-17   6.6
2005-02-18   6.3
2005-02-21   6.2
2005-02-22   6

In [21]:
def my_groupby_key_year(timestamp):
    """Grouping key: return the ``year`` attribute of ``timestamp``."""
    return timestamp.year

def my_groupby_key_month(timestamp):
    """Grouping key: return the ``month`` attribute of ``timestamp``."""
    return timestamp.month

def my_normalization(group):
    """Standardize ``group``: subtract its mean, then divide by its std."""
    centered = group - group.mean()
    return centered / group.std()

Here we normalize the data on a monthly basis and check the mean and standard deviation on a yearly basis:

In [23]:
# Standardize the values within each calendar-month group. transform() is
# used because the result must keep the original date index: with apply(),
# pandas >= 2.0 prepends the month key to the index, and the later grouping
# by my_groupby_key_year would then receive tuples instead of timestamps.
t1 = s1.groupby(my_groupby_key_month).transform(my_normalization)
t1.head(8)

1999-06-10   1.7
1999-06-11   1.2
1999-06-14   0.8
1999-06-15   1.4
1999-06-16   1.6
1999-06-17   1.7
1999-06-18   1.9
1999-06-21   1.7
Freq: B, dtype: float64

In [24]:
# Per-year mean and standard deviation of the normalized series.
# String names replace np.mean / np.std: passing NumPy callables to
# aggregate() is deprecated, and 'std' explicitly selects the pandas sample
# standard deviation (ddof=1) rather than NumPy's default.
t1.groupby(my_groupby_key_year).aggregate(['mean', 'std'])

Unnamed: 0,mean,std
1999,-0.2,0.9
2000,0.1,0.8
2001,0.2,0.9
2002,0.1,1.1
2003,-0.3,1.1
2004,-0.1,0.9
2005,0.4,0.8


In [25]:
# Read the company table, using its first column as the row index, and
# preview the first five rows.
d3 = pd.read_csv('example_data/company.csv', index_col=0)
d3.head(n=5)

Unnamed: 0_level_0,Value,expenses,employees,Dimension
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dapibus Company,96 008,7124,78,Big
Pede Blandit Congue Company,61 562,8454,60,Big
Pede Suspendisse Associates,54 728,6641,4,Small
Dictum Associates,16 802,6498,89,Big
Dui Cras Pellentesque Ltd,93 954,5040,97,Big


In [26]:
# 'Value' is read as text with a space as thousands separator (e.g. '96 008');
# strip the spaces and convert to float.
# The leading astype(str) keeps this cell safe to re-run: once the column is
# already float, the values round-trip through text unchanged instead of
# failing on a missing .replace method.
d3['Value'] = d3['Value'].astype(str).str.replace(' ', '').astype(float)
d3.head()

Unnamed: 0_level_0,Value,expenses,employees,Dimension
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dapibus Company,96008.0,7124,78,Big
Pede Blandit Congue Company,61562.0,8454,60,Big
Pede Suspendisse Associates,54728.0,6641,4,Small
Dictum Associates,16802.0,6498,89,Big
Dui Cras Pellentesque Ltd,93954.0,5040,97,Big


In [27]:
# Average every remaining column within each 'Dimension' category.
d3.groupby(by='Dimension').mean()

Unnamed: 0_level_0,Value,expenses,employees
Dimension,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Big,49445.4,5474.9,61.1
Small,60947.8,4521.4,31.8
