In [None]:
# ----- DATA AGGREGATION AND GROUP OPERATIONS -----
import numpy as np
import pandas as pd

In [None]:
# Draw the random columns from a locally seeded generator so that this frame,
# and every aggregate computed from it below, is identical on each
# Restart & Run All.
rng = np.random.default_rng(0)
df = pd.DataFrame({ 'key1' : ['a','a','b','b','a'],
                    'key2' : ['one','two','one','two','one'],
                    'data1': rng.standard_normal(5),
                    'data2': rng.standard_normal(5)   })
df

In [None]:
# Scratch check using the standard-library statistics module.
# NOTE(review): the literals look like values copied from one particular run
# of the random df above — confirm; they will not match a fresh run.
import statistics as st
st.mean([0.566648, 0.566648, -0.828803])

In [None]:
# Median of a hard-coded sample; with an even number of values the two
# middle ones are averaged.
st.median([0.566648, 0.566648, -0.828803, 23, 23, 7])

In [None]:
# Group the data1 column by the values of key1 (a Series is used as the key).
grouped = df['data1'].groupby(df['key1'])

In [None]:
# Displays the GroupBy object itself rather than any computed result.
grouped

In [None]:
# Mean of data1 within each key1 group.
grouped.mean()

In [None]:
# Summary statistics of data1 for each key1 group.
grouped.describe()

In [None]:
# Two grouping keys: the result is indexed by (key1, key2) pairs.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

In [None]:
# Move the inner index level (key2) into columns.
means.unstack()

In [None]:
# NOTE: `grouped` is rebound here — from the data1-only grouping above to a
# grouping of the whole frame by the key1 column name.
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)

In [None]:
def peak_to_peak(arr):
    """Return the range of ``arr``: its largest value minus its smallest.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods whose
    results can be subtracted.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest


In [None]:
# Aggregate only the numeric columns. peak_to_peak subtracts min from max,
# which is undefined for the string column key2; pandas 2.0+ raises a
# TypeError for such columns instead of silently dropping them.
grouped[['data1', 'data2']].agg(peak_to_peak)

In [None]:
# ---- Column-wise and Multiple Function Application ----
# Load the tips data and derive each tip as a fraction of its total bill.
tips = pd.read_csv('examples/tips.csv').assign(
    tip_pct=lambda t: t['tip'] / t['total_bill']
)
tips

In [None]:
# Group the tips by day and smoker; this grouping is reused by the
# aggregation cells that follow.
grouped = tips.groupby(['day','smoker'])

In [None]:
# Select a single column from the grouping, then aggregate by function name.
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')

In [None]:
# A list of functions gives one result column per function; built-in names
# and custom callables (peak_to_peak) can be mixed.
grouped_pct.agg(['mean','std', peak_to_peak])

In [None]:
# (name, function) tuples set the result column names.
# 'std' is passed by name instead of np.std: pandas currently substitutes its
# own std for np.std here and has deprecated that substitution, so the string
# keeps the result stable across versions.
grouped_pct.agg([('foo','mean'), ('bar', 'std')])

In [None]:
# Apply the same list of aggregations to several columns at once.
functions = ['count','mean','max']
# Columns are selected with a list: selecting with a bare tuple
# (grouped['a','b']) was deprecated and raises in pandas 2.0+.
result = grouped[['tip_pct','total_bill']].agg(functions)
result

In [None]:
# Custom (label, function) pairs applied to both columns.
# 'var' is passed by name instead of np.var (pandas has deprecated swapping
# NumPy callables for its own implementation inside agg).
# NOTE(review): 'Duchschnitt' looks like a typo for 'Durchschnitt'; the label
# is left unchanged because it is part of the displayed output — confirm.
ftuples = [('Duchschnitt','mean'), ('Abweichung','var')]
# List selection: tuple selection of several columns raises in pandas 2.0+.
grouped[['tip_pct','total_bill']].agg(ftuples)

In [None]:
# A dict maps each column to its own aggregation.
# 'max' is passed by name instead of np.max, which pandas maps to its own max
# today but has deprecated doing so.
grouped.agg({'tip': 'max', 'size': 'sum'})

In [None]:
# Per-column aggregations: several statistics for tip_pct, a single sum for
# size.
grouped.agg({'tip_pct': ['min','max','mean','std'], 'size': 'sum'})

In [None]:
# --- Returning Aggregated Data without Row Indexes
# as_index=False keeps day/smoker as ordinary columns instead of the index.
# numeric_only=True: since pandas 2.0, mean() raises on non-numeric columns
# instead of silently skipping them, so any text columns left in tips
# (the file's columns are not visible here — TODO confirm) are excluded
# explicitly.
tips.groupby(['day','smoker'], as_index=False).mean(numeric_only=True)

In [None]:
# --- Misc. tests 

In [None]:
# Grouping keys can be arrays of the right length, not only columns of df.
states = np.array(['New Jersey','California','California','New Jersey','New Jersey'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()

In [None]:
# Group means of the data columns for each (key1, key2) pair.
df.groupby([df['key1'], df['key2']]).mean()

In [None]:
# Number of rows in each (key1, key2) group.
df.groupby([df['key1'], df['key2']]).size()

In [None]:
# Iterating over groups: each step yields the group key and that group's rows
for name, group in df.groupby('key1'):
    print(name)
    print(group)

In [None]:
# Materialise the (key, sub-frame) pairs into a dict keyed by key1 value.
pieces = dict(list(df.groupby('key1')))
pieces

In [None]:
# The rows of df whose key1 is 'b'.
pieces['b']

In [None]:
# Group by axis = 1, i.e. group the columns rather than the rows.
# The per-column dtypes shown here are what the columns get grouped by.
df.dtypes

In [None]:
grouped_y_axis = df.groupby(df.dtypes, axis=1)

In [None]:
for dtype, group in grouped_y_axis:
    print(dtype)
    print(group)

In [None]:
# ---------- GroupBy: apply (Split Apply Combine) ---------------
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from; it is not modified.
    n : int, default 5
        Number of rows to keep. If ``df`` has fewer rows, all are returned.
    column : str, default 'tip_pct'
        Column to rank the rows by.

    Returns
    -------
    DataFrame
        The selected rows sorted ascending by ``column`` (largest last).
    """
    # tail(n) instead of [-n:]: the slice [-0:] would return every row for
    # n == 0, whereas tail(0) returns an empty frame. For any other n the two
    # select the same rows.
    return df.sort_values(by=column).tail(n)

In [None]:
# The six rows of tips with the largest tip_pct (top's default column).
top(tips, n=6)

In [None]:
# Run top on each smoker group; the per-group results are combined.
tips.groupby('smoker').apply(top)

In [None]:
# Arguments after the function are passed through to top for each group.
tips.groupby(['smoker','day']).apply(top, n=1, column='total_bill')

In [None]:
# -- suppressing the Group Keys
# group_keys=False leaves the group labels out of the result's index.
tips.groupby('smoker', group_keys=False).apply(top)

In [None]:
# ----- Quantile and Bucket analysis
# Draw both columns from a locally seeded generator so the buckets and
# per-bucket statistics computed below are reproducible across runs.
rng = np.random.default_rng(12345)
frame = pd.DataFrame({'data1': rng.standard_normal(1000),
                      'data2': rng.standard_normal(1000) })
frame.head()


In [None]:
# --- Equal-length buckets: pd.cut splits the range of data1 into 4 intervals
# of equal width. Despite the variable name these are not quantiles; the
# pd.qcut cell further down produces equal-size buckets.
quantiles = pd.cut(frame.data1, 4)
quantiles[:10]

In [None]:
def get_stats(group):
    """Summarise ``group`` as a dict with keys 'min', 'max', 'count', 'mean'.

    Each value is the result of calling the method of the same name on
    ``group``.
    """
    summary = {}
    summary['min'] = group.min()
    summary['max'] = group.max()
    summary['count'] = group.count()
    summary['mean'] = group.mean()
    return summary

In [None]:
# Group data2 by the bucket each row's data1 value fell into.
# `quantiles` is categorical; observed=False states the current behaviour
# (keep every bucket, even an empty one) explicitly, because relying on the
# default is deprecated and the default is slated to change.
grouped = frame.data2.groupby(quantiles, observed=False)
grouped.apply(get_stats)

In [None]:
# Move the inner index level of the per-bucket statistics into columns.
grouped.apply(get_stats).unstack()

In [None]:
# --- Quantiles with equal-size buckets
# labels=False makes qcut return integer bucket numbers instead of interval
# labels; `grouped` is rebound to the new 10-bucket grouping.
grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

In [None]:
# ---- Example: Random Sampling and Permutation ----
# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
# Point values for one suit (ace = 1, 2-10 at face value, three face cards
# worth 10 each), repeated once per suit.
card_vals = [*range(1, 11), 10, 10, 10] * 4
base_names = ['A', *range(2, 11), 'J', 'K', 'Q']
# Card labels are rank followed by suit letter, e.g. 'AH', '10S'.
cards = []
for suit in suits:
    cards += [f"{rank}{suit}" for rank in base_names]
deck = pd.Series(card_vals, index=cards)


In [40]:
def draw(deck, n=5):
    """Return ``n`` randomly sampled entries of ``deck`` (5 by default).

    The selection is delegated to ``deck.sample``.
    """
    hand = deck.sample(n)
    return hand

In [41]:
# Draw 5 random cards from the deck (5 is draw's default n). No seed is set,
# so the hand differs from run to run.
draw(deck)

10H    10
AH      1
KH     10
5C      5
10S    10
dtype: int64

In [42]:
# Draw 2 random cards from each suit.
def get_suits(card):
    """Return the suit of a card label, i.e. its last character."""
    return card[-1]

# The function is applied to each index label of deck to form the groups.
deck.groupby(get_suits).apply(draw, n=2)


C  6C     6
   JC    10
D  3D     3
   6D     6
H  3H     3
   KH    10
S  2S     2
   6S     6
dtype: int64