In [34]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import itertools as it
from pprint import pprint


# Do not wrap wide DataFrame reprs across multiple blocks of columns.
pd.options.display.expand_frame_repr = False


In [14]:
# Toy gradebook: one row per (year, student, subject) holding a grade and a
# 0/1 pass flag.  Marks are listed per student-year in the order of `subjects`.
subjects = ("Math", "Computer Science", "English Literature")
marks_by_student_year = [
    (1, "Mary Smith", [(4, 1), (5, 1), (2, 0)]),
    (2, "Mary Smith", [(4, 1), (5, 1), (4, 1)]),
    (1, "John Brown", [(1, 0), (4, 1), (5, 1)]),
    (2, "John Brown", [(4, 1), (3, 0), (5, 1)]),
]

# Expand to flat (year, name, subject, grade, pass) tuples, keeping row order.
gradebook_rows = [
    (year, name, subject, grade, passed)
    for year, name, marks in marks_by_student_year
    for subject, (grade, passed) in zip(subjects, marks)
]

pdf = pd.DataFrame.from_records(
    gradebook_rows,
    columns=['year', 'name', 'subject', 'grade', 'pass'],
)

# Wrap the pandas frame as a Dask DataFrame split into two partitions.
df = dd.from_pandas(pdf, npartitions=2)

# Preview the first three rows.
df.head(n=3)

Unnamed: 0,year,name,subject,grade,pass
0,1,Mary Smith,Math,4,1
1,1,Mary Smith,Computer Science,5,1
2,1,Mary Smith,English Literature,2,0


In [35]:
def _lists_per_chunk(grouped):
    """Chunk step: turn each group's values into a Python list."""
    return grouped.apply(list)


def _merge_chunk_lists(grouped):
    """Agg step: flatten each group's collection of chunk lists into one list."""
    return grouped.apply(
        lambda chunks: [value for chunk in chunks for value in chunk]
    )


# Custom Dask aggregation that gathers all of a group's values into one list.
collect_list = dd.Aggregation(
    name='collect_list',
    chunk=_lists_per_chunk,
    agg=_merge_chunk_lists,
)

In [37]:
# Per (year, name): mean grade, the list of grades, and the number of passes.
# Each source column maps to a {result name: aggregation} dict, which yields
# two-level column labels (source column, result name).
grade_aggs = {'mean_grade': np.mean,
              'grades': collect_list}
pass_aggs = {'passes': 'sum'}

by_year_and_name = df.groupby(['year', 'name'])
ag = by_year_and_name.agg({
    'grade': grade_aggs,
    'pass': pass_aggs,
})

ag.compute()

Unnamed: 0_level_0,Unnamed: 1_level_0,grade,grade,pass
Unnamed: 0_level_1,Unnamed: 1_level_1,grades,mean_grade,passes
year,name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,John Brown,"[1, 4, 5]",3.333333,2
1,Mary Smith,"[4, 5, 2]",3.666667,2
2,John Brown,"[4, 3, 5]",4.0,2
2,Mary Smith,"[4, 5, 4]",4.333333,3


In [38]:
# Materialize the index of the aggregated frame: the group keys
# (year, name) form a MultiIndex.
ag.index.compute()

MultiIndex(levels=[[1, 2], [u'John Brown', u'Mary Smith']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=[u'year', u'name'])

In [39]:
# Column labels are read without .compute(); they are a two-level
# MultiIndex of (source column, result name).
ag.columns

MultiIndex(levels=[[u'grade', u'pass'], [u'grades', u'mean_grade', u'passes']],
           labels=[[0, 0, 1], [0, 1, 2]])

In [47]:
# Move the (year, name) group keys out of the index into ordinary columns.
ri = ag.reset_index()
ri.compute()

Unnamed: 0_level_0,year,name,grade,grade,pass
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,grades,mean_grade,passes
0,1,John Brown,"[1, 4, 5]",3.333333,2
1,1,Mary Smith,"[4, 5, 2]",3.666667,2
2,2,John Brown,"[4, 3, 5]",4.0,2
3,2,Mary Smith,"[4, 5, 4]",4.333333,3


In [48]:
# After reset_index the row index is a plain RangeIndex.
ri.index.compute()

RangeIndex(start=0, stop=4, step=1)

In [49]:
# The columns are still a two-level MultiIndex; 'year' and 'name' carry an
# empty string as their second-level label.
ri.columns

MultiIndex(levels=[[u'grade', u'pass', u'name', u'year'], [u'grades', u'mean_grade', u'passes', u'']],
           labels=[[3, 2, 0, 0, 1], [3, 3, 0, 1, 2]])

In [50]:
# Confirm the concrete type of the column labels.
type(ri.columns)

pandas.core.indexes.multi.MultiIndex

In [51]:
# First level of the column labels: the former index columns and the source
# column of each aggregate.
ri.columns.get_level_values(0)

Index([u'year', u'name', u'grade', u'grade', u'pass'], dtype='object')

In [52]:
# Second level: the result name of each aggregate; empty string for the
# former index columns.
ri.columns.get_level_values(1)

Index([u'', u'', u'grades', u'mean_grade', u'passes'], dtype='object')

In [55]:
# Replace the two-level column labels with flat names, one per column in
# the existing column order.
flat_column_names = ['year', 'name', 'grades', 'mean_grade', 'passes']
ri.columns = flat_column_names
ri.compute()

Unnamed: 0,year,name,grades,mean_grade,passes
0,1,John Brown,"[1, 4, 5]",3.333333,2
1,1,Mary Smith,"[4, 5, 2]",3.666667,2
2,2,John Brown,"[4, 3, 5]",4.0,2
3,2,Mary Smith,"[4, 5, 4]",4.333333,3


In [57]:
# List the attributes of the Dask DataFrame whose names start with 'to'
# (its export/conversion methods).
# A named loop variable is used instead of `_`: in IPython `_` holds the
# previous cell's output, and on Python 2 a list-comprehension variable leaks
# into the cell namespace, which would overwrite it.
[attr for attr in dir(ri) if attr.startswith('to')]

['to_bag',
 'to_csv',
 'to_delayed',
 'to_hdf',
 'to_html',
 'to_parquet',
 'to_records',
 'to_string',
 'to_timestamp']

In [60]:
# Convert to a NumPy record array; the row index is included as an
# 'index' field ahead of the data columns.
ri.to_records().compute()

rec.array([(0, 1, 'John Brown', list([1, 4, 5]), 3.33333333, 2),
           (1, 1, 'Mary Smith', list([4, 5, 2]), 3.66666667, 2),
           (2, 2, 'John Brown', list([4, 3, 5]), 4.        , 2),
           (3, 2, 'Mary Smith', list([4, 5, 4]), 4.33333333, 3)],
          dtype=[(u'index', '<i8'), (u'year', '<i8'), (u'name', 'O'), (u'grades', 'O'), (u'mean_grade', '<f8'), (u'passes', '<i8')])