# Pandas Groupby Love

Source code from the Medium article ["Pandas Groupby Love"](https://towardsdatascience.com/pandas-groupby-love-5b3bce19c35e), written by [Josh Johnson](https://towardsdatascience.com/pandas-groupby-love-5b3bce19c35e).

## Import Libraries

In [1]:
import pandas as pd
import zipfile
import wget

## Open University Learning Analytics Dataset

In [2]:
url = 'https://analyse.kmi.open.ac.uk/open_dataset/download'

# Save the archive under a fixed local name and reuse it on later runs, so
# re-executing this cell does not fetch the ~47 MB file again.
# `zipfile.is_zipfile` returns False for a missing or non-zip file, so either
# case triggers a fresh download.
local_archive = 'open_university_dataset.zip'

if zipfile.is_zipfile(local_archive):
    filename = local_archive
else:
    # `wget.download` returns the path it actually wrote to.
    filename = wget.download(url, out=local_archive)

# Left open on purpose: later cells read members straight from `zpf`.
zpf = zipfile.ZipFile(filename)

100% [........................................................................] 46750706 / 46750706

In [4]:
chunksize = 100000
list_ = []

# Read the CSV member of the archive in pieces of `chunksize` rows.
for chunk in pd.read_csv(zpf.open('studentVle.csv'), chunksize=chunksize):
    list_.append(chunk)

# Concatenate once, after all chunks are collected. Doing this inside the
# loop rebuilt the whole frame for every chunk (quadratic copying) only to
# throw the intermediate results away.
student_vle = pd.concat(list_, axis=0)

student_vle

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1
...,...,...,...,...,...,...
10655275,GGG,2014J,675811,896943,269,3
10655276,GGG,2014J,675578,896943,269,1
10655277,GGG,2014J,654064,896943,269,3
10655278,GGG,2014J,654064,896939,269,1


## Count Aggregator

In [5]:
# Bucket every interaction row by the student it belongs to.
student_groups = student_vle.groupby(by='id_student')

# `count()` tallies the non-null entries of each remaining column per student.
activity_counts = student_groups.count()

activity_counts.head()

Unnamed: 0_level_0,code_module,code_presentation,id_site,date,sum_click
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6516,662,662,662,662,662
8462,304,304,304,304,304
11391,196,196,196,196,196
23629,59,59,59,59,59
23698,305,305,305,305,305


## Mean Average Aggregator

In [6]:
# Average only the numeric columns. `code_module` and `code_presentation`
# hold strings; pandas >= 2.0 raises a TypeError when asked to average them
# instead of silently dropping them as older releases did.
average_clicks = student_groups.mean(numeric_only=True)

# NOTE(review): the label says "median", but the value printed is the mean of
# the per-student mean clicks. The text is left unchanged so it matches the
# recorded output.
print('Mean of median clicks per activity is', average_clicks['sum_click'].mean())

average_clicks.head()

Mean of median clicks per activity is 3.277612411690169


Unnamed: 0_level_0,id_site,date,sum_click
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6516,877282.676737,110.483384,4.216012
8462,675792.838816,37.128289,2.157895
11391,546697.668367,102.132653,4.765306
23629,542863.067797,43.033898,2.728814
23698,911160.937705,85.639344,2.983607


In [8]:
# Pair each student's row count with their mean clicks. Both inputs are
# Series indexed by `id_student`, which is what the join is keyed on.
activities_and_clicks = pd.merge(
    left=activity_counts['id_site'],
    right=average_clicks['sum_click'],
    on='id_student',
    how='inner',
)

# Replace the source column names with reader-friendly labels.
cols = ['Total Activities Engaged', 'Average Clicks per Activity']
activities_and_clicks.columns = cols

activities_and_clicks.head()

Unnamed: 0_level_0,Total Activities Engaged,Average Clicks per Activity
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1
6516,662,4.216012
8462,304,2.157895
11391,196,4.765306
23629,59,2.728814
23698,305,2.983607


## Other Groupings

In [11]:
# Same count/mean summary as before, but grouped by module instead of student.
module_group = student_vle.groupby('code_module')

# numeric_only=True keeps the string column `code_presentation` out of the
# mean; pandas >= 2.0 raises a TypeError on it otherwise.
module_averages = module_group.mean(numeric_only=True)
module_counts = module_group.count()

# Both Series are indexed by `code_module`, so the join is keyed on that name.
modules = pd.merge(module_counts['id_site'],
                   module_averages['sum_click'],
                   how='inner',
                   on='code_module')

modules.columns = ['Total Activities Engaged', 'Average Clicks per Activity']

modules

Unnamed: 0_level_0,Total Activities Engaged,Average Clicks per Activity
code_module,Unnamed: 1_level_1,Unnamed: 2_level_1
AAA,350298,3.558833
BBB,1567564,3.339051
CCC,1207827,3.876506
DDD,2166486,2.552562
EEE,961433,4.144051
FFF,4014499,4.382595
GGG,387173,3.445189


## Multi-Indexing

In [12]:
# One group per (student, presentation, module) combination.
cols = ['id_student', 'code_presentation', 'code_module']
reg_groups = student_vle.groupby(by=cols)

# Grouping on several keys yields a MultiIndex with one level per key.
reg_averages = reg_groups.mean()

reg_averages.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id_site,date,sum_click
id_student,code_presentation,code_module,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6516,2014J,AAA,877282.676737,110.483384,4.216012
8462,2013J,DDD,673952.073333,37.49,2.153333
8462,2014J,DDD,813850.25,10.0,2.5
11391,2013J,AAA,546697.668367,102.132653,4.765306
23629,2013B,BBB,542863.067797,43.033898,2.728814


## Tuple Indices

In [14]:
# Pick the column first, then address a single row with a full
# (id_student, code_presentation, code_module) tuple.
reg_averages['sum_click'].loc[(8462, '2013J', 'DDD')]

2.1533333333333333

In [15]:
# Same three keys as before, ordered module -> presentation -> student so the
# outer index levels describe the course rather than the learner.
cols = ['code_module', 'code_presentation', 'id_student']
groupby_module = student_vle.groupby(by=cols)

# Rebinds `module_averages` to this MultiIndexed frame.
module_averages = groupby_module.mean()

module_averages.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id_site,date,sum_click
code_module,code_presentation,id_student,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAA,2013J,11391,546697.668367,102.132653,4.765306
AAA,2013J,28400,546704.453488,86.993023,3.337209
AAA,2013J,30268,546721.092105,2.355263,3.697368
AAA,2013J,31604,546712.515837,106.147813,3.254902
AAA,2013J,32885,546721.215909,91.934659,2.9375


In [16]:
# A partial tuple fixes only the two outer levels; the result is a Series
# indexed by the remaining level, `id_student`.
module_averages['sum_click'].loc[('BBB', '2014B')]

id_student
25629      2.285714
25997      1.857143
27891      3.357143
50069      2.540670
52426      2.193548
             ...   
2677541    2.586957
2681783    1.857143
2683697    1.750000
2687378    2.576754
2690136    2.000000
Name: sum_click, Length: 1294, dtype: float64

## Flattening a MultiIndex

In [17]:
# Turn the three index levels back into ordinary columns, leaving a default
# integer index behind.
flat_module_averages = module_averages.reset_index(drop=False)

flat_module_averages.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,11391,546697.668367,102.132653,4.765306
1,AAA,2013J,28400,546704.453488,86.993023,3.337209
2,AAA,2013J,30268,546721.092105,2.355263,3.697368
3,AAA,2013J,31604,546712.515837,106.147813,3.254902
4,AAA,2013J,32885,546721.215909,91.934659,2.9375


## Full Gist

In [18]:
# End-to-end recap: group, aggregate two ways, join, relabel, flatten.
index_cols = ['code_module', 'code_presentation', 'id_student']
student_groups = student_vle.groupby(by=index_cols)

# Row counts and column means for every (module, presentation, student) group.
activity_counts = student_groups.count()
average_clicks = student_groups.mean()

# Both Series carry the same three-level index, so join on those level names.
activities_and_clicks = pd.merge(
    left=activity_counts['id_site'],
    right=average_clicks['sum_click'],
    on=index_cols,
    how='inner',
)

activities_and_clicks.columns = ['Total Activities Engaged',
                                 'Average Clicks per Activity']

# Move the grouping levels out of the index and into regular columns.
activities_and_clicks = activities_and_clicks.reset_index()

activities_and_clicks.head()


Unnamed: 0,code_module,code_presentation,id_student,Total Activities Engaged,Average Clicks per Activity
0,AAA,2013J,11391,196,4.765306
1,AAA,2013J,28400,430,3.337209
2,AAA,2013J,30268,76,3.697368
3,AAA,2013J,31604,663,3.254902
4,AAA,2013J,32885,352,2.9375
