# Summarizing, Aggregating, and Grouping data in Python Pandas

Reference: [Summarising, aggregation, and grouping data in Python Pandas (shanelynn.ie)](https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/)

In [26]:
import pandas as pd
import dateutil

In [27]:
# Location of the phone-usage CSV; the relative path is resolved against the
# notebook's working directory.
DATA_FILE_PATH = "Python Examples/data/phone_data.csv"

In [28]:
# Load the raw phone-usage log from the configured CSV path.
df = pd.read_csv(DATA_FILE_PATH)
# Parse the day-first date strings with pandas' vectorised parser instead of a
# per-row dateutil call. This also avoids relying on `dateutil.parser` having
# been imported implicitly (the notebook only does `import dateutil`).
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
# Preview the first rows to confirm the columns and the parsed dates.
df.head()

Unnamed: 0,index,date,duration,item,month,network,network_type
0,0,2014-10-15 06:58:00,34.429,data,2014-11,data,data
1,1,2014-10-15 06:58:00,13.0,call,2014-11,Vodafone,mobile
2,2,2014-10-15 14:46:00,23.0,call,2014-11,Meteor,mobile
3,3,2014-10-15 14:48:00,4.0,call,2014-11,Tesco,mobile
4,4,2014-10-15 17:27:00,4.0,call,2014-11,Tesco,mobile


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,index,duration
count,830.0,830.0
mean,414.5,117.804036
std,239.744656,444.12956
min,0.0,1.0
25%,207.25,1.0
50%,414.5,24.5
75%,621.75,55.0
max,829.0,10528.0


# Basic summary statistics

In [30]:
# How many rows in the dataset?
# NOTE: count() tallies the non-null values of 'item', so it equals the row
# count only when that column has no missing entries.
df['item'].count()

830

In [31]:
# Row count taken from the frame itself, independent of nulls in any one column
len(df)

830

In [32]:
# What was the longest phone call / data entry?
# (largest 'duration' value over every row; no filtering by item type is applied)
df['duration'].max()

10528.0

In [33]:
# How many seconds of phone calls are recorded in total?
# Rows (item == 'call') and the column are selected in one .loc call rather
# than by chained indexing (df[col][mask]).
df.loc[df['item'] == 'call', 'duration'].sum()

92321.0

In [34]:
# How many entries are there for each month?
# value_counts() lists the months from most to least frequent.
df['month'].value_counts()

2014-11    230
2015-01    205
2014-12    157
2015-02    137
2015-03    101
Name: month, dtype: int64

In [35]:
# How many non-null unique network entries?
# nunique() counts distinct values in the 'network' column.
df['network'].nunique()

9

# Summarizing Groups in the DataFrame

In [37]:
# List the group labels produced by grouping on 'month'.
# Group by the column name itself rather than a one-element list: recent pandas
# versions deprecate scalar keys in `.groups` for length-1 list groupers (the
# keys are slated to become 1-tuples), whereas a scalar grouper always yields
# plain string keys.
df.groupby('month').groups.keys()

dict_keys(['2014-11', '2014-12', '2015-01', '2015-02', '2015-03'])

In [38]:
# Number of rows that fall into the '2014-11' group.
# A scalar grouper keeps this plain-string key lookup valid across pandas
# versions; with a one-element list grouper the keys are slated to become tuples.
len(df.groupby('month').groups['2014-11'])

230

In [41]:
# First entry of every month: one output row per distinct 'month' value
df.groupby('month').first()

Unnamed: 0_level_0,index,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-11,0,2014-10-15 06:58:00,34.429,data,data,data
2014-12,228,2014-11-13 06:58:00,34.429,data,data,data
2015-01,381,2014-12-13 06:58:00,34.429,data,data,data
2015-02,577,2015-01-13 06:58:00,34.429,data,data,data
2015-03,729,2015-02-12 20:15:00,69.0,call,landline,landline


In [43]:
# Total of the 'duration' column within each month
df.groupby('month')['duration'].sum()

month
2014-11    26639.441
2014-12    14641.870
2015-01    18223.299
2015-02    15522.299
2015-03    22750.441
Name: duration, dtype: float64

In [44]:
# Number of entries per month, measured as the count of 'date' values in each group
df.groupby('month')['date'].count()

month
2014-11    230
2014-12    157
2015-01    205
2015-02    137
2015-03    101
Name: date, dtype: int64

In [46]:
# Total call duration per network: keep only the 'call' rows, then sum
# 'duration' within each network.
(
    df.loc[df['item'] == 'call']
    .groupby('network')['duration']
    .sum()
)

network
Meteor        7200.0
Tesco        13828.0
Three        36464.0
Vodafone     14621.0
landline     18433.0
voicemail     1775.0
Name: duration, dtype: float64

In [77]:
# Total call duration per network, returned as a flat DataFrame with the
# summed column renamed to 'total_duration'.
# The aggregation is given as the string 'sum': passing the Python builtin
# `sum` to agg() is deprecated in recent pandas, and the string form matches
# the other agg() call in this notebook. Parentheses replace the fragile
# backslash line continuations.
(
    df[df['item'] == 'call']
    .groupby('network', as_index=False)
    .agg({'duration': 'sum'})
    .rename(columns={'duration': 'total_duration'})
)

Unnamed: 0,network,total_duration
0,Meteor,7200.0
1,Tesco,13828.0
2,Three,36464.0
3,Vodafone,14621.0
4,landline,18433.0
5,voicemail,1775.0


In [48]:
# How many calls, sms, and data entries are in each month
df.groupby(['month', 'item'])['date'].count()

month    item
2014-11  call    107
         data     29
         sms      94
2014-12  call     79
         data     30
         sms      48
2015-01  call     88
         data     31
         sms      86
2015-02  call     67
         data     31
         sms      39
2015-03  call     47
         data     29
         sms      25
Name: date, dtype: int64

In [53]:
# How many calls, texts, and data are sent per month,
# split by network type?
# (counts the 'date' values within each month / network_type pair)
df.groupby(['month', 'network_type'])['date'].count()

month    network_type
2014-11  data             29
         landline          5
         mobile          189
         special           1
         voicemail         6
2014-12  data             30
         landline          7
         mobile          108
         voicemail         8
         world             4
2015-01  data             31
         landline         11
         mobile          160
         voicemail         3
2015-02  data             31
         landline          8
         mobile           90
         special           2
         voicemail         6
2015-03  data             29
         landline         11
         mobile           54
         voicemail         4
         world             3
Name: date, dtype: int64

# Groupby output format - Series or DataFrame

In [63]:
# as_index=False keeps 'month' as an ordinary column, so the summed durations
# come back as a DataFrame instead of a month-indexed Series.
df.groupby('month', as_index=False).agg({'duration': 'sum'})

Unnamed: 0,month,duration
0,2014-11,26639.441
1,2014-12,14641.87
2,2015-01,18223.299
3,2015-02,15522.299
4,2015-03,22750.441


# Multiple Statistics per Group