# Grouping rows in Pandas

### Imports

In [1]:
% matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
from StringIO import StringIO

### Reading UFC data from a Google Sheet

In [2]:
url = "https://docs.google.com/spreadsheets/d/1sQOtIwkEiTe4kwwRHw5bwi72ZJHXlAVcCNMImr-Ml6U/export?format=csv&gid=0"

data = requests.get(url).content.decode("utf-8")
df = pd.read_csv(StringIO(data))

df.head(3)

Unnamed: 0,pageurl,eid,mid,event_name,event_org,event_date,event_place,f1pageurl,f2pageurl,f1name,f2name,f1result,f2result,f1fid,f2fid,method,method_d,ref,round,time
0,/events/UFC-1-The-Beginning-7,7,8,UFC 1 - The Beginning,Ultimate Fighting Championship,11/12/1993,"McNichols Arena, Denver, Colorado, United States",/fighter/Royce-Gracie-19,/fighter/Gerard-Gordeau-15,Royce Gracie,Gerard Gordeau,win,loss,19,15,Submission,Rear-Naked Choke,Helio Vigio,1,1:44
1,/events/UFC-1-The-Beginning-7,7,7,UFC 1 - The Beginning,Ultimate Fighting Championship,11/12/1993,"McNichols Arena, Denver, Colorado, United States",/fighter/Jason-DeLucia-22,/fighter/Trent-Jenkins-23,Jason DeLucia,Trent Jenkins,win,loss,22,23,Submission,Rear-Naked Choke,Joao Alberto Barreto,1,0:52
2,/events/UFC-1-The-Beginning-7,7,6,UFC 1 - The Beginning,Ultimate Fighting Championship,11/12/1993,"McNichols Arena, Denver, Colorado, United States",/fighter/Royce-Gracie-19,/fighter/Ken-Shamrock-4,Royce Gracie,Ken Shamrock,win,loss,19,4,Submission,Rear-Naked Choke,Helio Vigio,1,0:57


### Summarizing groups

In [3]:
df.groupby(by="method") #returns a group object which is a dictionary
df.groupby("method").groups  #to access the dictionary
df.groupby("method").groups.keys()

['Draw',
 'No Contest - Overturned by Commission',
 'Submission',
 'No',
 'Technical',
 'TKO',
 'Decision',
 'NC',
 'KO',
 'No Contest - Overturned by State Commission',
 'No Contest - Hoffman Failed Drug Test',
 'No Contest',
 'No Contest - Caceres Failed Drug Test',
 'DQ']

#### Using groupby object methods

The group data can be summarized thanks to various methods (mean, std, var, min, max, sum, cumsum, cumprod, abs etc...)

In [4]:
df.groupby(by="method", as_index = False)["pageurl"].count()

Unnamed: 0,method,pageurl
0,DQ,11
1,Decision,1470
2,Draw,21
3,KO,414
4,NC,26
5,No,6
6,No Contest,2
7,No Contest - Caceres Failed Drug Test,1
8,No Contest - Hoffman Failed Drug Test,1
9,No Contest - Overturned by Commission,1


#### For example we can look at all the victories of Royce Gracie either as f1 or f2

In [5]:
df[((df.f1name == "Royce Gracie") & (df.f1result == "win")) | 
   ((df.f2name == "Royce Gracie") & (df.f2result == "win"))].groupby("method").groups

{'Submission': [0L, 2L, 5L, 8L, 9L, 11L, 15L, 25L, 29L, 32L, 36L]}

In [6]:
df[(df.f1name == "Royce Gracie")].groupby(["method","f1result"]).groups

{('Submission', 'win'): [0L, 2L, 5L, 8L, 9L, 11L, 15L, 25L, 29L, 32L, 36L]}

In [7]:
df[(df.f2name == "Royce Gracie")].groupby(["method","f2result"]).groups

{('Draw', 'draw'): [40L], ('TKO', 'loss'): [548L]}

#### Grouping by multiple columns

In [8]:
gdf = df.groupby(["method","method_d"], as_index = False)
gdf.count().sort_values(["method","event_place"], ascending = False)

Unnamed: 0,method,method_d,pageurl,eid,mid,event_name,event_org,event_date,event_place,f1pageurl,f2pageurl,f1name,f2name,f1result,f2result,f1fid,f2fid,ref,round,time
200,Technical,Submission (Guillotine Choke,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
203,Technical,Submission (Rear-Naked Choke,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,15,16,16
205,Technical,Submission (Triangle Choke,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
196,Technical,Submission (Arm-Triangle Choke,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
197,Technical,Submission (Armbar,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,5,5
198,Technical,Submission (Brabo Choke,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
195,Technical,Submission (Anaconda Choke,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
204,Technical,Submission (Shoulder Choke,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2
193,Technical,Decision (Majority,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
194,Technical,Decision (Unanimous,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


### Aggregation

#### Let's start by formatting the date data and extracting the months

In [9]:
["{0}-{1}".format(x.month,x.year) for x in pd.to_datetime(df.event_date)][:10]

['11-1993',
 '11-1993',
 '11-1993',
 '11-1993',
 '11-1993',
 '11-1993',
 '11-1993',
 '11-1993',
 '3-1994',
 '3-1994']

In [10]:
df["month"] = ["{0}-{1}".format(x.year,x.month) for x in pd.to_datetime(df.event_date)]

#### Now we will convert the time in seconds

In [11]:
import re

def string_to_seconds(string):
    try:
        minutes = int(re.match(r'([0-9]*):([0-9]*)',string).group(1))
        seconds = int(re.match(r'([0-9]*):([0-9]*)',string).group(2))
        return minutes * 60 + seconds
    except:
        return None

df["time_s"] = [string_to_seconds(x) for x in df.time.tolist()]

#### Now let's group the data by month

In [12]:
df.groupby("month").pageurl.count().head()

month
1993-11     8
1994-12    10
1994-3     15
1994-9      6
1995-4     10
Name: pageurl, dtype: int64

#### With the agg function we'll be able to calculate multiple statistics from a single column

In [13]:
aggregations = {"time_s":
                 {"number_of_fights":"count",
                  "total_duration_s":"sum", 
                  "average_duration_s":"mean"},
               "round":["mean", "count"]}

df.groupby("month").agg(aggregations).head()

Unnamed: 0_level_0,round,round,time_s,time_s,time_s
Unnamed: 0_level_1,mean,count,number_of_fights,average_duration_s,total_duration_s
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1993-11,1,8,8,100.625,805
1994-12,1,10,10,257.0,2570
1994-3,1,15,15,212.066667,3181
1994-9,1,6,6,182.5,1095
1995-4,1,10,10,337.6,3376


#### Accessing the data

In [14]:
data = df.groupby("month", as_index = False).agg(aggregations)

print data.columns

data["round"]["mean"].head()

MultiIndex(levels=[[u'round', u'time_s', u'month'], [u'average_duration_s', u'count', u'mean', u'number_of_fights', u'total_duration_s', u'']],
           labels=[[2, 0, 0, 1, 1, 1], [5, 2, 1, 3, 0, 4]])


0    1
1    1
2    1
3    1
4    1
Name: mean, dtype: float64