In [39]:
import collections
import datetime
import itertools as it
import math
import numpy as np
import pandas as pd
import wrapt
import load_cric_data
reload(load_cric_data)

<module 'load_cric_data' from 'load_cric_data.pyc'>

In [164]:
def split_id_outcome(deliveries):
    """Splits an iterable of deliveries into the [delivery_ids], [outcomes]

    Each delivery is a single-entry map { delivery_id => outcome }; only the
    first item of each map is used.

    Returns a two-element list ``[delivery_ids, outcomes]`` holding two
    equal-length tuples, so the result can be unpacked as
    ``ids, outcomes = split_id_outcome(...)``. An empty iterable yields two
    empty tuples, which keeps that unpacking valid.
    """
    # next(iter(...)) picks the first (k, v) pair without building the whole
    # item list, and works on Python 2 and Python 3 alike (on Python 3
    # dict.items() is a view and cannot be indexed).
    id_outcome_tuples = [next(iter(d.items())) for d in deliveries]
    if not id_outcome_tuples:
        return [(), ()]

    # id_outcome_tuples is essentially a pair of lists zipped together into a
    # list of pairs.
    #
    # zip is its own inverse. i.e. we can unzip a list of pairs using zip
    return list(zip(*id_outcome_tuples))


def heirarchical_delivery_index(delivery_ids):
    """Given a list of delivery ids of the form Over.ball, converts it into a
    2-level pandas Index with overs and balls.

    The integral part of each id becomes the 'over' level and the fractional
    part, rounded to one decimal, becomes the 'ball' level (so 4.3 is indexed
    as (4.0, 0.3)). An empty list of ids produces an empty 2-level index.
    """
    overs, balls = [], []
    for did in delivery_ids:
        # math.modf returns (fractional, integral); the fractional part
        # carries float noise (e.g. 0.2999...), hence the rounding.
        ball_part, over_part = math.modf(did)
        overs.append(over_part)
        balls.append(round(ball_part, 1))

    # construct the 2d index
    return pd.MultiIndex.from_arrays([np.array(overs), np.array(balls)],
                                     names=['over', 'ball'])


def pick_winner(pick_from):
    """Return the value stored under the 'winner' key of ``pick_from``."""
    winner = pick_from['winner']
    return winner


def col_contains(value):
    """Build a predicate that reports whether ``value`` is in a given cell.

    The returned callable takes one container (e.g. the list held in a
    single DataFrame cell) and returns ``value in container``.
    """
    return lambda cell_list: value in cell_list


class BaseFrame(pd.DataFrame):
    """DataFrame subclass whose derived frames keep the subclass type."""

    # Overriding the DataFrame constructor so that new instances
    # derived from this class take the type of the subclass
    @property
    def _constructor(self):
        return self.__class__

    # Support the propagation of attributes across data frames
    # from: https://github.com/pandas-dev/pandas/issues/2485#issuecomment-174577149
    #
    # NOTE(review): _combine_const is a private pandas hook; confirm it still
    # exists in the pandas version this notebook runs against.
    def _combine_const(self, other, *args, **kwargs):
        # super() must name this class; the snippet's original class name
        # (MyDataFrame) is not defined here and raised NameError when called.
        return super(BaseFrame, self)._combine_const(other, *args, **kwargs).__finalize__(self)
    
    
class DeliveriesFrame(BaseFrame):
    """Frame type for a table of deliveries.

    Adds no behaviour of its own; it only gives delivery data a distinct
    type on top of BaseFrame.
    """

    
class MatchesFrame(BaseFrame):
    """DataFrame subclass (via BaseFrame) for a number of matches.

    Provides helpers specific to manipulating cricket match data.
    """
    def filter_team(self, team):
        """Boolean Series, True where the row's ``teams`` cell contains ``team``.

        This is a mask, not a filtered frame; index the frame with it.
        """
        return self.teams.apply(col_contains(team))

    def filter_umpire(self, umpire):
        """Boolean Series, True where the row's ``umpires`` cell contains ``umpire``.

        This is a mask, not a filtered frame; index the frame with it.
        """
        return self.umpires.apply(col_contains(umpire))

    def won_matches(self):
        """Matches whose ``outcome`` cell contains a 'winner' entry."""
        df = self
        won_matches = df[df.outcome.apply(lambda oc: 'winner' in oc)]
        return MatchesFrame(won_matches)

    def toss_winner_won(self):
        """Won matches in which the toss winner is also the match winner."""
        df = self.won_matches()
        toss_winner_won = df[df.toss.apply(pick_winner) == df.outcome.apply(pick_winner)]
        return MatchesFrame(toss_winner_won)

    def team_names(self):
        """List of teams who have at least one match in the matches.

        Returned as a 1-d numpy array, in order of first appearance.
        """
        # an OrderedDict de-duplicates while preserving first-seen order
        all_teams = [(t, True) for tpair in self.teams for t in tpair]
        team_names = collections.OrderedDict(all_teams)
        # list(...) is required on Python 3, where keys() is a view and
        # np.array would wrap it as a single 0-d object instead of an array
        return np.array(list(team_names))

    def team_innings(self, team_name):
        """Returns a series of innings of the team_name batting.

        For each match the first innings is used if ``team_name`` batted in
        it, otherwise the second; matches where neither applies get NaN.

        NOTE(review): relies on each innings frame carrying
        ``attrs['batting']``; nothing in this notebook section sets it, so
        confirm where it is populated.
        """
        def batted(inning):
            # a match lacking this innings holds NaN in the cell, not a frame
            return isinstance(inning, pd.DataFrame) and inning.attrs['batting'] == team_name

        team_innings = [inn1 if batted(inn1)
                        else inn2 if batted(inn2)
                        else np.nan
                        for (inn1, inn2) in zip(self['1st innings_frame'],
                                                self['2nd innings_frame'])]
        return pd.Series(team_innings, index=self.index)

                        
def inning_summary(inning_frame):
    """Takes an inning data frame and produces a "runs/wickets" summary.

    The summary is read from the last row's 'rt_cum' (cumulative runs) and
    'w_cum' (cumulative wickets) columns. A missing column is shown as '-',
    and an empty frame is summarised as "-".
    """
    if not len(inning_frame):
        return "-"

    last_ball = inning_frame.iloc[-1]
    columns = inning_frame.columns
    # Test for the column that is actually read ('rt_cum', not 'rt'), and
    # format each side separately so the '-' placeholder never reaches %d.
    runs = "%d" % last_ball['rt_cum'] if 'rt_cum' in columns else '-'
    wickets = "%s" % last_ball['w_cum'] if 'w_cum' in columns else '-'
    return "%s/%s" % (runs, wickets)
    

def create_innings_dataframe(innings):
    """Convert one cricsheet innings into a DeliveriesFrame.

    Rows are the innings' deliveries, indexed by the 2-level (over, ball)
    index from heirarchical_delivery_index. Columns are the flattened
    delivery outcomes, plus 'rt_cum' (running total of 'rt', only when 'rt'
    is present) and 'w_cum' (running count of rows with a non-null 'wpo').
    """
    deliveries = load_cric_data.pick_deliveries(innings)
    delivery_ids, outcomes = split_id_outcome(deliveries)

    # (over, ball) index first, then one flat record per delivery outcome
    delivery_index = heirarchical_delivery_index(delivery_ids)
    records = [load_cric_data.flat_delivery_outcome(outcome) for outcome in outcomes]
    frame = pd.DataFrame(records, index=delivery_index)

    if 'rt' in frame.columns:
        # cumulative run counter
        frame['rt_cum'] = frame['rt'].cumsum()

    # 1 on every delivery where a player was out, 0 elsewhere
    if 'wpo' in frame.columns:
        wicket_fell = frame['wpo'].notnull().astype('int64')
    else:
        wicket_fell = 0
    frame['w_cum'] = wicket_fell
    frame['w_cum'] = frame['w_cum'].cumsum()

    return DeliveriesFrame(frame)


def add_inning_to_match_info(innings, match_info):
    """Add one innings' frame and summary to ``match_info`` (mutated in place).

    ``innings`` is a single-entry map { inning_name => inning_data }. Three
    keys are written to ``match_info``:
      * the batting team's name (``inning_data['team']``) -> the innings frame
      * ``inning_name``                                   -> its summary
      * ``inning_name + '_frame'``                        -> the same frame

    Returns the same ``match_info`` object.
    """
    # next(iter(...)) works on Python 2 and 3; on Python 3 dict.items() is a
    # view and cannot be indexed with [0]
    inning_name, inning_data = next(iter(innings.items()))

    inning_frame = create_innings_dataframe(inning_data)
    match_info[inning_data['team']] = inning_frame
    match_info[inning_name] = inning_summary(inning_frame)
    match_info[inning_name+'_frame'] = inning_frame
    return match_info


def partial_reorder_columns(df, column_order=None):
    """Return ``df``'s column names with the preferred ones moved to the front.

    Columns named in ``column_order`` lead the result, in that order; every
    other column keeps its original relative order after them. Names in
    ``column_order`` that ``df`` does not have are skipped, so a frame
    lacking an optional column can still be reordered.

    ``df`` itself is not modified; only a list of column names is returned.
    """
    if column_order is None:
        column_order = ['dates', 'teams', 'venue', 'overs', 'umpires',
                        'match_type', 'neutral_venue', 'gender',  'city',
                        'toss', 'supersubs', 'outcome', 'player_of_match']

    reordered_cols = list(df.columns)
    # next slot to fill; only advances for columns that exist, so a skipped
    # name does not leave a gap in the leading positions
    new_pos = 0
    for col_name in column_order:
        if col_name not in reordered_cols:
            continue
        # find the current position of the column
        curr_pos = reordered_cols.index(col_name)
        # pop it out of that position and reinsert it in the new position
        reordered_cols.insert(new_pos, reordered_cols.pop(curr_pos))
        new_pos += 1

    return reordered_cols
        

def create_match_summaries_frame(match_summaries):
    """Convert a series of match summaries into a data frame.

    Each match contributes one row built from its match info, with 'dates'
    reduced to the first listed date (as a pandas timestamp) and with the
    per-innings entries written by add_inning_to_match_info. Columns are
    ordered by partial_reorder_columns.
    """
    match_infos = []
    for match in match_summaries:
        match_info = load_cric_data.pick_match_info(match)
        # only the first listed date of the match is kept
        match_info['dates'] = pd.to_datetime(match_info['dates'][0])

        for inning in load_cric_data.pick_innings(match):
            add_inning_to_match_info(inning, match_info)

        match_infos.append(match_info)

    df = pd.DataFrame(match_infos)
    # reindex the dataframe with a column order
    # (reindex(columns=...) replaces the deprecated, since-removed reindex_axis)
    df = df.reindex(columns=partial_reorder_columns(df))

    # TODO(kochhar): figure out how to index properly with dates
    # df.set_index('dates', inplace=True)
    return df

In [165]:
# Read the match summaries found under data/t20s and build the matches frame
# from only the first 100 of them (islice), which keeps this cell quick to run.
summaries = load_cric_data.match_summaries_from_dir('data/t20s')
matches_df = create_match_summaries_frame(it.islice(summaries, 100))

Read file 100 of 730

In [166]:
# Keep only the rows whose gender is 'male', ordered by match date
matches = MatchesFrame(
    matches_df[matches_df.gender == 'male']
).sort_values('dates')

In [174]:
# inn1.loc[inn1.wpo.notnull()]
# Final runs / wickets of India's innings, taken from the last delivery of
# each innings frame; matches without India get NaN in both columns.
india_mask = matches.teams.apply(col_contains('India'))
india_innings = matches.loc[india_mask, 'India']
matches['India.RT'] = india_innings.apply(lambda inning: inning.iloc[-1]['rt_cum'])
matches['India.WT'] = india_innings.apply(lambda inning: inning.iloc[-1]['w_cum'])

# Slice AFTER adding the columns so the displayed frame includes them on a
# fresh Restart & Run All, not only when this cell is run a second time.
indiam = matches[india_mask]
indiam.head(10)


Unnamed: 0,dates,teams,venue,overs,umpires,match_type,neutral_venue,gender,city,toss,...,Papua New Guinea,Scotland,South Africa,Sri Lanka,United Arab Emirates,West Indies,Zimbabwe,bowl_out,India.RT,India.WT
66,2006-12-01,"[South Africa, India]",New Wanderers Stadium,20,"[IL Howell, BG Jerling]",T20,,male,Johannesburg,"{u'decision': u'bat', u'winner': u'South Africa'}",...,,,batsman bowler ...,,,,,,127.0,4.0
77,2007-09-14,"[India, Pakistan]",Kingsmead,20,"[BR Doctrove, SJA Taufel]",T20,1.0,male,Durban,"{u'decision': u'field', u'winner': u'Pakistan'}",...,,,,,,,,"[{u'bowler': u'V Sehwag', u'outcome': u'hit'},...",141.0,9.0
80,2007-09-16,"[India, New Zealand]",New Wanderers Stadium,20,"[MR Benson, NJ Llong]",T20,1.0,male,Johannesburg,"{u'decision': u'field', u'winner': u'India'}",...,,,,,,,,,180.0,9.0
88,2007-09-19,"[England, India]",Kingsmead,20,"[BR Doctrove, SJA Taufel]",T20,1.0,male,Durban,"{u'decision': u'bat', u'winner': u'India'}",...,,,,,,,,,218.0,4.0
91,2007-09-20,"[South Africa, India]",Kingsmead,20,"[BR Doctrove, SJA Taufel]",T20,,male,Durban,"{u'decision': u'bat', u'winner': u'India'}",...,,,batsman bowler ...,,,,,,153.0,5.0
93,2007-09-22,"[Australia, India]",Kingsmead,20,"[Asad Rauf, MR Benson]",T20,1.0,male,Durban,"{u'decision': u'bat', u'winner': u'India'}",...,,,,,,,,,188.0,5.0
94,2007-09-24,"[India, Pakistan]",New Wanderers Stadium,20,"[MR Benson, SJA Taufel]",T20,1.0,male,Johannesburg,"{u'decision': u'bat', u'winner': u'India'}",...,,,,,,,,,157.0,5.0
98,2007-10-20,"[India, Australia]",Brabourne Stadium,20,"[AM Saheba, SL Shastri]",T20,,male,Mumbai,"{u'decision': u'bat', u'winner': u'Australia'}",...,,,,,,,,,167.0,3.0
96,2008-02-01,"[Australia, India]",Melbourne Cricket Ground,20,"[BNJ Oxenford, SJA Taufel]",T20,,male,,"{u'decision': u'bat', u'winner': u'India'}",...,,,,,,,,,74.0,10.0
4,2016-06-18,"[Zimbabwe, India]",Harare Sports Club,20,"[TJ Matibiri, RB Tiffin]",T20,,male,,"{u'decision': u'field', u'winner': u'India'}",...,,,,,,,batsman bowler non_stri...,,168.0,6.0
