# Data Extraction  

## Load some random file and display the data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob

# Root folder for the data; the input CSV folder and the output CSV path are built from it.
data_dir= './datasets/'

# Main idea
I want to extract the following from every match:

1. match_date
2. team1
3. score1
4. wkt1
5. team2
6. score2
7. wkt2
8. winner 

Since the data is not as clean as I'd have liked it to be, I have to figure out how to clean it and extract the information I want. The first thing I noticed was that the number of *info* lines at the beginning of each csv file is inconsistent. The solution was to open each file separately, read the lines that start with *info*, and keep the index of the line where the info section ends. That index is used later to skip the rows before the ball-by-ball data. So the **read_info** function reads a file and returns the info part as a dictionary, together with the index of the line where the info section ends.

In [2]:
# Folder that holds the per-match CSV files read by the cells below.
database_dir = data_dir+'csv/t20_csv_male/'
def read_info(fil):
    """Read the leading ``info`` rows of a match CSV file.

    The first line of the file is skipped. Every following line is consumed
    for as long as it starts with ``info``; reading stops at the first line
    that does not.

    Each info line is parsed as a CSV record, so a quoted value that contains
    a comma stays in one piece. The field after ``info`` is the key and the
    next field is the value. Rows whose key is exactly ``team`` are collected,
    in file order, into a list. If a key occurs more than once, the last value
    wins. Rows that carry no value are counted but not stored.

    Parameters
    ----------
    fil : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(info, heading_line)``. ``info`` is a dict of the parsed key/value
        pairs plus a ``'teams'`` entry holding the list of team names.
        ``heading_line`` is the number of lines that precede the first
        non-info line (the skipped first line plus every info line).
    """
    import csv  # local import: only this function needs the CSV parser

    info = {}
    teams = []
    heading_line = 1  # the first line of the file is always skipped

    with open(fil, 'r') as handle:
        next(handle, None)  # drop the first line; tolerate an empty file
        for line in handle:
            if line[:4] != 'info':
                break
            heading_line += 1

            # Parse the single line as CSV so quoted commas are respected,
            # then discard the leading 'info' marker.
            fields = next(csv.reader([line]), [])[1:]
            if len(fields) < 2:
                continue  # key without a value: nothing to store

            key, value = fields[0], fields[1]
            if key == 'team':
                teams.append(value)
            else:
                info[key] = value

    info['teams'] = teams
    return (info, heading_line)


# Try read_info on a single match file and print the parsed info dictionary
# together with the line count it returns.
f = '343764.csv'
fil=os.path.join(database_dir,  f )
(info, heading_line) = read_info(fil)
print (info, heading_line)


{'gender': 'male', 'season': '2007/08', 'date': '2008/04/20', 'venue': 'National Stadium', 'city': 'Karachi', 'toss_winner': 'Pakistan', 'toss_decision': 'bat', 'player_of_match': 'Misbah-ul-Haq', 'umpire': 'Zameer Haider', 'reserve_umpire': 'Khalid Mahmood', 'tv_umpire': 'Saleem Badar', 'match_referee': 'MJ Procter', 'winner': 'Pakistan', 'winner_runs': '102', 'teams': ['Pakistan', 'Bangladesh']} 18


The next step is to extract the data I need from the scoreboard. I need the following from each match: ``match_date``, ``team1``, ``score1``, ``wkt1``, ``team2``, ``score2``, ``wkt2``, ``winner``. The match date and the winner are retrieved from the info dictionary, whereas the scores and wickets are computed from the ball-by-ball rows with a few pandas operations.

One more thing I did was to record the winner as *NaN* when a match has no winner (possibly an abandoned game). These rows are kept in the saved file anyway; once the CSV is read back with pandas they can easily be removed in the analysis part using ``df = df.dropna()``.

After extracting this information from all the files, I save the result as a **csv** file.



In [3]:
# Load the ball-by-ball rows of one match file.
f = '343764.csv'
fil=os.path.join(database_dir,  f )
# indx is the number of header lines (first line + info lines) to skip.
(info, indx) = read_info(fil)
#print (info, indx)

# The data rows carry no header line, so the column names are supplied here.
col_names=['ball', 'innings', 'overs', 'team', 'batsman1', 'batsman2', \
                   'bowler', 'runs', 'extras', 'wkt_type', 'wkt_batsman']
df=pd.read_csv(fil, skiprows=indx, header=None, names=col_names)
# Show the last five deliveries of each innings.
display( df[df.innings==1].tail(5) )
display( df[df.innings==2].tail(5) )

Unnamed: 0,ball,innings,overs,team,batsman1,batsman2,bowler,runs,extras,wkt_type,wkt_batsman
118,ball,1,19.2,Pakistan,Misbah-ul-Haq,Fawad Alam,Shahadat Hossain,6,0,,
119,ball,1,19.3,Pakistan,Misbah-ul-Haq,Fawad Alam,Shahadat Hossain,1,0,,
120,ball,1,19.4,Pakistan,Fawad Alam,Misbah-ul-Haq,Shahadat Hossain,4,0,,
121,ball,1,19.5,Pakistan,Fawad Alam,Misbah-ul-Haq,Shahadat Hossain,6,0,,
122,ball,1,19.6,Pakistan,Fawad Alam,Misbah-ul-Haq,Shahadat Hossain,3,0,,


Unnamed: 0,ball,innings,overs,team,batsman1,batsman2,bowler,runs,extras,wkt_type,wkt_batsman
223,ball,2,15.3,Bangladesh,Abdur Razzak,Mashrafe Mortaza,Mansoor Amjad,0,1,,
224,ball,2,15.4,Bangladesh,Abdur Razzak,Mashrafe Mortaza,Mansoor Amjad,0,0,,
225,ball,2,15.5,Bangladesh,Abdur Razzak,Mashrafe Mortaza,Mansoor Amjad,1,0,,
226,ball,2,15.6,Bangladesh,Mashrafe Mortaza,Abdur Razzak,Mansoor Amjad,0,0,caught,Mashrafe Mortaza
227,ball,2,15.7,Bangladesh,Shahadat Hossain,Abdur Razzak,Mansoor Amjad,0,0,bowled,Shahadat Hossain


In [4]:
def get_score_at_overs(df, over=20, over_min=0):
    """Total score and wickets of both innings inside an over window.

    Only deliveries with ``over_min <= overs < over`` are considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Ball-by-ball rows with the columns ``innings``, ``overs``, ``runs``,
        ``extras`` and ``wkt_type``.
    over : number, optional
        Exclusive upper bound on the ``overs`` value.
    over_min : number, optional
        Inclusive lower bound on the ``overs`` value.

    Returns
    -------
    tuple
        ``(score1, wkts1, score2, wkts2)``. A score is the sum of ``runs``
        and ``extras``; wickets are the number of non-null ``wkt_type``
        entries. The suffix is the innings number.
    """
    # The over window is the same for both innings, so build it once.
    in_window = (df.overs >= over_min) & (df.overs < over)

    totals = []
    for innings_no in (1, 2):
        balls = df[(df.innings == innings_no) & in_window]
        totals.append(balls.runs.sum() + balls.extras.sum())
        totals.append(balls.wkt_type.count())

    return tuple(totals)

# Score and wickets of both innings for deliveries with 6 <= overs < 10
# in the match loaded above.
get_score_at_overs(df, over=10, over_min=6)

(30, 1, 27, 1)

In [5]:
def get_data():
    """Build one summary row per match file and save the table as a CSV.

    Every ``*.csv`` file in ``database_dir`` is read: the info header through
    ``read_info`` and the ball-by-ball rows through ``pandas.read_csv``.
    For each match the row holds the date, both teams with their total score
    (runs + extras) and wickets (non-null ``wkt_type`` entries), the toss
    winner and decision, and the winner. Matches without a ``winner`` entry
    are kept, with the string ``'NaN'`` as winner.

    ``team1`` is the side that batted in innings 1, so that it lines up with
    ``score1``/``wkt1``; ``team2`` is the other side.

    The rows are sorted by date and written to ``data_dir + 'data_full.csv'``.
    Nothing is returned.
    """
    # The data rows have no header line; the names are the same for every file.
    col_names = ['ball', 'innings', 'overs', 'team', 'batsman1', 'batsman2',
                 'bowler', 'runs', 'extras', 'wkt_type', 'wkt_batsman']

    match_data = []
    for fil in glob.glob(database_dir+'*.csv'):
        (info, indx) = read_info(fil)
        df = pd.read_csv(fil, skiprows=indx, header=None, names=col_names)

        first_innings = df[df.innings == 1]
        second_innings = df[df.innings == 2]

        # score1 is the first-innings total, so team1 has to be the side that
        # batted first. The order of the team rows in the info header is not
        # relied upon for that: swap when the ball data says the side listed
        # second batted first.
        team1, team2 = info['teams'][0], info['teams'][1]
        if not first_innings.empty and first_innings['team'].iloc[0] == team2:
            team1, team2 = team2, team1

        score1 = first_innings.runs.sum() + first_innings.extras.sum()
        wkts1 = first_innings.wkt_type.count()

        score2 = second_innings.runs.sum() + second_innings.extras.sum()
        wkts2 = second_innings.wkt_type.count()

        # No-result matches are kept and marked instead of being dropped.
        if 'winner' not in info:
            info['winner'] = 'NaN'

        winner = info['winner']
        match_date = info['date']
        toss_winner = info['toss_winner']
        toss_decision = info['toss_decision']

        match_data.append([match_date, team1, score1, wkts1, team2, score2, wkts2,
                           toss_winner, toss_decision, winner])

    dff = pd.DataFrame(match_data, columns=['date', 'team1', 'score1', 'wkt1', 'team2', 'score2', 'wkt2',
                                            'toss_winner', 'toss_decision', 'winner'])

    dff["date"] = pd.to_datetime(dff["date"])
    dff = dff.sort_values(by="date")
    dff.to_csv(data_dir+'data_full.csv', index=False)

    print ('saved the data file.')

# Build the per-match summary and write it to data_dir + 'data_full.csv'.
get_data()

saved the data file.


In [7]:
def get_data2():
    """Build a per-match summary with first-innings checkpoints and save it.

    Every ``*.csv`` file in ``database_dir`` is read. Matches whose info
    header has no ``winner`` entry are skipped. For the rest, the row holds:
    the date, both teams, the full score and wickets of both innings, the
    first-innings score and wickets for deliveries with ``overs`` below 6,
    10 and 15, the toss winner and decision, and the winner. All scores come
    from ``get_score_at_overs``.

    ``team1`` is the side that batted in innings 1, so that it lines up with
    the ``score1*``/``wkt*1*`` columns; ``team2`` is the other side.

    The rows are sorted by date and written to ``data_dir + 'data_full.csv'``,
    which is the same path ``get_data`` writes to, so whichever of the two
    runs last determines the content of that file. Nothing is returned.
    """
    # The data rows have no header line; the names are the same for every file.
    col_names = ['ball', 'innings', 'overs', 'team', 'batsman1', 'batsman2',
                 'bowler', 'runs', 'extras', 'wkt_type', 'wkt_batsman']

    match_data = []
    for fil in glob.glob(database_dir+'*.csv'):
        (info, indx) = read_info(fil)
        if 'winner' not in info:
            continue  # we are dropping the No-result matches
        df = pd.read_csv(fil, skiprows=indx, header=None, names=col_names)

        # score1 is the first-innings total, so team1 has to be the side that
        # batted first. The order of the team rows in the info header is not
        # relied upon for that: swap when the ball data says the side listed
        # second batted first.
        team1, team2 = info['teams']
        batting_first = df.loc[df.innings == 1, 'team']
        if not batting_first.empty and batting_first.iloc[0] == team2:
            team1, team2 = team2, team1

        score1, wkts1, score2, wkts2 = get_score_at_overs(df, over=20, over_min=0)
        # Only the first-innings half of each checkpoint result is stored.
        score1_at6, wkts1_at6 = get_score_at_overs(df, over=6., over_min=0)[:2]
        score1_at10, wkts1_at10 = get_score_at_overs(df, over=10., over_min=0)[:2]
        score1_at15, wkts1_at15 = get_score_at_overs(df, over=15., over_min=0)[:2]

        winner = info['winner']
        match_date = info['date']
        toss_winner = info['toss_winner']
        toss_decision = info['toss_decision']

        match_data.append([match_date, team1, team2, score1, wkts1, score2, wkts2,
                           score1_at6, wkts1_at6, score1_at10, wkts1_at10,
                           score1_at15, wkts1_at15, toss_winner, toss_decision, winner])

    dff = pd.DataFrame(match_data, columns=['date', 'team1', 'team2', 'score1', 'wkt1', 'score2', 'wkt2',
                                            'score1_at6', 'wkts1_at6', 'score1_at10', 'wkts1_at10',
                                            'score1_at15', 'wkts1_at15', 'toss_winner', 'toss_decision', 'winner'])

    dff["date"] = pd.to_datetime(dff["date"])
    dff = dff.sort_values(by="date")
    dff.to_csv(data_dir+'data_full.csv', index=False)

    print ('saved the data file.')

# Build the summary with first-innings checkpoints and write it to data_dir + 'data_full.csv'.
get_data2()

saved the data file.
