# Analysis of Batting Scores in T20 Cricket Results

# 1. Data Extraction  

## Load some random file and display the data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, glob

# Main idea
I want to extract the following from every match:

1. match_date
2. team1
3. score1
4. wkt1
5. team2
6. score2
7. wkt2
8. winner 

Since the data is not as clean as I would have liked, I first have to figure out how to clean it and extract the information I want. The first thing I noticed was that the number of *info* lines at the beginning of each csv file is inconsistent. The solution was to open each file separately, read the lines that start with *info*, and record the index at which the info section ends. That index is used later to skip the rows before the ball-by-ball data. So, the **read_info** function reads a file and returns the info part as a dictionary, together with the index of the end of the info lines.

In [3]:
# Directory containing the match CSV files that the code in this notebook reads.
database_dir = '../dataset/csv/t20_csv_male/'
def read_info(fil):
    """Read the leading ``info`` rows of a match CSV file.

    The first line of the file is always skipped.  Every following line is
    consumed for as long as it starts with ``info``; reading stops at the
    first line that does not.

    Parameters
    ----------
    fil : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(info, c)``.  ``info`` maps the key of each info row to its value
        (when a key is repeated the last value wins) and additionally holds
        the list of team names under ``'teams'``.  ``c`` is the number of
        lines before the first non-info line (the skipped first line plus
        all info lines), suitable for use as ``skiprows``.
    """
    # Local import so the function does not depend on the file-level imports.
    import csv

    info = {}
    teams = []
    c = 1  # the first line is skipped without being inspected

    with open(fil, 'r') as f:
        next(f, None)
        for line in f:
            if not line.startswith('info'):
                break
            c += 1

            # Parse with the csv module so that quoted values containing
            # commas (e.g. "Ground, City") are kept in one piece.
            fields = next(csv.reader([line]), [])[1:]
            if len(fields) < 2:
                # An info row without a key/value pair carries no data, but
                # it still counts towards the number of lines to skip.
                continue

            key, value = fields[0], fields[1]
            if key == 'team':
                teams.append(value)
            else:
                info[key] = value

    info['teams'] = teams
    return (info, c)


# Try read_info on a single match file and show what it returns:
# the info dictionary and the number of leading lines before the ball rows.
f = '343764.csv'
fil=os.path.join(database_dir,  f )
(info, c) = read_info(fil)
print (info, c)


{'gender': 'male', 'season': '2007/08', 'date': '2008/04/20', 'venue': 'National Stadium', 'city': 'Karachi', 'toss_winner': 'Pakistan', 'toss_decision': 'bat', 'player_of_match': 'Misbah-ul-Haq', 'umpire': 'Zameer Haider', 'reserve_umpire': 'Khalid Mahmood', 'tv_umpire': 'Saleem Badar', 'match_referee': 'MJ Procter', 'winner': 'Pakistan', 'winner_runs': '102', 'teams': ['Pakistan', 'Bangladesh']} 18


The next step is to extract the data I need from the scoreboard. I need the following from each match: **match_date, team1, score1, wkt1, team2, score2, wkt2, winner**. match_date and winner are retrieved from the info dictionary, whereas the scores and teams are read easily with some pandas functions. Finding the total wickets lost was a little more difficult, as I had to count the number of cells in the wkt_type column which weren't *NaN*. For that I took the total length of each innings in terms of balls and subtracted the number of *NaN* values. Maybe there is a better way to do it, but at the moment I don't know of one, and this method works just fine. One more thing I did was to record *NaN* as the winner in cases where there was no winner, possibly an abandoned game. These rows can easily be removed in the analysis part using **df = df.dropna()**.

After reading this information from all the files, I save it as a **csv** file.



In [4]:
def _innings_summary(df, innings):
    """Return ``(runs, wickets)`` for one innings of a ball-by-ball frame."""
    balls = df[df['innings'] == innings]
    # Sum only the 'runs' column; summing the whole frame would also try to
    # add up the string columns.
    # NOTE(review): 'extras' is not added to the score, as in the original
    # code -- confirm whether the team total should include it.
    runs = balls['runs'].sum()
    # A wicket fell on every ball whose 'wkt_type' cell is not NaN.
    wickets = balls['wkt_type'].notna().sum()
    return runs, wickets


def get_data(savename='data_full.csv'):
    """Summarise every match file in ``database_dir`` and save the result.

    For each ``*.csv`` file the info rows are read with ``read_info`` and the
    remaining rows are loaded with pandas.  One output row is produced per
    file, holding the match date, both teams with their innings-1 and
    innings-2 runs and wickets, the toss winner and decision, and the winner
    (the string ``'NaN'`` when the info rows contain no winner).  The rows
    are sorted by date and written to ``savename`` without the index.

    Parameters
    ----------
    savename : str
        Path of the CSV file to write.

    Raises
    ------
    KeyError, IndexError
        If a file lacks the ``date``, ``toss_winner`` or ``toss_decision``
        info rows, or lists fewer than two teams.
    """
    # The ball rows carry no header line, so the names are supplied here.
    col_names = ['ball', 'innings', 'overs', 'team', 'batsman1', 'batsman2',
                 'bowler', 'runs', 'extras', 'wkt_type', 'wkt_batsman']
    out_columns = ['Date',
                   'Team1', 'Score1', 'Wkt1',
                   'Team2', 'Score2', 'Wkt2',
                   'Toss Winner', 'Toss Decision',
                   'Winner']

    rows = []
    # sorted() makes the processing order reproducible across runs.
    for fil in sorted(glob.glob(os.path.join(database_dir, '*.csv'))):
        # indx is the number of leading lines to skip before the ball rows.
        (info, indx) = read_info(fil)
        df = pd.read_csv(fil, skiprows=indx, header=None, names=col_names)

        score1, wkt1 = _innings_summary(df, 1)
        # The second-innings score must come from the innings-2 rows.
        score2, wkt2 = _innings_summary(df, 2)

        rows.append({
            'Date': info['date'],
            'Team1': info['teams'][0], 'Score1': score1, 'Wkt1': wkt1,
            'Team2': info['teams'][1], 'Score2': score2, 'Wkt2': wkt2,
            'Toss Winner': info['toss_winner'],
            'Toss Decision': info['toss_decision'],
            # Matches without a winner row get the placeholder 'NaN'.
            'Winner': info.get('winner', 'NaN'),
        })

    dff = pd.DataFrame(rows, columns=out_columns)

    dff["Date"] = pd.to_datetime(dff["Date"])
    # A stable sort keeps matches played on the same date in file order.
    dff = dff.sort_values(by="Date", kind="mergesort")
    dff.to_csv(savename, index=False)

    print ('data saved as file:',savename)

# Build the per-match summary from all files and write it to data_full.csv.
get_data(savename='data_full.csv')

data saved as file: data_full.csv
