### Import libraries

In [92]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
import requests
from itertools import chain
from tqdm import tqdm_notebook as tqdm


### Scrape Stats from IPL website

In [95]:
def scrape_stats_for_year(year, timeout=30):
    '''
    Scrape the IPL "player points" table for one season into a DataFrame.
    Every cell value has all of its whitespace removed (so a name such as
    "Shane Watson" is stored as "ShaneWatson"). Three extra columns are
    appended to the ones found in the table header: team_name, nationality
    and year.
    :param year - The season to fetch; interpolated into the stats URL.
    :param timeout - Seconds to wait for the HTTP response before giving up.
    :returns A DataFrame with one row per table row.
    :raises requests.HTTPError - If the server answers with an error status.
    :raises ValueError - If the page contains no table or the table has no rows.
    '''
    URL = f"https://www.iplt20.com/stats/{year}/player-points"
    # Without a timeout a stalled connection would hang the notebook forever.
    page = requests.get(URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise ValueError(f"No stats table found at {URL}")
    trs = table.find_all('tr')
    if not trs:
        raise ValueError(f"Stats table at {URL} has no rows")

    # The first row holds the header cells; the remaining rows hold the data.
    columns = [th.text.strip() for th in trs[0].find_all('th')]
    columns.extend(["team_name", "nationality", "year"])

    data = []
    for index, tr in enumerate(trs[1:]):
        # split() + join drops every whitespace character inside the cell text.
        stat = ["".join(td.text.split()) for td in tr.find_all('td')]
        # The first data row reads its team from class position 2, all later
        # rows from position 1 - presumably the leading row carries one extra
        # CSS class. NOTE(review): verify against the page markup.
        class_index = 1 if index > 0 else 2
        team_name = tr.get('class')[class_index]
        nationality = tr.get('data-nationality')
        stat.extend([team_name, nationality, year])
        data.append(dict(zip(columns, stat)))
    return pd.DataFrame(data, columns=columns)

In [96]:
# Scrape a single season (2008) on its own; d1 is not referenced by any
# other cell visible in this notebook.
d1 = scrape_stats_for_year(2008)

In [98]:
# Scrape every season from 2008 through 2019 (range's upper bound 2020 is
# exclusive), collecting one DataFrame per year.
allData = []
for year in tqdm(range(2008,2020)):
    data = scrape_stats_for_year(year)
    allData.append(data)
# Stack the yearly frames into one table. The row index is not reset, so
# index labels repeat across seasons.
stats = pd.concat(allData)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [109]:
# The scraped values are all text; convert the numeric columns in one pass.
# Pts becomes float, every other counter becomes int.
stats = stats.astype({
    "Pts": float,
    "Wkts": int,
    "Dots": int,
    "4s": int,
    "6s": int,
    "Catches": int,
    "Stumpings": int,
    "Mat": int,
})

### Aggregated Stats over the years for all players

In [100]:
# Career totals per player. The columns are selected with a list (double
# brackets): subsetting a groupby with a bare tuple of names was deprecated
# and raises in pandas 2.x.
agg_stats = (
    stats
    .groupby(['PLAYER'])[['Pts', 'Mat', 'Wkts', 'Dots', '4s', '6s', 'Catches', 'Stumpings']]
    .sum()
    .reset_index(drop=False)
)

In [110]:
# Summary statistics (count, mean, std, quartiles, max) of the per-player totals.
agg_stats.describe()

Unnamed: 0,Pts,Wkts,Dots,4s,6s,Catches,Stumpings
count,556.0,556.0,556.0,556.0,556.0,556.0,556.0
mean,329.202338,14.154676,112.742806,36.156475,14.584532,9.697842,0.494604
std,473.960443,26.616959,211.853988,80.43935,34.320299,15.54495,2.885498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,120.75,3.0,21.5,5.0,1.0,4.0,0.0
75%,438.375,14.25,112.5,24.0,10.0,11.25,0.0
max,2698.0,170.0,1249.0,527.0,326.0,101.0,38.0


### TOP 10 PLAYERS OVERALL BY POINTS

In [105]:
# Total points per player across all seasons, highest first, top ten only.
(
    stats
    .groupby(['PLAYER'])['Pts']
    .sum()
    .reset_index()
    .sort_values(by='Pts', ascending=False)
    .head(10)
)

Unnamed: 0,PLAYER,Pts
449,ShaneWatson,2698.0
495,SureshRaina,2532.5
106,ChrisGayle,2369.0
171,HarbhajanSingh,2238.5
537,ViratKohli,2119.5
407,RohitSharma,2106.5
152,DwayneBravo,2038.0
357,PiyushChawla,1932.0
394,RavindraJadeja,1924.5
405,RobinUthappa,1923.5


### Writing to a file

In [106]:
# Persist the per-season rows without the index column.
# Assumes the ./data directory already exists.
stats.to_csv("./data/yearly_stats_ipl.csv", index=False)

In [111]:
# Persist the per-player totals without the index column.
# Assumes the ./data directory already exists.
agg_stats.to_csv("./data/sum_agg_stats.csv", index=False)

### Compute sum, mean, and count aggregates for every player

In [129]:
# Sum, mean and row count of every stat per player. The result has two-level
# column labels of the form (stat, aggregate). The columns are selected with
# a list (double brackets): subsetting a groupby with a bare tuple of names
# was deprecated and raises in pandas 2.x.
all_agg_stats = (
    stats
    .groupby(['PLAYER'])[['Pts', 'Mat', 'Wkts', 'Dots', '4s', '6s', 'Catches', 'Stumpings']]
    .agg(['sum', 'mean', 'count'])
    .reset_index(drop=False)
)

In [130]:
# Preview the first rows of the aggregated table.
all_agg_stats.head()

Unnamed: 0_level_0,PLAYER,Pts,Pts,Pts,Mat,Mat,Mat,Wkts,Wkts,Wkts,...,4s,6s,6s,6s,Catches,Catches,Catches,Stumpings,Stumpings,Stumpings
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,count,sum,mean,count,sum,mean,count,...,count,sum,mean,count,sum,mean,count,sum,mean,count
0,ABdeVilliers,1712.5,155.681818,11,139,12.636364,11,0,0.0,11,...,11,200,18.181818,11,80,7.272727,11,7,0.636364,11
1,AakashChopra,22.5,11.25,2,7,3.5,2,0,0.0,2,...,2,0,0.0,2,2,1.0,2,0,0.0,2
2,AaronFinch,751.0,83.444444,9,75,8.333333,9,1,0.111111,9,...,9,67,7.444444,9,24,2.666667,9,0,0.0,9
3,AavishkarSalvi,30.0,30.0,1,2,2.0,1,3,3.0,1,...,1,0,0.0,1,1,1.0,1,0,0.0,1
4,AbdurRazzak,5.0,5.0,1,1,1.0,1,0,0.0,1,...,1,0,0.0,1,0,0.0,1,0,0.0,1


In [131]:
# Collapse each two-level (stat, aggregate) label into a flat "stat_aggregate"
# name. PLAYER has an empty second level, which leaves a trailing "_" that
# the strip removes.
all_agg_stats.columns = [
    '_'.join(levels).strip('_') for levels in all_agg_stats.columns
]

In [136]:
# Columns kept from the flattened aggregate table: the player name, all three
# Pts aggregates (Pts_count doubles as the number of seasons a player has a
# row for), and the sum/mean pair of every other stat.
cols_to_select = ['PLAYER', 'Pts_sum', 'Pts_mean', 'Pts_count'] + [
    f"{stat}_{agg}"
    for stat in ('Mat', 'Wkts', 'Dots', '4s', '6s', 'Catches', 'Stumpings')
    for agg in ('sum', 'mean')
]

In [137]:
# Keep only the chosen columns. Take an explicit copy so that later in-place
# edits of this frame (such as the inplace rename in the next cell) operate on
# an independent object instead of raising SettingWithCopyWarning.
all_agg_stats = all_agg_stats[cols_to_select].copy()

In [140]:
# Pts_count is the number of season rows per player, i.e. years played.
# Rebind the result rather than using inplace=True: all_agg_stats was produced
# by a column selection, and renaming such a frame in place triggers
# SettingWithCopyWarning.
all_agg_stats = all_agg_stats.rename(columns={"Pts_count": "years_played"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [141]:
# Persist the flattened per-player aggregates without the index column.
# Assumes the ./data directory already exists.
all_agg_stats.to_csv("./data/agg_stats.csv", index=False)

In [143]:
# Not used anymore as we got official data on it

def classify_players_on_stats(stats):
    '''
    Naively assign each player a role from their aggregated stats.
    A cricket player can be a Batsman, Wicketkeeper, All rounder or a Bowler.
    Rules, applied in this order:
    WK  - the player has at least one stumping (only a keeper can get one).
    BAT - 4s and 6s are both at or above the 75th percentile of all players.
    BWL - wickets are at or above the 75th percentile and dots at or above
          the median of all players.
    AR  - the player satisfies both the BAT and the BWL condition.
    UNK - none of the above, e.g. a player who has hardly played.
    Possible values -> [BAT, AR, BWL, WK, UNK]
    :param stats - The aggregated stats for each player; must provide the
                   columns PLAYER, Stumpings, 4s, 6s, Wkts and Dots.
    :returns players - A dictionary of all players and their respective positions.
    '''
    players = {}
    # Nothing to classify; also avoids taking percentiles of empty columns.
    if len(stats) == 0:
        return players

    # The thresholds depend only on the whole table, so compute them once
    # instead of once per player.
    fours_cutoff = np.percentile(stats["4s"], 75)
    sixes_cutoff = np.percentile(stats["6s"], 75)
    wickets_cutoff = np.percentile(stats["Wkts"], 75)
    dots_cutoff = np.percentile(stats["Dots"], 50)

    # Walk the rows once rather than filtering the whole frame for each player.
    rows = zip(stats["PLAYER"], stats["Stumpings"], stats["4s"],
               stats["6s"], stats["Wkts"], stats["Dots"])
    for player, stumpings, fours, sixes, wickets, dots in rows:
        # If a name occurs more than once, only its first row decides the role.
        if player in players:
            continue
        if stumpings > 0:
            players[player] = "WK"
            continue
        is_batsman = fours >= fours_cutoff and sixes >= sixes_cutoff
        is_bowler = wickets >= wickets_cutoff and dots >= dots_cutoff
        if is_batsman and is_bowler:
            players[player] = "AR"
        elif is_batsman:
            players[player] = "BAT"
        elif is_bowler:
            players[player] = "BWL"
        else:
            players[player] = "UNK"
    return players