In [2]:
import os
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [36]:
#loading datasets to perform EDA 

df_players = pd.read_csv('csv_datasets\players_2008_2023.csv')
df_players_2008_2019 = pd.read_csv('csv_datasets\most_runs_average_strikerate_2008_2019.csv')
df_highest_run = pd.read_csv('csv_datasets\most_run_season_2008_2023.csv')
df_highest_batting_avg = pd.read_csv('csv_datasets\highest_batting_average_2008_2023.csv')
df_highest_scoring_rate = pd.read_csv('csv_datasets\highest_scoring_rate_2008_2023.csv')

## PLAYERS DATASET (2008-2023)

In [37]:
#players_dataset

df_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  698 non-null    int64  
 1   Name        698 non-null    object 
 2   Team(s)     698 non-null    object 
 3   Matches     698 non-null    int64  
 4   Runs        698 non-null    int64  
 5   Bat Avg     603 non-null    float64
 6   Wickets     698 non-null    int64  
 7   Bowl Avg    432 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 43.8+ KB


- There are 95 missing entries in batting average column, probable cause would be that player is full-time bowler.
- Similarily, 266 missing entries in bowling average column, probable cause would be that player is full time batsman.
- Also, there is no need of Unnamed column, so it must be dropped.

In [38]:
df_players.drop(columns=['Unnamed: 0'], inplace=True)
df_players.rename(columns={'Bat Avg':'Bat_Avg'},inplace=True)
df_players.rename(columns={'Bowl Avg':'Bowl_Avg'},inplace=True)

In [39]:
#checking for uniqueness and duplicity

len(df_players.Name.unique())


695

In [40]:
# Calculate the count of occurrences for each name
name_counts = df_players['Name'].value_counts()

# Filter out the names that are not unique
non_unique_names = name_counts[name_counts > 1].index.tolist()

# Print the non-unique names
print("Non-unique names:")
print(non_unique_names)


Non-unique names:
['S Singh', 'Harmeet Singh']


In [41]:
# Identify duplicate names
duplicate_mask = df_players.duplicated(subset=['Name'], keep=False)

# Invert the mask to keep only unique names
df_players = df_players[~duplicate_mask]

df_players.info()

<class 'pandas.core.frame.DataFrame'>
Index: 693 entries, 0 to 697
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      693 non-null    object 
 1   Team(s)   693 non-null    object 
 2   Matches   693 non-null    int64  
 3   Runs      693 non-null    int64  
 4   Bat_Avg   599 non-null    float64
 5   Wickets   693 non-null    int64  
 6   Bowl_Avg  429 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 43.3+ KB


In [42]:
# Now, fill null values with 0
df_players['Bowl_Avg'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_players['Bowl_Avg'].fillna(0, inplace=True)


In [43]:
df_players.info()

<class 'pandas.core.frame.DataFrame'>
Index: 693 entries, 0 to 697
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      693 non-null    object 
 1   Team(s)   693 non-null    object 
 2   Matches   693 non-null    int64  
 3   Runs      693 non-null    int64  
 4   Bat_Avg   599 non-null    float64
 5   Wickets   693 non-null    int64  
 6   Bowl_Avg  693 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 43.3+ KB


In [44]:
df_players.isnull().count()

Name        693
Team(s)     693
Matches     693
Runs        693
Bat_Avg     693
Wickets     693
Bowl_Avg    693
dtype: int64

## MOST RUN IN ALL SEASON (2008-2023)

In [32]:
df_highest_run.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  51 non-null     int64  
 1   Player      51 non-null     object 
 2   Team(s)     50 non-null     object 
 3   Matches     50 non-null     float64
 4   Inns        50 non-null     float64
 5   NO          50 non-null     float64
 6   Runs        50 non-null     float64
 7   HS          50 non-null     object 
 8   100s        50 non-null     float64
 9   50s         50 non-null     float64
 10  Avg         50 non-null     float64
 11  S/R         50 non-null     float64
 12  4s          50 non-null     float64
 13  6s          50 non-null     float64
dtypes: float64(10), int64(1), object(3)
memory usage: 5.7+ KB


- There is an extra entry with Null values, which will be removed.
- Also, redundant columns need to be removed.

In [45]:
df_highest_run.drop(columns=['Unnamed: 0'],inplace = True)

In [47]:
# Remove '\t' characters from the 'Player' column inplace
df_highest_run['Player'].replace('\t', '', regex=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_highest_run['Player'].replace('\t', '', regex=True, inplace=True)


In [48]:
df_highest_run.tail(5)

Unnamed: 0,Player,Team(s),Matches,Inns,NO,Runs,HS,100s,50s,Avg,S/R,4s,6s
46,K S Williamson,"SUN, GTI",77.0,75.0,17.0,2101.0,89,0.0,18.0,36.22,126.03,181.0,64.0
47,A J Finch,"RRO, DDV, PWI, SUN, MIN, GUJ, KXI, RCB, KKR",92.0,90.0,7.0,2091.0,88*,0.0,15.0,25.19,128.2,213.0,78.0
48,R A Tripathi,"RPS, RRO, KKR, SUN",89.0,87.0,11.0,2071.0,93,0.0,11.0,27.25,138.99,205.0,78.0
49,A C Gilchrist,"DCH, KXI",80.0,80.0,4.0,2069.0,109*,2.0,11.0,27.22,138.39,239.0,92.0
50,No. of Records = 50,,,,,,,,,,,,


In [49]:
df_highest_run = df_highest_run.dropna(axis=0)
df_highest_run.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Player   50 non-null     object 
 1   Team(s)  50 non-null     object 
 2   Matches  50 non-null     float64
 3   Inns     50 non-null     float64
 4   NO       50 non-null     float64
 5   Runs     50 non-null     float64
 6   HS       50 non-null     object 
 7   100s     50 non-null     float64
 8   50s      50 non-null     float64
 9   Avg      50 non-null     float64
 10  S/R      50 non-null     float64
 11  4s       50 non-null     float64
 12  6s       50 non-null     float64
dtypes: float64(10), object(3)
memory usage: 5.5+ KB


In [50]:
#checking for uniqueness
len(df_highest_run.Player.unique())

50

## HIGHEST BATTING AVERAGE (2008-2023)

In [44]:
df_highest_batting_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  51 non-null     int64  
 1   Player      51 non-null     object 
 2   Team(s)     50 non-null     object 
 3   Matches     50 non-null     float64
 4   Inns        50 non-null     float64
 5   NO          50 non-null     float64
 6   Runs        50 non-null     float64
 7   HS          50 non-null     object 
 8   100s        50 non-null     float64
 9   50s         50 non-null     float64
 10  Avg         50 non-null     float64
 11  S/R         50 non-null     float64
 12  4s          50 non-null     float64
 13  6s          50 non-null     float64
dtypes: float64(10), int64(1), object(3)
memory usage: 5.7+ KB


In [51]:
df_highest_batting_avg.head(2)

Unnamed: 0.1,Unnamed: 0,Player,Team(s),Matches,Inns,NO,Runs,HS,100s,50s,Avg,S/R,4s,6s
0,0,M N\t van Wyk,KKR,5.0,5.0,2.0,167.0,74,0.0,1.0,55.67,126.52,19.0,1.0
1,1,C\t Green,MIN,16.0,16.0,7.0,452.0,100*,1.0,2.0,50.22,160.28,40.0,22.0


- Dropping the redundant column and fixing the Player names with '\t' error.

In [52]:
df_highest_batting_avg.drop(columns = ['Unnamed: 0'], inplace=True)
df_highest_batting_avg['Player'].replace('\t', '', regex=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_highest_batting_avg['Player'].replace('\t', '', regex=True, inplace=True)


In [53]:
df_highest_batting_avg.tail(5)

Unnamed: 0,Player,Team(s),Matches,Inns,NO,Runs,HS,100s,50s,Avg,S/R,4s,6s
46,B J Rohrer,DDV,8.0,8.0,2.0,193.0,64*,0.0,1.0,32.17,137.86,21.0,5.0
47,S A Yadav,"MIN, KKR",139.0,124.0,22.0,3249.0,103*,1.0,21.0,31.85,143.32,349.0,112.0
48,S S Iyer,"DDV, KKR",101.0,101.0,13.0,2776.0,96,0.0,19.0,31.55,125.38,237.0,97.0
49,G Gambhir,"DDV, KKR",154.0,152.0,16.0,4217.0,93,0.0,36.0,31.01,123.88,492.0,59.0
50,No. of Records = 50,,,,,,,,,,,,


In [54]:
df_highest_batting_avg = df_highest_batting_avg.dropna(axis=0)
df_highest_batting_avg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Player   50 non-null     object 
 1   Team(s)  50 non-null     object 
 2   Matches  50 non-null     float64
 3   Inns     50 non-null     float64
 4   NO       50 non-null     float64
 5   Runs     50 non-null     float64
 6   HS       50 non-null     object 
 7   100s     50 non-null     float64
 8   50s      50 non-null     float64
 9   Avg      50 non-null     float64
 10  S/R      50 non-null     float64
 11  4s       50 non-null     float64
 12  6s       50 non-null     float64
dtypes: float64(10), object(3)
memory usage: 5.5+ KB


## HIGHEST SCORING RATE (2008-2023)

In [55]:
df_highest_scoring_rate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  51 non-null     int64  
 1   Player      51 non-null     object 
 2   Team(s)     50 non-null     object 
 3   Matches     50 non-null     float64
 4   Inns        50 non-null     float64
 5   NO          50 non-null     float64
 6   Runs        50 non-null     float64
 7   HS          50 non-null     object 
 8   100s        50 non-null     float64
 9   50s         50 non-null     float64
 10  Avg         50 non-null     float64
 11  S/R         50 non-null     float64
 12  4s          50 non-null     float64
 13  6s          50 non-null     float64
dtypes: float64(10), int64(1), object(3)
memory usage: 5.7+ KB


- Removing players name error, removing redundant column and null records.

In [56]:
df_highest_scoring_rate.drop(columns = ['Unnamed: 0'], inplace=True)
df_highest_scoring_rate = df_highest_scoring_rate.dropna(axis=0)
# Remove '\t' characters from the 'Player' column inplace
df_highest_scoring_rate['Player'].replace('\t', '', regex=True, inplace=True)
df_highest_scoring_rate.tail(5)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_highest_scoring_rate['Player'].replace('\t', '', regex=True, inplace=True)


Unnamed: 0,Player,Team(s),Matches,Inns,NO,Runs,HS,100s,50s,Avg,S/R,4s,6s
45,S A Yadav,"MIN, KKR",139.0,124.0,22.0,3249.0,103*,1.0,21.0,31.85,143.32,349.0,112.0
46,M J Lumb,"RRO, DCH",12.0,12.0,0.0,278.0,83,0.0,1.0,23.17,143.3,45.0,6.0
47,S M Curran,"KXI, CSK",46.0,36.0,11.0,613.0,55*,0.0,3.0,24.52,143.22,50.0,31.0
48,M M Ali,"RCB, CSK",59.0,52.0,6.0,1034.0,93,0.0,5.0,22.48,143.02,88.0,59.0
49,Y K Pathan,"RRO, KKR, SUN",174.0,154.0,44.0,3204.0,100,1.0,13.0,29.13,142.97,262.0,158.0


# Players performance ( 2008-2019 )

In [57]:
df_players_2008_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   batsman        516 non-null    object 
 1   total_runs     516 non-null    int64  
 2   out            516 non-null    int64  
 3   numberofballs  516 non-null    int64  
 4   average        482 non-null    float64
 5   strikerate     516 non-null    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 24.3+ KB


- Some values in average columns are not present. Lets fill them with the 0.

In [58]:
#checking for unique records
len(df_players_2008_2019.batsman.unique())

516

- All entries are unique. No duplicity.

In [59]:
average_bat = df_players_2008_2019['average'].mean()
df_players_2008_2019['average'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_players_2008_2019['average'].fillna(0, inplace=True)


In [60]:
df_players_2008_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   batsman        516 non-null    object 
 1   total_runs     516 non-null    int64  
 2   out            516 non-null    int64  
 3   numberofballs  516 non-null    int64  
 4   average        516 non-null    float64
 5   strikerate     516 non-null    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 24.3+ KB


## KPIs / Inferences 
Q. Player with Most Runs Across All Seasons: Identify the player who has scored the highest total runs across all seasons. This indicates the most prolific run-scorer in the history of IPL.

Q. Average Strike Rate Across Top Performers and All Seasons: Calculate the average strike rate for the top performers (e.g., top 10 run-scorers) and compare it with the average strike rate across all seasons. This helps in understanding the scoring efficiency of top players compared to the overall league performance.

Q. Most Consistent Players with Batting Average Greater than the Average: Identify players whose batting average is consistently above the average batting average across all seasons. This highlights players who consistently perform well with the bat.

Q. T20 Most Valuable Players with Above-Average Strike Rate: Determine the players with a strike rate higher than the average strike rate across all seasons. These players are considered valuable in the T20 format due to their ability to score quickly.

Q. All-rounders with Decent Bowling and Batting Average: Identify players who have both a decent bowling average and batting average. This indicates players who contribute significantly to both batting and bowling aspects of the game, making them valuable assets to their teams.

<b>Strike-rate</b> vs <b>Batting average?</b>

<b>Batting Average</b>: This metric indicates the average number of runs scored by a player per dismissal (out). A higher batting average suggests that the player consistently contributes runs to the team and has a good ability to stay at the crease. It reflects the player's consistency and ability to build innings.

<b>Strike Rate</b>: Strike rate measures how quickly a batsman scores runs. It represents the number of runs scored by a player per 100 balls faced. A higher strike rate indicates that the player scores runs more quickly, often through aggressive stroke play. While a high strike rate can be valuable in limited-overs cricket formats like T20 and One Day Internationals (ODIs), it's also important to balance it with consistency and the ability to build innings.

In [61]:
# calculating average batting avg across all seasons

df_players.info()

<class 'pandas.core.frame.DataFrame'>
Index: 693 entries, 0 to 697
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      693 non-null    object 
 1   Team(s)   693 non-null    object 
 2   Matches   693 non-null    int64  
 3   Runs      693 non-null    int64  
 4   Bat_Avg   599 non-null    float64
 5   Wickets   693 non-null    int64  
 6   Bowl_Avg  693 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 43.3+ KB


In [62]:
df_players.describe()

Unnamed: 0,Matches,Runs,Bat_Avg,Wickets,Bowl_Avg
count,693.0,693.0,599.0,693.0,693.0
mean,32.675325,440.168831,16.327112,15.777778,21.467114
std,44.329363,980.613178,10.81801,30.617201,21.510253
min,1.0,0.0,0.0,0.0,0.0
25%,5.0,10.0,8.0,0.0,0.0
50%,13.0,56.0,14.81,3.0,23.76
75%,42.0,300.0,23.04,15.0,33.2
max,250.0,7263.0,69.0,187.0,136.0


Bowl_Avg:

- The average bowling average (Bowl_Avg) across all players is approximately 21.47.
- The standard deviation is around 21.51, indicating a significant variation in the bowling performance among players.
- The minimum bowling average is 0, which might indicate players who have not bowled or have not taken any wickets.
- The maximum bowling average is 136, suggesting outstanding bowling performances by some players.

Runs:

- The average number of runs scored by players is approximately 440.17.
- There is a considerable variation in the runs scored, as indicated by the standard deviation of approximately 980.61.
- The minimum runs scored is 0, which could represent players who have not batted or have not scored any runs.
- The maximum runs scored is 7263, indicating exceptional batting performances by certain players.

Batting Avg:

- The average batting average (Bat_Avg) among players is about 14.11.
- The standard deviation of approximately 11.51 suggests variability in batting performance among players.
- The minimum batting average is 0, which might indicate players with no batting performance or not-outs.
- The maximum batting average is 69, showing exceptional consistency and effectiveness in batting for some players.

Wickets:

- The average number of wickets taken by players is around 15.78.
- There is a notable variation in the wickets taken, as indicated by the standard deviation of approximately 30.62.
- The minimum number of wickets taken is 0, indicating players who have not bowled or have not taken any wickets.
- The maximum number of wickets taken is 187, indicating exceptional bowling performances by certain players.


# Top-10 run Scorer ( across all seasons )

In [63]:
# Top-10 Run Scorer
df_top_players = df_players.sort_values(by='Runs', ascending=False)

# Select the top 10 players based on total runs
top_10_players = df_top_players.head(10)

# Print the top 10 players
print(top_10_players[['Name', 'Runs', 'Matches','Bat_Avg']])


                Name  Runs  Matches  Bat_Avg
280          V Kohli  7263      237    37.25
138         S Dhawan  6617      217    35.39
674       D A Warner  6397      176    41.54
552       R G Sharma  6211      243    29.58
467        S K Raina  5528      205    32.52
132  A B de Villiers  5162      184    39.71
139        M S Dhoni  5082      250    38.79
175        C H Gayle  4965      142    39.72
649      R V Uthappa  4952      205    27.51
261      K D Karthik  4516      242    25.81


- V Kohli has scored the maximum runs (7263) with batting average of 37. One of the most consistent and promising players.

In [64]:
top_10_players.describe()

Unnamed: 0,Matches,Runs,Bat_Avg,Wickets,Bowl_Avg
count,10.0,10.0,10.0,10.0,10.0
mean,210.1,5669.3,34.782,6.6,22.733
std,34.955368,895.910468,5.601791,9.252027,30.485785
min,142.0,4516.0,25.81,0.0,0.0
25%,189.25,4994.25,30.315,0.0,0.0
50%,211.0,5345.0,36.32,2.0,8.25
75%,240.75,6350.5,39.48,12.25,38.5075
max,250.0,7263.0,41.54,25.0,92.0


- Top-10 Run scorers have batting average ranging near 34 in T20-format.
- Also, the average number of matches played to score such runs range about 210.

# Top-10 players with Highest strike rate ( across all seasons )

In [65]:
# Top-10 players with highest S/R

df_top_sr = df_highest_scoring_rate.sort_values(by='S/R',ascending=False)
top_10_sr = df_top_sr.head(10)
print(top_10_sr[['Player','Matches','Runs','S/R']])

            Player  Matches    Runs     S/R
0        T H David     25.0   418.0  177.87
1       L J Wright      7.0   106.0  176.67
2      A D Russell    112.0  2262.0  174.00
3        D C Jurel     13.0   152.0  172.73
4        K  Cooper     25.0   116.0  170.59
5    B C J Cutting     21.0   238.0  168.79
6        K Gowtham     35.0   247.0  166.89
7      Rashid Khan    109.0   443.0  166.54
8        H Klaasen     19.0   514.0  165.81
9  L S Livingstone     32.0   828.0  165.60


In [66]:
top_10_sr.describe()

Unnamed: 0,Matches,Inns,NO,Runs,100s,50s,Avg,S/R,4s,6s
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,39.8,29.6,7.9,532.4,0.1,1.9,22.815,170.549,36.3,40.0
std,38.156986,26.704972,6.367452,647.074476,0.316228,3.414023,7.954687,4.561149,42.5886,56.170376
min,7.0,6.0,1.0,106.0,0.0,0.0,12.89,165.6,9.0,3.0
25%,19.5,14.75,4.0,173.5,0.0,0.0,15.68,166.6275,15.0,11.0
50%,25.0,20.5,5.5,332.5,0.0,0.0,21.675,169.69,20.0,22.5
75%,34.25,30.75,9.0,496.25,0.0,1.75,28.7175,173.6825,34.75,34.0
max,112.0,96.0,20.0,2262.0,1.0,10.0,36.71,177.87,150.0,193.0


In [67]:
df_highest_scoring_rate.describe()

Unnamed: 0,Matches,Inns,NO,Runs,100s,50s,Avg,S/R,4s,6s
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,55.48,47.88,10.26,1088.58,0.48,5.82,26.1226,154.0416,93.44,59.76
std,52.926302,47.408188,12.084414,1297.757203,1.19932,8.515748,8.200183,10.367368,114.560056,74.096616
min,6.0,6.0,0.0,106.0,0.0,0.0,12.89,142.97,9.0,3.0
25%,16.25,14.0,1.25,223.0,0.0,0.0,21.31,145.0425,16.0,12.5
50%,31.5,25.0,6.0,447.5,0.0,2.0,26.485,151.475,39.0,28.5
75%,92.25,68.0,14.25,1245.5,0.0,7.5,29.5925,162.365,136.5,72.25
max,189.0,171.0,52.0,5162.0,6.0,40.0,50.22,177.87,413.0,355.0


- Top-10 players with highest s/r have played 39 matches on average.
- A D Russell scored more than 2000 Runs in almost 112 matches with strike rate of 174.
- The mean of batting average of top S/R players(s/r >=140 only) is 26.


# Top-10 Players with highest batting average ( across all seasons )

In [68]:
top_10_batting_avg = df_highest_batting_avg.head(10)
print(top_10_batting_avg)

            Player             Team(s)  Matches   Inns    NO    Runs    HS  \
0      M N van Wyk                 KKR      5.0    5.0   2.0   167.0    74   
1          C Green                 MIN     16.0   16.0   7.0   452.0  100*   
2       D P Conway                 CSK     23.0   22.0   3.0   924.0   92*   
3        K L Rahul  RCB, SUN, KXI, LSG    118.0  109.0  20.0  4163.0  132*   
4    Sai Sudharsan                 GTI     13.0   13.0   2.0   507.0    96   
5        A C Voges                 RRO      9.0    7.0   3.0   181.0   45*   
6         H M Amla                 KXI     16.0   16.0   3.0   577.0  104*   
7    Iqbal Abdulla       KKR, RRO, RCB     49.0   13.0  11.0    88.0   33*   
8       D A Warner            DDV, SUN    176.0  176.0  22.0  6397.0   126   
9  P D Collingwood                 DDV      8.0    7.0   2.0   203.0   75*   

   100s   50s    Avg     S/R     4s     6s  
0   0.0   1.0  55.67  126.52   19.0    1.0  
1   1.0   2.0  50.22  160.28   40.0   22.0  
2   0.

In [69]:
top_10_batting_avg.describe()

Unnamed: 0,Matches,Inns,NO,Runs,100s,50s,Avg,S/R,4s,6s
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,43.3,38.4,7.5,1365.9,1.1,11.6,46.316,134.268,129.9,50.0
std,57.669653,57.318215,7.677529,2143.543559,1.66333,19.95662,4.398114,14.254843,208.848403,79.253461
min,5.0,5.0,2.0,88.0,0.0,0.0,40.6,104.76,9.0,1.0
25%,10.0,8.5,2.25,186.5,0.0,1.25,44.095,127.46,16.0,5.5
50%,16.0,14.5,3.0,479.5,0.0,3.0,45.67,135.725,43.5,18.0
75%,42.5,20.5,10.0,837.25,1.75,7.75,48.1675,140.94,89.25,28.0
max,176.0,176.0,22.0,6397.0,4.0,61.0,55.67,160.28,646.0,226.0


In [70]:
df_highest_batting_avg.describe()

Unnamed: 0,Matches,Inns,NO,Runs,100s,50s,Avg,S/R,4s,6s
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,75.5,71.82,13.26,2147.48,1.1,14.76,37.8498,135.948,199.08,80.74
std,65.285324,62.506062,14.803213,1902.771579,1.619398,14.689383,5.336624,10.732923,182.946724,78.10506
min,5.0,5.0,1.0,88.0,0.0,0.0,31.01,104.76,9.0,1.0
25%,23.5,22.0,3.0,641.75,0.0,4.0,33.8175,128.535,55.0,26.75
50%,55.5,54.0,9.0,1364.5,1.0,10.5,36.9,135.12,132.5,53.5
75%,102.5,100.0,19.25,2889.75,1.0,19.75,39.9075,142.0625,293.0,113.5
max,250.0,229.0,87.0,7263.0,7.0,61.0,55.67,165.81,750.0,355.0


- Players with highest batting average on median scale have played around 55 matches with median batting average of 36 and s/r of 135.
- Top-10 players with highest batting average have average strike rate of 135 with mean batting average of 46.

# No. of players with batting average greater than average of batting average (across all seasons)

In [71]:
df_players.describe()

Unnamed: 0,Matches,Runs,Bat_Avg,Wickets,Bowl_Avg
count,693.0,693.0,599.0,693.0,693.0
mean,32.675325,440.168831,16.327112,15.777778,21.467114
std,44.329363,980.613178,10.81801,30.617201,21.510253
min,1.0,0.0,0.0,0.0,0.0
25%,5.0,10.0,8.0,0.0,0.0
50%,13.0,56.0,14.81,3.0,23.76
75%,42.0,300.0,23.04,15.0,33.2
max,250.0,7263.0,69.0,187.0,136.0


In [72]:
average_batting_avg = df_players['Bat_Avg'].mean()
players_above_avg = df_players[df_players['Bat_Avg'] > average_batting_avg]
num_players_above_avg = len(players_above_avg)

print("Number of players with a batting average greater than the average:", num_players_above_avg)


Number of players with a batting average greater than the average: 269


# Most valuable players with strike rate and batting average greater than average for all players from season -  2008-2019 

In [73]:
df_players_2008_2019.head(5)

Unnamed: 0,batsman,total_runs,out,numberofballs,average,strikerate
0,V Kohli,5426,152,4111,35.697368,131.987351
1,SK Raina,5386,160,3916,33.6625,137.538304
2,RG Sharma,4902,161,3742,30.447205,130.999466
3,DA Warner,4717,114,3292,41.377193,143.286756
4,S Dhawan,4601,137,3665,33.583942,125.538881


In [74]:
df_players_2008_2019.describe()

Unnamed: 0,total_runs,out,numberofballs,average,strikerate
count,516.0,516.0,516.0,516.0,516.0
mean,430.625969,17.063953,335.645349,15.277414,105.433442
std,882.275431,28.124511,663.593679,11.512252,39.633938
min,0.0,0.0,1.0,0.0,0.0
25%,15.0,2.0,17.0,7.0,84.362069
50%,74.0,5.5,68.0,13.196429,111.651584
75%,340.5,18.0,290.0,22.600806,130.499036
max,5426.0,161.0,4111.0,88.0,250.0


In [75]:
average_bat_2008_2019 = df_players_2008_2019['average'].mean()
average_sr_2008_2019 = df_players_2008_2019['strikerate'].mean()

# Corrected the logical operator from 'and' to '&'
valuable_players_2008_2019 = df_players_2008_2019[(df_players_2008_2019['average'] > average_bat_2008_2019) & (df_players_2008_2019['strikerate'] > average_sr_2008_2019)]

valuable_players_2008_2019

Unnamed: 0,batsman,total_runs,out,numberofballs,average,strikerate
0,V Kohli,5426,152,4111,35.697368,131.987351
1,SK Raina,5386,160,3916,33.662500,137.538304
2,RG Sharma,4902,161,3742,30.447205,130.999466
3,DA Warner,4717,114,3292,41.377193,143.286756
4,S Dhawan,4601,137,3665,33.583942,125.538881
...,...,...,...,...,...,...
322,M Santner,33,1,23,33.000000,143.478261
323,D Salunkhe,33,1,24,33.000000,137.500000
355,RR Raje,20,1,18,20.000000,111.111111
368,Y Gnaneswara Rao,19,1,17,19.000000,111.764706


- 201 Players with average and strike rate greater than average of all players for season 2008-2019.

# Finding all rounders with bowling avg. and batting avg. greater than or equal to average of all players for season 2008-2023

In [76]:
df_players.describe()

Unnamed: 0,Matches,Runs,Bat_Avg,Wickets,Bowl_Avg
count,693.0,693.0,599.0,693.0,693.0
mean,32.675325,440.168831,16.327112,15.777778,21.467114
std,44.329363,980.613178,10.81801,30.617201,21.510253
min,1.0,0.0,0.0,0.0,0.0
25%,5.0,10.0,8.0,0.0,0.0
50%,13.0,56.0,14.81,3.0,23.76
75%,42.0,300.0,23.04,15.0,33.2
max,250.0,7263.0,69.0,187.0,136.0


In [77]:
bowl_avg_2008_2023 = df_players['Bowl_Avg'].mean()
bat_avg_2008_2023 = df_players['Bat_Avg'].mean()

all_rounders = df_players[(df_players['Bowl_Avg'] >= bowl_avg_2008_2023) & (df_players['Bat_Avg'] >= bat_avg_2008_2023)]
len(all_rounders)

115

- Total 115 all-rounders found decent batting and bowling averages.