### Importing required libraries

In [4]:
import numpy as np
import pandas as pd
import openpyxl

### Reading test cricket bowler data set

In [6]:
df = pd.read_csv("wickets.csv")

# Display of first 10 rows of the data set
display(df.head(10))

Unnamed: 0,Player,Span,Mat,Inns,Balls,Runs,Wkts,BBI,BBM,Ave,Econ,SR,5,10
0,M Muralitharan (ICC/SL),1992-2010,133,230,44039,18180,800,9/51,16/220,22.72,2.47,55.0,67,22
1,SK Warne (AUS),1992-2007,145,273,40705,17995,708,8/71,12/128,25.41,2.65,57.4,37,10
2,JM Anderson (ENG),2003-2021,164*,304,35079,16575,623,7/42,11/71,26.6,2.83,56.3,30,3
3,A Kumble (INDIA),1990-2008,132,236,40850,18355,619,10/74,14/149,29.65,2.69,65.9,35,8
4,GD McGrath (AUS),1993-2007,124,243,29248,12186,563,8/24,10/27,21.64,2.49,51.9,29,3
5,SCJ Broad (ENG),2007-2021,149,274,29863,14590,524,8/15,11/121,27.84,2.93,56.9,18,3
6,CA Walsh (WI),1984-2001,132,242,30019,12688,519,7/37,13/55,24.44,2.53,57.8,22,3
7,DW Steyn (SA),2004-2019,93,171,18608,10077,439,7/51,11/60,22.95,3.24,42.3,26,5
8,N Kapil Dev (INDIA),1978-1994,131,227,27740,12867,434,9/83,11/146,29.64,2.78,63.9,23,2
9,HMRKB Herath (SL),1999-2018,93,170,25993,12157,433,9/127,14/184,28.07,2.8,60.0,34,9


##### Description of each column of the data set
- Player: Name of the player and their country name or for whom they played for
- Span: Total duration of their career
- Mat: Total number of matches they played during their career
- Inns: Total number of innings they played
- Balls: Total number of balls they bowled
- Runs: Total number of runs they conceded from total number of balls bowled
- Wkts: Total number of wickets they took
- BBI: stands for 'Best Bowling in Innings' i.e. figure gives best bowling in one inning
- BBM: stands for 'Best Bowling in Match' and gives the combined score over 2 or more innings in one match
- Average: The average number of runs conceded per wicket. (Ave = Runs/W)
- Econ: The average number of runs conceded per over. (Econ = Runs/Overs bowled)
- SR: Stands for 'strike rate'. This is a bowling SR, which is different from batting SR. Bowling SR is defined for a bowler as the average number of balls bowled per wicket taken. The lower the strike rate, the more effective a bowler is at taking wickets quickly.
- 5: The number of innings in which the bowler took at least five wickets
- 10: The number of matches in which the bowler took at least ten wickets


###  number of rows and columns of the dataframe

In [7]:
# number of rows
print("number of rows = ", df.shape[0])

# number of columns
print("number of columns = ", df.shape[1])

number of rows =  79
number of columns =  14


###  Data statistics and data types check

In [9]:
# checking data statistics
display(df.describe())

# checking for missing values and data types of each column
print(df.info())

Unnamed: 0,Inns,Balls,Runs,Wkts,Ave,Econ,SR,5,10
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,144.911392,18638.35443,8599.35443,317.21519,27.469747,2.806835,59.193671,16.35443,2.797468
std,51.180222,7199.256972,3085.168807,121.924911,3.655658,0.351577,9.350132,9.642372,3.235935
min,67.0,8785.0,4846.0,200.0,20.94,1.98,41.2,3.0,0.0
25%,110.0,13583.0,6456.5,229.0,24.5,2.6,53.3,9.5,1.0
50%,129.0,16498.0,7742.0,266.0,28.0,2.82,57.4,14.0,2.0
75%,169.0,21742.5,9756.0,374.5,29.87,3.08,63.95,20.5,3.5
max,304.0,44039.0,18355.0,800.0,34.79,3.46,91.9,67.0,22.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  79 non-null     object 
 1   Span    79 non-null     object 
 2   Mat     79 non-null     object 
 3   Inns    79 non-null     int64  
 4   Balls   79 non-null     int64  
 5   Runs    79 non-null     int64  
 6   Wkts    79 non-null     int64  
 7   BBI     79 non-null     object 
 8   BBM     79 non-null     object 
 9   Ave     79 non-null     float64
 10  Econ    79 non-null     float64
 11  SR      79 non-null     float64
 12  5       79 non-null     int64  
 13  10      79 non-null     int64  
dtypes: float64(3), int64(6), object(5)
memory usage: 8.8+ KB
None


#### There is no missing value found in the data set

### Renaming the column names of the data set

In [10]:
# column names
print(df.columns)

Index(['Player', 'Span', 'Mat', 'Inns', 'Balls', 'Runs', 'Wkts', 'BBI', 'BBM',
       'Ave', 'Econ', 'SR', '5', '10'],
      dtype='object')


In [11]:
df = df.rename(columns={'Mat':'Match', 
                        'Inns':'Innings',
                        'Wkts': 'Wickets',
                        'BBI': 'Best_bowling_innings',
                        'BBM': 'Best_bowling_match',
                        'Ave': 'Average',
                        'Econ': 'Economy',
                        'SR': 'Bowling_strike_rate',
                        5: 'Five_wickets_taken',
                        10: 'Ten_wickets_taken',})

display(df.head())

Unnamed: 0,Player,Span,Match,Innings,Balls,Runs,Wickets,Best_bowling_innings,Best_bowling_match,Average,Economy,Bowling_strike_rate,5,10
0,M Muralitharan (ICC/SL),1992-2010,133,230,44039,18180,800,9/51,16/220,22.72,2.47,55.0,67,22
1,SK Warne (AUS),1992-2007,145,273,40705,17995,708,8/71,12/128,25.41,2.65,57.4,37,10
2,JM Anderson (ENG),2003-2021,164*,304,35079,16575,623,7/42,11/71,26.6,2.83,56.3,30,3
3,A Kumble (INDIA),1990-2008,132,236,40850,18355,619,10/74,14/149,29.65,2.69,65.9,35,8
4,GD McGrath (AUS),1993-2007,124,243,29248,12186,563,8/24,10/27,21.64,2.49,51.9,29,3


### Romoving a column

In [13]:
df = df.drop('Best_bowling_match', axis=1)
display(df.head())

Unnamed: 0,Player,Span,Match,Innings,Balls,Runs,Wickets,Best_bowling_innings,Average,Economy,Bowling_strike_rate,5,10
0,M Muralitharan (ICC/SL),1992-2010,133,230,44039,18180,800,9/51,22.72,2.47,55.0,67,22
1,SK Warne (AUS),1992-2007,145,273,40705,17995,708,8/71,25.41,2.65,57.4,37,10
2,JM Anderson (ENG),2003-2021,164*,304,35079,16575,623,7/42,26.6,2.83,56.3,30,3
3,A Kumble (INDIA),1990-2008,132,236,40850,18355,619,10/74,29.65,2.69,65.9,35,8
4,GD McGrath (AUS),1993-2007,124,243,29248,12186,563,8/24,21.64,2.49,51.9,29,3
