In [12]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [13]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from fuzzywuzzy import process
import warnings

# Set the working directory
os.chdir('E:/JESIN/DOCUMENTS/scma/A1b')
print(os.getcwd())

# Load IPL data
ipl_bbb = pd.read_csv('IPL_ball_by_ball_updated till 2024.csv', low_memory=False)
ipl_salary = pd.read_excel('IPL SALARIES 2024.xlsx')

# Display a sample of the salary data
print("Sample of IPL Salary data:")
print(ipl_salary.head(2))

# Group data to get aggregated information
grouped_data = ipl_bbb.groupby(['Season', 'Innings No', 'Striker', 'Bowler']).agg({'runs_scored': sum, 'wicket_confirmation': sum}).reset_index()
player_runs = grouped_data.groupby(['Season', 'Striker'])['runs_scored'].sum().reset_index()
player_wickets = grouped_data.groupby(['Season', 'Bowler'])['wicket_confirmation'].sum().reset_index()

# Display top run getters in 2023
print("\nTop Run Getters in 2023:")
print(player_runs[player_runs['Season'] == 2023].sort_values(by='runs_scored', ascending=False).head())

# Find top three run getters and wicket takers per season
top_run_getters = player_runs.groupby('Season').apply(lambda x: x.nlargest(3, 'runs_scored')).reset_index(drop=True)
bottom_wicket_takers = player_wickets.groupby('Season').apply(lambda x: x.nlargest(3, 'wicket_confirmation')).reset_index(drop=True)

print("\nTop Three Run Getters:")
print(top_run_getters)
print("\nTop Three Wicket Takers:")
print(bottom_wicket_takers)

# Create DataFrame for year and Match id
ipl_year_id = pd.DataFrame(columns=["id", "year"])
ipl_year_id["id"] = ipl_bbb["Match id"]
ipl_year_id["year"] = pd.to_datetime(ipl_bbb["Date"], dayfirst=True).dt.year

# Create a copy of ipl_bbb dataframe and add 'year' column
ipl_bbbc = ipl_bbb.copy()
ipl_bbbc['year'] = pd.to_datetime(ipl_bbb["Date"], dayfirst=True).dt.year
ipl_bbbc = ipl_bbbc[["Match id", "year", "runs_scored", "wicket_confirmation", "Bowler", "Striker"]]

# Define function to determine best distribution fit using Kolmogorov-Smirnov test
def get_best_distribution(data):
    dist_names = ['alpha', 'beta', 'betaprime', 'burr12', 'crystalball',
                  'dgamma', 'dweibull', 'erlang', 'exponnorm', 'f', 'fatiguelife',
                  'gamma', 'gengamma', 'gumbel_l', 'johnsonsb', 'kappa4',
                  'lognorm', 'nct', 'norm', 'norminvgauss', 'powernorm', 'rice',
                  'recipinvgauss', 't', 'trapz', 'truncnorm']
    dist_results = []
    params = {}
    for dist_name in dist_names:
        dist = getattr(st, dist_name)
        param = dist.fit(data)
        params[dist_name] = param
        # Applying the Kolmogorov-Smirnov test
        D, p = st.kstest(data, dist_name, args=param)
        print("p value for "+dist_name+" = "+str(p))
        dist_results.append((dist_name, p))
    # select the best fitted distribution
    best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
    # store the name of the best fit and its p value
    print("\nBest fitting distribution: "+str(best_dist))
    print("Best p value: "+ str(best_p))
    print("Parameters for the best fit: "+ str(params[best_dist]))
    return best_dist, best_p, params[best_dist]

# Group by year and Striker to get total runs scored each year
total_run_each_year = ipl_bbbc.groupby(["year", "Striker"])["runs_scored"].sum().reset_index()
total_run_each_year.sort_values(["year", "runs_scored"], ascending=False, inplace=True)

# Print total runs each year for reference
print("\nTotal Runs Each Year:")
print(total_run_each_year)

# Get top batsmen for last three years
list_top_batsman_last_three_year = {}
for year in total_run_each_year["year"].unique()[:3]:
    list_top_batsman_last_three_year[year] = total_run_each_year[total_run_each_year.year == year].nlargest(3, 'runs_scored')["Striker"].tolist()

# Debug: print top batsmen for last three years
print("\nTop Batsmen for Last Three Years:")
print(list_top_batsman_last_three_year)

# Suppress warnings
warnings.filterwarnings('ignore')

# Group by Striker and Match id to get runs scored by each batsman in each match
runs = ipl_bbbc.groupby(['Striker', 'Match id'])[['runs_scored']].sum().reset_index()

# Iterate through top batsmen for last three years and find best distribution for runs scored
for year, batsmen in list_top_batsman_last_three_year.items():
    for batsman in batsmen:
        print("********")
        print("Year:", year, " Batsman:", batsman)
        get_best_distribution(runs[runs["Striker"] == batsman]["runs_scored"])
        print("\n\n")

# Group by year and Bowler to get total wickets taken each year
total_wicket_each_year = ipl_bbbc.groupby(["year", "Bowler"])["wicket_confirmation"].sum().reset_index()
total_wicket_each_year.sort_values(["year", "wicket_confirmation"], ascending=False, inplace=True)

# Print total wickets each year for reference
print("\nTotal Wickets Each Year:")
print(total_wicket_each_year)

# Get top bowlers for last three years
list_top_bowler_last_three_year = {}
for year in total_wicket_each_year["year"].unique()[:3]:
    list_top_bowler_last_three_year[year] = total_wicket_each_year[total_wicket_each_year.year == year].nlargest(3, 'wicket_confirmation')["Bowler"].tolist()

# Debug: print top bowlers for last three years
print("\nTop Bowlers for Last Three Years:")
print(list_top_bowler_last_three_year)

# Group by Bowler and Match id to get wickets taken by each bowler in each match
wickets = ipl_bbbc.groupby(['Bowler', 'Match id'])[['wicket_confirmation']].sum().reset_index()

# Iterate through top bowlers for last three years and find best distribution for wickets taken
for year, bowlers in list_top_bowler_last_three_year.items():
    for bowler in bowlers:
        print("********")
        print("Year:", year, " Bowler:", bowler)
        get_best_distribution(wickets[wickets["Bowler"] == bowler]["wicket_confirmation"])
        print("\n\n")

# Filter runs data for the year 2024
R2024 = total_run_each_year[total_run_each_year['year'] == 2024]

# Function to match names and calculate correlation between Salary and Runs
def match_names_and_correlate(name, names_list, salary_df, runs_df):
    match, score = process.extractOne(name, names_list)
    if score >= 80:
        matched_data = salary_df[salary_df['Player'] == match]
        matched_runs = runs_df[runs_df['Striker'] == match]
        df_merged = pd.merge(matched_data, matched_runs, left_on='Player', right_on='Striker')
        # Calculate correlation
        correlation = df_merged['Rs'].corr(df_merged['runs_scored'])
        print(f"Correlation between Salary and Runs for {match}: {correlation}")
    else:
        print(f"No sufficient match found for {name}.")

# Iterate through salary data and find correlations for Ishan Kishan
df_salary = ipl_salary.copy()
df_runs = R2024.copy()

for player in df_salary['Player'].unique():
    print("********")
    print(f"Player: {player}")
    match_names_and_correlate(player, df_runs['Striker'].tolist(), df_salary, df_runs)
    print("\n")

# End of script
print("Script execution completed.")



E:\JESIN\DOCUMENTS\scma\A1b
Sample of IPL Salary data:
           Player     Salary   Rs  international  iconic
0  Abhishek Porel    20 lakh   20              0     NaN
1   Anrich Nortje  6.5 crore  650              1     NaN


  grouped_data = ipl_bbb.groupby(['Season', 'Innings No', 'Striker', 'Bowler']).agg({'runs_scored': sum, 'wicket_confirmation': sum}).reset_index()
  grouped_data = ipl_bbb.groupby(['Season', 'Innings No', 'Striker', 'Bowler']).agg({'runs_scored': sum, 'wicket_confirmation': sum}).reset_index()



Top Run Getters in 2023:
Empty DataFrame
Columns: [Season, Striker, runs_scored]
Index: []

Top Three Run Getters:
     Season          Striker  runs_scored
0   2007/08         SE Marsh          616
1   2007/08        G Gambhir          534
2   2007/08    ST Jayasuriya          514
3      2009        ML Hayden          572
4      2009     AC Gilchrist          495
5      2009   AB de Villiers          465
6   2009/10     SR Tendulkar          618
7   2009/10        JH Kallis          572
8   2009/10         SK Raina          528
9      2011         CH Gayle          608
10     2011          V Kohli          557
11     2011     SR Tendulkar          553
12     2012         CH Gayle          733
13     2012        G Gambhir          590
14     2012         S Dhawan          569
15     2013       MEK Hussey          733
16     2013         CH Gayle          720
17     2013          V Kohli          639
18     2014       RV Uthappa          660
19     2014         DR Smith          566
20

p value for beta = 0.2800374272685796
p value for betaprime = 0.7272275700648236
p value for burr12 = 0.03413730383965219
p value for crystalball = 0.835174953613428
p value for dgamma = 0.9003132708081405
p value for dweibull = 0.8965770306228721
p value for erlang = 0.2710277691398305
p value for exponnorm = 0.8246418777999891
p value for f = 0.9743698554720728
p value for fatiguelife = 0.8259440652110397
p value for gamma = 0.004088711345359375
p value for gengamma = 0.029688848326628436
p value for gumbel_l = 0.391243924609637
p value for johnsonsb = 0.6775536294207896
p value for kappa4 = 0.04273156928199129
p value for lognorm = 0.9006026891568572
p value for nct = 0.9627359408368513
p value for norm = 0.8351750214399875
p value for norminvgauss = 0.8696382419018381
p value for powernorm = 0.837790705015941
p value for rice = 0.8419161308192361
p value for recipinvgauss = 0.7846020832234206
p value for t = 0.8945403499225024
p value for trapz = 4.962305050994183e-07
p value for t

p value for f = 4.2346585152678845e-12
p value for fatiguelife = 0.12498847851930361
p value for gamma = 0.027350558506526124
p value for gengamma = 0.0926892512677634
p value for gumbel_l = 9.485045980257123e-06
p value for johnsonsb = 0.3450941869097196
p value for kappa4 = 3.832745782875419e-18
p value for lognorm = 2.3658846096591403e-28
p value for nct = 0.2843302460638113
p value for norm = 0.058469111112182226
p value for norminvgauss = 0.2268711891858597
p value for powernorm = 0.033823716873628396
p value for rice = 0.03349090516310227
p value for recipinvgauss = 0.1073883725317526
p value for t = 0.041656498991066715
p value for trapz = 3.947363741930107e-50
p value for truncnorm = 0.08860764609495919

Best fitting distribution: burr12
Best p value: 0.4931279667432148
Parameters for the best fit: (590926023.7998527, 0.05483081555360233, -969803927.022117, 969803927.160071)




Total Wickets Each Year:
      year             Bowler  wicket_confirmation
1836  2024           HV 

p value for truncnorm = 2.539236515610462e-06

Best fitting distribution: alpha
Best p value: 0.0005609846480252995
Parameters for the best fit: (6.734843933630203, -5.500744811228249, 44.826257131250145)



********
Year: 2023  Bowler: Rashid Khan
p value for alpha = 1.4259399000489275e-06
p value for beta = 8.8954046965209e-27
p value for betaprime = 3.407105814148136e-65
p value for burr12 = 2.5587675833251047e-18
p value for crystalball = 2.99049361738744e-09
p value for dgamma = 6.928485900596178e-10
p value for dweibull = 6.928168431614811e-10
p value for erlang = 1.052461604472364e-41
p value for exponnorm = 7.720335528170629e-07
p value for f = 4.940207066298226e-10
p value for fatiguelife = 1.4667845015790087e-07
p value for gamma = 3.120866167200452e-31
p value for gengamma = 3.3780076161228415e-35
p value for gumbel_l = 7.911140658362043e-09
p value for johnsonsb = 6.659510229977693e-18
p value for kappa4 = 6.390225516379688e-22
p value for lognorm = 6.677625232671758e-27
p 

Correlation between Salary and Runs for S Dube: nan


********
Player: Simarjeet Singh
Correlation between Salary and Runs for RK Singh: nan


********
Player: Tushar Deshpande
No sufficient match found for Tushar Deshpande.


********
Player: Abhinav Sadarangani
No sufficient match found for Abhinav Sadarangani.


********
Player: B. Sai Sudharsan
Correlation between Salary and Runs for B Sai Sudharsan: nan


********
Player: Darshan Nalkande
Correlation between Salary and Runs for DG Nalkande: nan


********
Player: David Miller
Correlation between Salary and Runs for TH David: nan


********
Player: Jayant Yadav
Correlation between Salary and Runs for SA Yadav: nan


********
Player: Joshua Little
No sufficient match found for Joshua Little.


********
Player: Kane Williamson
Correlation between Salary and Runs for KS Williamson: nan


********
Player: Matthew Wade
Correlation between Salary and Runs for MS Wade: nan


********
Player: Mohammad Shami
Correlation between Salary and R

No sufficient match found for Yuzvendra Chahal.


********
Player: Akash Deep
Correlation between Salary and Runs for Akash Deep: nan


********
Player: Anuj Rawat
Correlation between Salary and Runs for Anuj Rawat: nan


********
Player: Dinesh  Karthik
Correlation between Salary and Runs for KD Karthik: nan


********
Player: Faf Du Plessis
Correlation between Salary and Runs for F du Plessis: nan


********
Player: Glenn Maxwell
No sufficient match found for Glenn Maxwell.


********
Player: Himanshu Sharma
Correlation between Salary and Runs for RG Sharma: nan


********
Player: Karn Sharma
Correlation between Salary and Runs for I Sharma: nan


********
Player: Mahipal Lomror
Correlation between Salary and Runs for MK Lomror: nan


********
Player: Manoj Bhandage
No sufficient match found for Manoj Bhandage.


********
Player: Mayank Dagar (T)
Correlation between Salary and Runs for Mayank Dagar: nan


********
Player: Mohammed Siraj
Correlation between Salary and Runs for Mohamme