### Loading the Libraries

In [None]:
import numpy as np
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/numpy diagnostics raised by the cells below
# (e.g. mean-of-empty-slice warnings from np.nanmean) — confirm that is intended.
warnings.filterwarnings('ignore')

### Loading the Datasets

In [None]:
# Training matches. The file name suggests pre-computed sample features are included — TODO confirm columns.
df_train = pd.read_csv('/content/663e2b6d54457_train_data_with_samplefeatures.csv')

In [None]:
# Untouched snapshot of the training frame, taken before any feature columns are added.
df_train_copy = df_train.copy()

In [None]:
# Test matches. The file name suggests pre-computed sample features are included — TODO confirm columns.
df_test = pd.read_csv('/content/6644a1e287df6_test_data_with_samplefeatures.csv')

In [None]:
# Untouched snapshot of the test frame.
df_test_copy = df_test.copy()

In [None]:
# Per-bowler scorecard rows; read by giveLastNgamesPlayer(..., bat_or_bowl='bowl').
bowler_lvl_data = pd.read_csv('/content/663e2b2c60743_bowler_level_scorecard.csv')

In [None]:
# Per-batsman scorecard rows; read by giveLastNgamesPlayer(..., bat_or_bowl='bat').
batsman_lvl_data = pd.read_csv('/content/663e2b548c98c_batsman_level_scorecard.csv')

In [None]:
# Match-level scorecard; the team/ground history helpers below filter this frame by 'match_dt'.
df_match_score = pd.read_csv('/content/664389efa0868_match_level_scorecard.csv')

In [None]:
# Untouched snapshot of the match-level scorecard, taken before 'team1_bat_inning' is added.
df_match_score_copy = df_match_score.copy()

### Viewing the Datasets

In [None]:
# Display the training frame (bare expression, rendered as the cell output).
df_train

In [None]:
# Display the test frame.
df_test

In [None]:
# Display the bowler-level scorecard.
bowler_lvl_data

In [None]:
# Display the batsman-level scorecard.
batsman_lvl_data

In [None]:
# List the columns available in the match-level scorecard.
df_match_score.columns

### Useful Function

Creating features by merging the information present in different files.

In [None]:
from tqdm import tqdm
# Register tqdm with pandas; the `progress_apply` calls in the feature cells below rely on this.
tqdm.pandas()

In [None]:
# pyplot is already imported as `plt` in the first cell; this re-import is redundant but harmless.
from matplotlib import pyplot as plt
# Versioned seaborn style name; per the original note it is chosen to avoid a warning.
plt.style.use('seaborn-v0_8')  # safer and avoids warning

In [None]:
# Binary label: 1 when team1 is the recorded winner, 0 otherwise.
df_train['target'] = (df_train['winner_id'] == df_train['team1_id']).astype(int)

In [None]:
import re
def createRnP(X_12, feature, N=5, ylim_lb=0.1, ylim_ub=0.9):
    '''
    Rank and Plot of input feature on the input data. The y-axis shows %team1 wins in each bucket.

    Parameters-
    1. X_12: dataset to build the RnP on. Must contain the `feature` column and a 0/1 'target'
       column where 1 means team1 won.
    2. feature: Feature to build RnP of.
    3. N: number of bins on x-axis. Default 5.
    4. ylim_lb: lower bound of y axis on plot.
    5. ylim_ub: upper bound of y axis on plot.

    Output-
    1. Rank and Plot (bar chart of team1 win rate per bucket).
    2. Printed 'slope': ratio of the larger to the smaller of the first/last bucket win rates.

    Returns- None
    '''
    df = X_12.copy()
    # Percentile-rank the feature and cut it into N equal-volume buckets. The top-ranked row has
    # pct == 1.0, which for some N (e.g. 2, 4, 10) floors to bucket N, so clip it into the last bucket.
    df[f'{feature}_bin'] = (df[feature].rank(pct=True)//(1/N)).clip(upper=N-1)
    df['count'] = 1
    df['team1_win%'] = df['target']      # target is already the team1-win indicator
    df['team2_win%'] = 1 - df['target']  # complement: team2-win indicator
    df[f'{feature}_min'] = df[feature]
    df[f'{feature}_max'] = df[feature]
    df_g = df.groupby(f'{feature}_bin').agg({'team1_win%':'mean', 'team2_win%':'mean', 'count':'sum',
                                             f'{feature}_min':'min', f'{feature}_max':'max'}).reset_index()
    blue_bar = df_g['team1_win%'].values.tolist()
    # One bar per bucket actually present (fewer than N when the feature has few distinct values).
    ind = np.arange(len(blue_bar))
    # plotting starts
    plt.figure(figsize=(10,5))
    plt.bar(ind, blue_bar, label='Team 1 win%')
    plt.axhline(y=0.5, linewidth=0.5, color='k', linestyle = '--')
    xlabel = re.sub('team_','ratio_',feature)
    plt.xlabel(f'{xlabel} (team1 / team2) bins')
    plt.ylabel('Win %')
    plt.title(f'RnP - {feature} vs win')
    # Label each bucket with the min-max range of feature values it contains.
    df_g['xticks'] = df_g.apply(lambda x: str(round(x[f'{feature}_min'],2)) + ' - ' + str(round(x[f'{feature}_max'],2)), axis=1)
    plt.xticks(ind, df_g['xticks'])
    plt.ylim([ylim_lb,ylim_ub])
    plt.legend(loc='best')
    # Slope = larger end-bucket win rate / smaller end-bucket win rate, so it is always >= 1.
    x2,x1 = blue_bar[-1],blue_bar[0]
    if x2 < x1:
        x1,x2 = x2,x1
    if x1 == 0:
        # A 0% bucket would divide by zero; report inf (or nan when both ends are 0).
        slope = float('nan') if x2 == 0 else float('inf')
    else:
        slope = x2/x1
    print('slope:', round(x2,2),'/',round(x1,2), '= ',round(slope,2))
    plt.show()

In [None]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Fetch a player's most recent n scorecard rows strictly before a date.

    Input-
    1. player_id: id of the player (converted with float() before comparison).
    2. date: cut-off; only rows with 'match_dt' earlier than this are considered.
    3. n: maximum number of rows to return.
    4. bat_or_bowl: 'bat' reads batsman_lvl_data; any other value reads bowler_lvl_data.

    Output-None

    Returns- dataframe of up to n batting/bowling rows for the player, newest 'match_dt' first.
    '''
    if bat_or_bowl == 'bat':
        source, key = batsman_lvl_data, 'batsman_id'
    else:
        source, key = bowler_lvl_data, 'bowler_id'

    before_cutoff = source['match_dt'] < date
    same_player = source[key] == float(player_id)
    history = source[before_cutoff & same_player]
    return history.sort_values(by='match_dt', ascending=False).head(n)

In [None]:
def no50sLastn(player_list, date, n):
    '''
    Count the innings of 50+ runs made by a roster over each player's last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look back per player.

    Output-None

    Returns- total number of innings with runs >= 50 across all players in the roster.
    '''
    roster = str(player_list).split(':')  # ':' separated ids -> list of ids
    fifties_per_player = [
        # 1 for every innings with runs >= 50 in the player's last n batting rows, summed per player
        np.nansum(np.where(
            giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bat')['runs'] >= 50, 1, 0))
        for pid in roster
    ]
    return np.nansum(fifties_per_player)  # roster-wide total

In [None]:
def topThreeBatsmenRuns(player_list, date, n):
    '''
    Function to get the total runs of the top 3 batsmen in the last n games before a specified date.

    Input-
    1. player_list: ':' separated list of player ids.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of games to look-back and create this feature.

    Output-None

    Returns- The combined runs of the (up to) three roster players with the highest run totals
    over their own last n games.
    '''
    roster = str(player_list).split(':')  # Split string of ':' separated ids into a list of ids

    # Total runs per player over that player's last n batting rows (NaN runs are ignored).
    run_totals = [
        np.nansum(giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')['runs'])
        for player in roster
    ]

    # Keep the three largest totals and add them up.
    return sum(sorted(run_totals, reverse=True)[:3])
def topThreeBowlersWickets(player_list, date, n):
    '''
    Combined wickets of a roster's three most prolific bowlers over their last n games.

    Input-
    1. player_list: ':' separated list of player ids.
    2. date: cut-off date; only games before it are used.
    3. n: Number of games to look back per player.

    Output-None

    Returns- Sum of the (up to) three largest per-player 'wicket_count' totals.
    '''
    roster = str(player_list).split(':')

    wicket_totals = []
    for pid in roster:
        spells = giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bowl')
        wicket_totals.append(np.nansum(spells['wicket_count']))  # NaN counts are ignored

    # Largest totals first; only the top three contribute.
    best_three = sorted(wicket_totals, reverse=True)[:3]
    return sum(best_three)


In [None]:
def topThreeBatsmenStrikeRates(player_list, date, n=15):
    '''
    Mean strike rate of the three fastest-scoring qualifying batsmen in a roster.

    A batsman qualifies when runs per game over the last n games exceed 20.

    Input-
    1. player_list: ':' separated list of player ids.
    2. date: cut-off date; only games before it are used.
    3. n: Number of games to look back per player. Default 15.

    Output-None

    Returns- Average of the three highest strike rates (runs / balls faced * 100) among
    qualifying batsmen, or 0 when fewer than three batsmen qualify.
    '''
    roster = str(player_list).split(':')
    qualifying_rates = []

    for pid in roster:
        games = giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bat')
        runs = np.nansum(games['runs'])
        balls = np.nansum(games['balls_faced'])
        played = len(games)

        runs_per_game = runs / played if played > 0 else 0
        if runs_per_game > 20:
            # Strike rate falls back to 0 when no balls are recorded.
            qualifying_rates.append((runs / balls) * 100 if balls > 0 else 0)

    best_three = sorted(qualifying_rates, reverse=True)[:3]
    if len(best_three) != 3:
        return 0  # not enough qualifying batsmen
    return sum(best_three) / 3


In [None]:
def topThreeBowlersEconomy(player_list, date, n=15):
    '''
    Mean economy rate of the three most economical qualifying bowlers in a roster.

    A bowler qualifies when overs bowled per game over the last n games exceed 2.

    Input-
    1. player_list: ':' separated list of player ids.
    2. date: cut-off date; only games before it are used.
    3. n: Number of games to look back per player. Default 15.

    Output-None

    Returns- Average of the three lowest economy rates (runs conceded / overs bowled) among
    qualifying bowlers, or 0 when fewer than three bowlers qualify.
    '''
    roster = str(player_list).split(':')
    qualifying_rates = []

    for pid in roster:
        spells = giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bowl')
        conceded = np.nansum(spells['runs'])
        overs = np.nansum(spells['balls_bowled']) / 6  # six balls per over
        appearances = len(spells)

        overs_per_game = overs / appearances if appearances > 0 else 0
        if overs_per_game > 2:
            qualifying_rates.append((conceded / overs) if overs > 0 else 0)

    # Ascending sort: a lower economy rate is better.
    best_three = sorted(qualifying_rates)[:3]
    if len(best_three) != 3:
        return 0  # not enough qualifying bowlers
    return sum(best_three) / 3


In [None]:
def no30sLastn(player_list, date, n):
    '''
    Count the innings of 30+ runs made by a roster over each player's last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look back per player.

    Output-None

    Returns- total number of innings with runs >= 30 across all players in the roster.
    '''
    roster = str(player_list).split(':')  # ':' separated ids -> list of ids
    thirties_per_player = [
        # 1 for every innings with runs >= 30 in the player's last n batting rows, summed per player
        np.nansum(np.where(
            giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bat')['runs'] >= 30, 1, 0))
        for pid in roster
    ]
    return np.nansum(thirties_per_player)  # roster-wide total

In [None]:
# Innings in which team1 batted: 1 when team1 won the toss and chose to bat, or team2 won the
# toss and chose to field; 2 in every other case.
df_match_score['team1_bat_inning'] = np.where(
    (
        (df_match_score['team1'] == df_match_score['toss winner'])
        & (df_match_score['toss decision'] == 'bat')
    )
    | (
        (df_match_score['team2'] == df_match_score['toss winner'])
        & (df_match_score['toss decision'] == 'field')
    ),
    1,
    2,
)


def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average innings runs in their last n games.

    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output- None

    Return- Float mean of the runs scored by the input team in its own batting innings over
    those games (NaN when the team has no earlier games).
    '''
    # Games involving the team before the input date, newest first, capped at n rows.
    earlier = df_match_score['match_dt'] < date
    involved = (df_match_score['team1_id'] == team_id) | (df_match_score['team2_id'] == team_id)
    df_rel = df_match_score[earlier & involved].sort_values(by='match_dt', ascending=False).head(n)

    # 'team1_bat_inning' describes team1 only. The input team batted first when it is team1 and
    # team1 batted first, or when it is team2 and team1 batted second; otherwise it batted second.
    is_team1 = df_rel['team1_id'] == team_id
    team1_batted_first = df_rel['team1_bat_inning'] == 1
    batted_first = is_team1 == team1_batted_first

    # Pick the innings total that belongs to the input team in each game.
    team_runs = df_rel['inning1_runs'].where(batted_first, df_rel['inning2_runs'])
    return team_runs.mean()


def teamAvgStrikeRateLastn(team_id, date, n):
    '''
    Function to calculate a team's average innings runs in their last n games.

    NOTE(review): despite the name, no balls-faced data is used here; the value is the mean of
    the team's innings run totals, not a true strike rate. Confirm whether that is intended.

    Input-
    1. team_id: ID of the team to calculate the feature for.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output- None

    Return- Float mean of the runs scored by the input team in its own batting innings over
    those games (NaN when the team has no earlier games).
    '''
    # Games involving the team before the input date, newest first, capped at n rows.
    earlier = df_match_score['match_dt'] < date
    involved = (df_match_score['team1_id'] == team_id) | (df_match_score['team2_id'] == team_id)
    df_rel = df_match_score[earlier & involved].sort_values(by='match_dt', ascending=False).head(n)

    # 'team1_bat_inning' describes team1 only. The input team batted first when it is team1 and
    # team1 batted first, or when it is team2 and team1 batted second; otherwise it batted second.
    is_team1 = df_rel['team1_id'] == team_id
    team1_batted_first = df_rel['team1_bat_inning'] == 1
    batted_first = is_team1 == team1_batted_first

    # Pick the innings total that belongs to the input team in each game.
    team_innings = df_rel['inning1_runs'].where(batted_first, df_rel['inning2_runs'])
    return team_innings.mean()

In [None]:
# Display the training frame before the engineered features are added.
df_train

In [None]:
# Combined runs of each side's three highest-scoring roster players over their last 15 games.
df_train['best_batsman1']=df_train.progress_apply(lambda x: topThreeBatsmenRuns(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_train['best_batsman2']=df_train.progress_apply(lambda x: topThreeBatsmenRuns(x['team2_roster_ids'], x['match_dt'], 15), axis=1)
# Combined wickets of each side's three most prolific bowlers over their last 15 games.
df_train['best_bowler1']=df_train.progress_apply(lambda x: topThreeBowlersWickets(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_train['best_bowler2']=df_train.progress_apply(lambda x: topThreeBowlersWickets(x['team2_roster_ids'], x['match_dt'], 15), axis=1)
# team1 / team2 ratios; +1 on both sides avoids division by zero.
df_train['runsratio']=(df_train['best_batsman1']+1)/(df_train['best_batsman2']+1)
df_train['wicketsratio']=(df_train['best_bowler1']+1)/(df_train['best_bowler2']+1)
# Mean strike rate of each side's top three qualifying batsmen (0 when fewer than three qualify).
df_train['best_strikerate1']=df_train.progress_apply(lambda x: topThreeBatsmenStrikeRates(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_train['best_strikerate2']=df_train.progress_apply(lambda x: topThreeBatsmenStrikeRates(x['team2_roster_ids'], x['match_dt'], 15), axis=1)


In [None]:
# team1 / team2 strike-rate ratio; +1 on both sides avoids division by zero.
df_train['strike_ratio']=(df_train['best_strikerate1']+1)/(df_train['best_strikerate2']+1)
# Mean economy of each side's three most economical qualifying bowlers over their last 15 games.
# NOTE(review): topThreeBowlersEconomy returns 0 when fewer than three bowlers qualify, which is
# indistinguishable from a very good economy in the ratio below — confirm this is acceptable.
df_train['best_economy1']=df_train.progress_apply(lambda x: topThreeBowlersEconomy(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_train['best_economy2']=df_train.progress_apply(lambda x: topThreeBowlersEconomy(x['team2_roster_ids'], x['match_dt'], 15), axis=1)
df_train['economy_ratio']=(df_train['best_economy1']+1)/(df_train['best_economy2']+1)

In [None]:
# team1 win rate across quintiles of the top-3 batsmen runs ratio.
createRnP(X_12=df_train, feature='runsratio', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the top-3 bowlers wickets ratio.
createRnP(X_12=df_train, feature='wicketsratio', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of team1's top-3 strike rate.
createRnP(X_12=df_train, feature='best_strikerate1', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of team2's top-3 strike rate.
createRnP(X_12=df_train, feature='best_strikerate2', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of team1's top-3 economy.
createRnP(X_12=df_train, feature='best_economy1', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of team2's top-3 economy.
createRnP(X_12=df_train, feature='best_economy2', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the strike-rate ratio.
createRnP(X_12=df_train, feature='strike_ratio', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the economy ratio.
createRnP(X_12=df_train, feature='economy_ratio', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# Average innings runs over each team's last 15 games, per side.
df_train['team1only_avg_runs_last15'] = df_train.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team1_id'], x['match_dt'], 15), axis=1)
df_train['team2only_avg_runs_last15'] = df_train.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)
# team1 / team2 ratio; +1 on both sides avoids division by zero.
df_train['ratio_avg_runs_last_15'] = (df_train['team1only_avg_runs_last15']+1)/(df_train['team2only_avg_runs_last15']+1)
df_train.drop(columns=['team1only_avg_runs_last15','team2only_avg_runs_last15'], inplace=True) # dropping intermediate columns

In [None]:
# Per-side value from teamAvgStrikeRateLastn over the last 15 games.
# NOTE(review): that helper averages innings runs, so this feature mirrors ratio_avg_runs_last_15 — confirm.
df_train['team1_avg_strike_rate_last_15'] = df_train.progress_apply(lambda x: \
                                  teamAvgStrikeRateLastn(x['team1_id'], x['match_dt'], 15), axis=1)
df_train['team2_avg_strike_rate_last_15'] = df_train.progress_apply(lambda x: \
                                  teamAvgStrikeRateLastn(x['team2_id'], x['match_dt'], 15), axis=1)
# team1 / team2 ratio; +1 on both sides avoids division by zero.
df_train['ratio_strike_rate'] = (df_train['team1_avg_strike_rate_last_15']+1)/(df_train['team2_avg_strike_rate_last_15']+1)
df_train.drop(columns=['team1_avg_strike_rate_last_15','team2_avg_strike_rate_last_15'], inplace=True) # dropping intermediate columns


In [None]:
# Average innings runs over each team's last 5 games, per side.
df_train['team1only_avg_runs_last5'] = df_train.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team1_id'], x['match_dt'], 5), axis=1)
df_train['team2only_avg_runs_last5'] = df_train.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 5), axis=1)

# team1 / team2 ratio; +1 on both sides avoids division by zero.
df_train['ratio_avg_runs_last_5'] = (df_train['team1only_avg_runs_last5']+1)/(df_train['team2only_avg_runs_last5']+1)
df_train.drop(columns=['team1only_avg_runs_last5','team2only_avg_runs_last5'], inplace=True) # dropping intermediate columns

In [None]:
# Number of 50+ innings by team1's roster, looking back 15 games per player.
df_train['team1_count_50runs_last15_'] = df_train.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Number of 50+ innings by team2's roster, looking back 15 games per player.
df_train['team2_count_50runs_last15_'] = df_train.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# team1 / team2 ratio; +1 on both sides avoids division by zero.
df_train['team_count_50runs_last15'] = (df_train['team1_count_50runs_last15_']+1)/(df_train['team2_count_50runs_last15_']+1)
df_train.drop(columns=['team1_count_50runs_last15_','team2_count_50runs_last15_'], inplace=True) # dropping intermediate columns

In [None]:
# Number of 30+ innings by team1's roster, looking back 15 games per player.
df_train['team1_count_30runs_last15_'] = df_train.progress_apply(lambda x: \
            no30sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Number of 30+ innings by team2's roster, looking back 15 games per player.
df_train['team2_count_30runs_last15_'] = df_train.progress_apply(lambda x: \
            no30sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# team1 / team2 ratio; +1 on both sides avoids division by zero.
df_train['team_count_30runs_last15'] = (df_train['team1_count_30runs_last15_']+1)/(df_train['team2_count_30runs_last15_']+1)
df_train.drop(columns=['team1_count_30runs_last15_','team2_count_30runs_last15_'], inplace=True) # dropping intermediate columns

In [None]:
# team1 win rate across quintiles of the 50+ innings count ratio.
createRnP(X_12=df_train, feature='team_count_50runs_last15', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the last-5 average runs ratio.
createRnP(X_12=df_train, feature='ratio_avg_runs_last_5', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the ratio_strike_rate feature.
createRnP(X_12=df_train, feature='ratio_strike_rate', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the 30+ innings count ratio.
createRnP(X_12=df_train, feature='team_count_30runs_last15', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# ### 2. team_winp_last5 <br>
#
# Ratio of team1's win % to team2's win % in last 5 games

# In[20]:


def winpLastn(team_id, date, n):
    '''
    Win percentage of a team over its most recent n games before a date.
    Example: 3 wins in the last 5 games gives 60.0.

    Input-
    1. team_id: ID of the team whose recent games are inspected.
    2. date: cut-off; only games with 'match_dt' earlier than this are used.
    3. n: look-back window of games.

    Output- None

    Returns- win% rounded to 2 decimals, or 0 when the team has no wins (including no games).
    '''
    earlier = df_match_score['match_dt'] < date
    played = (df_match_score['team1_id'] == team_id) | (df_match_score['team2_id'] == team_id)
    # newest games first, capped at n rows
    recent = df_match_score[earlier & played].sort_values(by='match_dt', ascending=False).head(n)

    wins = int((recent['winner_id'] == team_id).sum())
    if not wins:
        return 0  # also covers an empty history, avoiding a division by zero
    return round(wins * 100 / len(recent), 2)

In [None]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Head-to-head win percentage of team1 against team2 over their last n meetings before a date.

    Input-
    1. team1_id: ID of the team whose win% is computed.
    2. team2_id: ID of the opponent.
    3: date: cut-off; only games with 'match_dt' earlier than this are used.
    4. n: look-back window of head-to-head games.

    Output- None

    Returns- team1's win% in those meetings rounded to 2 decimals, or 0 when team1 has no wins
    (including when the teams have not met).
    '''
    earlier = df_match_score['match_dt'] < date
    # The pairing can appear in either column order.
    as_listed = (df_match_score['team1_id'] == team1_id) & (df_match_score['team2_id'] == team2_id)
    swapped = (df_match_score['team1_id'] == team2_id) & (df_match_score['team2_id'] == team1_id)
    meetings = df_match_score[earlier & (as_listed | swapped)]
    meetings = meetings.sort_values(by='match_dt', ascending=False).head(n)

    wins = int((meetings['winner_id'] == team1_id).sum())
    if not wins:
        return 0  # also covers no prior meetings, avoiding a division by zero
    return round(wins * 100 / len(meetings), 2)



In [None]:
# team1's head-to-head win% against team2 over their last 15 meetings before the match date.
df_train['team1_winp_team2_last15'] = df_train.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)

In [None]:
# team1's head-to-head win% against team2 over their last 5 meetings before the match date.
df_train['team1_winp_team2_last5'] = df_train.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 5), axis=1)

In [None]:
# Compute team1's win% in last 5 games
df_train['team1_winp_last5'] = df_train.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 5), axis=1)
# Compute team2's win% in last 5 games
df_train['team2_winp_last5'] = df_train.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 5), axis=1)


# Take the ratio of (team1's win% in their last 5 games)/(team2's win% in their last 5 games). Adding 1 to avoid divide by zero error
# Note: despite the '_versus' suffix, each win% is against all opponents, not head-to-head.
df_train['team_winp_last5_versus'] = (df_train['team1_winp_last5']+1)/(df_train['team2_winp_last5']+1)
df_train.drop(columns=['team1_winp_last5', 'team2_winp_last5'], inplace=True) # drop intermediate columns

In [None]:
# Compute team1's win% in last 15 games
df_train['team1_winp_last15'] = df_train.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 15), axis=1)
# Compute team2's win% in last 15 games
df_train['team2_winp_last15'] = df_train.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 15), axis=1)


# Take the ratio of (team1's win% in their last 15 games)/(team2's win% in their last 15 games). Adding 1 to avoid divide by zero error
df_train['team_winp_last15'] = (df_train['team1_winp_last15']+1)/(df_train['team2_winp_last15']+1)
df_train.drop(columns=['team1_winp_last15', 'team2_winp_last15'], inplace=True) # drop intermediate columns

In [None]:
# Compute team1's win% in its last game (with n=1 winpLastn yields 0 or 100.0)
df_train['team1_winp_last1'] = df_train.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 1), axis=1)
# Compute team2's win% in its last game
df_train['team2_winp_last1'] = df_train.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 1), axis=1)


# Take the ratio of (team1's win% in their last game)/(team2's win% in their last game). Adding 1 to avoid divide by zero error
df_train['team_winp_last1'] = (df_train['team1_winp_last1']+1)/(df_train['team2_winp_last1']+1)
df_train.drop(columns=['team1_winp_last1', 'team2_winp_last1'], inplace=True) # drop intermediate columns

In [None]:
# NOTE(review): 'team_winp_last5' is not created anywhere in the visible notebook (the cell above
# names its output 'team_winp_last5_versus'); presumably it ships with the sample-features CSV — confirm.
createRnP(X_12=df_train, feature='team_winp_last5', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the last-15 head-to-head win%.
createRnP(X_12=df_train, feature='team1_winp_team2_last15', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the last-5 head-to-head win%.
createRnP(X_12=df_train, feature='team1_winp_team2_last5', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the last-5 win% ratio.
createRnP(X_12=df_train, feature='team_winp_last5_versus', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the last-15 win% ratio.
createRnP(X_12=df_train, feature='team_winp_last15', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# ### 5. ground_avg_runs_last15 <br>
# average runs scored in the ground in last 15 games

# In[38]:


def avgRunsGround(ground_id, date, n):
    '''
    Average innings score at a ground over its last n hosted games before a date.

    Input-
    1. ground_id: ID of the ground to calculate the feature for.
    2. date: cut-off; only games with 'match_dt' earlier than this are used.
    3. n: look-back window of games for the ground.

    Output- None

    Returns- Mean over those games of (inning1_runs + inning2_runs) / 2; NaN when the ground
    has no earlier games.
    '''
    earlier = df_match_score['match_dt'] < date
    at_ground = df_match_score['ground_id'] == ground_id
    # newest games first, capped at n rows
    recent = df_match_score[earlier & at_ground].sort_values(by='match_dt', ascending=False).head(n)
    per_game_avg = (recent['inning1_runs'] + recent['inning2_runs']) / 2
    return per_game_avg.mean()


In [None]:
# Average innings runs at the match's ground over the last 5 games hosted there before the match date.
df_train['ground_avg_runs_last5'] = df_train.progress_apply(lambda x: \
                                  avgRunsGround(x['ground_id'], x['match_dt'], 5), axis=1)


In [None]:
# NOTE(review): 'ground_avg_runs_last15' is not created anywhere in the visible notebook (only the
# last-5 variant is); presumably it ships with the sample-features CSV — confirm.
createRnP(X_12=df_train, feature='ground_avg_runs_last15', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the ground's last-5 average runs.
createRnP(X_12=df_train, feature='ground_avg_runs_last5', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
def avg_economy_last_n(player_list, date, n):
    '''
    Roster-level average economy over each player's last n bowling games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look back per player.

    Output-None

    Returns- float: the NaN-ignoring mean, across players, of each player's NaN-ignoring mean
    'economy'. Players without bowling rows contribute NaN and are skipped.
    '''
    roster = str(player_list).split(':')  # ':' separated ids -> list of ids
    per_player_economy = [
        np.nanmean(giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bowl')['economy'])
        for pid in roster
    ]
    return np.nanmean(per_player_economy)

In [None]:
# Roster-average economy for team1, looking back 15 games per player.
df_train['team1_economy'] = df_train.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Roster-average economy for team2, looking back 15 games per player.
df_train['team2_economy'] = df_train.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)


# Taking ratio of (economy in last 15 games for team1) to (economy in last 15 games for team2). Adding 1 to handle divide by zero exceptions.
df_train['team_avg_economy_last15'] = (df_train['team1_economy']+1)/(df_train['team2_economy']+1)
df_train.drop(columns=['team1_economy','team2_economy'], inplace=True) # dropping intermediate columns

In [None]:
# Roster-average economy for team1, looking back 5 games per player.
df_train['team1_economy'] = df_train.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team1_roster_ids'], date=x['match_dt'], n=5), axis=1)
# Roster-average economy for team2, looking back 5 games per player.
df_train['team2_economy'] = df_train.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team2_roster_ids'], date=x['match_dt'], n=5), axis=1)


# Taking ratio of (economy in last 5 games for team1) to (economy in last 5 games for team2). Adding 1 to handle divide by zero exceptions.
df_train['team_avg_economy_last5'] = (df_train['team1_economy']+1)/(df_train['team2_economy']+1)
df_train.drop(columns=['team1_economy','team2_economy'], inplace=True) # dropping intermediate columns

In [None]:
# team1 win rate across quintiles of the last-15 roster economy ratio.
createRnP(X_12=df_train, feature='team_avg_economy_last15', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# team1 win rate across quintiles of the last-5 roster economy ratio.
createRnP(X_12=df_train, feature='team_avg_economy_last5', N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
# Display the training frame with the features engineered so far.
df_train

In [None]:
def avg_wickets_last_n(player_list, date, n):
    '''
    Mean wickets per game across a team's roster, based on each player's last n bowling rows.

    Input-
    1. player_list: ':' separated string of player ids in the roster of a team.
    2. date: match date of the game this feature is built for (forwarded to giveLastNgamesPlayer).
    3. n: Number of games to look back (forwarded to giveLastNgamesPlayer).

    Output-None

    Returns- float: NaN-ignoring mean, over players, of each player's NaN-ignoring mean 'wicket_count'.
    '''
    roster = str(player_list).split(':')  # ':' separated ids -> list of ids
    per_player_means = [
        # one mean per player, taken over the bowling rows returned for that player
        np.nanmean(giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bowl')['wicket_count'])
        for pid in roster
    ]
    return np.nanmean(per_player_means)  # average the per-player means over the roster

In [None]:
# NOTE(review): despite the '*_wickets' column names, both calls below use avg_economy_last_n (n=15), so this
# feature is built from economy and avg_wickets_last_n is never called. The test-set feature code makes the
# same call; if wickets are intended, switch both places together and retrain.
df_train['team1_wickets'] = df_train.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Same computation for team2's roster.
df_train['team2_wickets'] = df_train.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)


# Ratio of team1's value to team2's value; 1 is added to both sides to guard against a zero denominator.
df_train['team_avg_wickets_last15'] = (df_train['team1_wickets']+1)/(df_train['team2_wickets']+1)
df_train.drop(columns=['team1_wickets','team2_wickets'], inplace=True) # dropping intermediate columns

In [None]:
# Rank-and-plot of 'team_avg_wickets_last15': % of team1 wins per bucket, 5 buckets, y-axis limited to 0.3-0.7.
createRnP(df_train, 'team_avg_wickets_last15'	, N=5, ylim_lb=0.3, ylim_ub=0.7)

In [None]:
def won_last_game(team_name,team_id,date):
  '''
  Result of a team's most recent match strictly before `date`.

  The team is looked up by id (as team1 or team2) in the module-level df_match_score table;
  the outcome is decided by comparing that match's 'winner' value with team_name.

  Input-
  1. team_name: team name as it appears in the 'winner' column.
  2. team_id: id matched against 'team1_id' / 'team2_id'.
  3. date: only rows whose 'match_dt' is smaller than this value are considered.

  Returns- 1 if the team won that match, -1 if it did not, 0 if there is no earlier match.
  '''
  played = (df_match_score['team1_id'] == team_id) | (df_match_score['team2_id'] == team_id)
  earlier = df_match_score[played & (df_match_score['match_dt'] < date)]
  if earlier.empty:
    # team debut: nothing to look back on
    return 0
  # sort into a new frame instead of sorting the filtered selection in place
  last_match = earlier.sort_values(by=['match_dt']).iloc[-1]
  return 1 if last_match['winner'] == team_name else -1

In [None]:
# Last-result flag per side from won_last_game: 1 = won the previous match, -1 = did not, 0 = no earlier match.
df_train['won_last_game_1'] = df_train.apply(lambda x: won_last_game(x['team1'], x['team1_id'],x['match_dt']) , axis=1)
df_train['won_last_game_2'] = df_train.apply(lambda x: won_last_game(x['team2'], x['team2_id'],x['match_dt']),axis =1)

In [None]:
def won_last_game_at_ground(team_name,team_id,date,ground_id):
  '''
  Result of a team's most recent match at a given ground, strictly before `date`.

  The team is looked up by id (as team1 or team2) in the module-level df_match_score table,
  restricted to rows whose 'ground_id' equals ground_id.

  Input-
  1. team_name: team name as it appears in the 'winner' column.
  2. team_id: id matched against 'team1_id' / 'team2_id'.
  3. date: only rows whose 'match_dt' is smaller than this value are considered.
  4. ground_id: ground to restrict the look-back to.

  Returns- 1 if the team won that match, -1 if it did not, 0 if it has no earlier match at the ground.
  '''
  played = (df_match_score['team1_id'] == team_id) | (df_match_score['team2_id'] == team_id)
  at_ground = df_match_score['ground_id'] == ground_id
  earlier = df_match_score[played & (df_match_score['match_dt'] < date) & at_ground]
  if earlier.empty:
    # the team has not played on this ground before
    return 0
  # sort into a new frame instead of sorting the filtered selection in place
  last_match = earlier.sort_values(by=['match_dt']).iloc[-1]
  return 1 if last_match['winner'] == team_name else -1

In [None]:
# NOTE(review): these columns are filled by won_last_game, not won_last_game_at_ground, and no ground_id is
# passed, so they duplicate won_last_game_1 / won_last_game_2. The test-set code builds them the same way;
# if the at-ground variant is intended, change both places together and retrain.
df_train['won_last_game_at_ground1'] = df_train.apply(lambda x: won_last_game(x['team1'], x['team1_id'],x['match_dt']) , axis=1)
df_train['won_last_game_at_ground2'] = df_train.apply(lambda x: won_last_game(x['team2'], x['team2_id'],x['match_dt']),axis =1)

### Cleaning the final dataset

In [None]:
# Remove roster strings, descriptive text and identifier columns from the training frame before modelling.
df_train.drop(
    columns=[
        'team1_roster_ids', 'team2_roster_ids', 'winner', 'venue', 'city', 'match_dt',
        'lighting', 'series_name', 'ground_id', 'match id', 'season',
    ],
    inplace=True,
)


In [None]:
df_train.columns

In [None]:
Index(['team1', 'team1_id', 'team2', 'team2_id', 'winner_id', 'toss winner',
       'toss decision', 'team_count_50runs_last15', 'team_winp_last5',
       'team1_winp_team2_last15', 'ground_avg_runs_last15', 'target',
       'best_batsman1', 'best_batsman2', 'best_bowler1', 'best_bowler2',
       'runsratio', 'wicketsratio', 'best_strikerate1', 'best_strikerate2',
       'strike_ratio', 'best_economy1', 'best_economy2', 'economy_ratio',
       'ratio_avg_runs_last_15', 'ratio_strike_rate', 'ratio_avg_runs_last_5',
       'team_count_30runs_last15', 'team1_winp_team2_last5',
       'team_winp_last5_versus', 'team_winp_last15', 'team_winp_last1',
       'ground_avg_runs_last5', 'team_avg_economy_last15',
       'team_avg_economy_last5', 'team_avg_wickets_last15', 'won_last_game_1',
       'won_last_game_2', 'won_last_game_at_ground1',
       'won_last_game_at_ground2'],
      dtype='object')

In [None]:
# Remove the two derived ratio features from the training frame.
cols = ['strike_ratio', 'economy_ratio']
df_train.drop(columns=cols, inplace=True)

In [None]:
# 1 when the 'toss winner' value equals team1's name, else 0.
df_train['toss_winner'] = (df_train['toss winner'] == df_train['team1']).astype(int)

In [None]:
# The toss decision is not used as a feature.
df_train.drop(columns=['toss decision'], inplace=True)


In [None]:
# Team name columns are no longer needed once 'toss_winner' has been derived.
df_train.drop(columns=['team1', 'team2'], inplace=True)


In [None]:
# Drop the team id columns and the raw toss-winner name.
df_train.drop(columns=['team1_id', 'team2_id', 'toss winner'], inplace=True)

In [None]:
# 'target' was derived from winner_id, so the raw id must not remain among the features.
df_train.drop(columns=['winner_id'], inplace=True)

In [None]:
df_train

In [None]:
df_train.info()

In [None]:
df_train.describe()

In [None]:
df_train

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as cv_df, with a fixed seed (40) so the split is reproducible.
# NOTE(review): the tuning, cross-validation and final fit in this part of the notebook use df_train, not train_df/cv_df.
train_df,cv_df = train_test_split(df_train,test_size = 0.2,random_state = 40)

In [None]:
train_df

In [None]:
# Class balance of the label in the training split.
px.histogram(train_df, x='target',title = 'Distribution of Target')

In [None]:
df_train

### Hyperparameter tuning on XGBOOST and LightGBM

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Exhaustive hyperparameter grid for XGBClassifier: 5*2*2*2*2*3*3*3*4 = 8640 combinations.
# NOTE(review): the XGBoost parameter reference lists min_split_loss as an alias of gamma, so searching
# both quadruples the grid and passes the same setting under two names — confirm which one takes effect.
param_grid = {
    'n_estimators': [5,6,7,8,9],
    'learning_rate': [ 0.05, 0.1],
    'max_depth': [3, 4 ],
    'min_child_weight': [ 3, 5],
    'subsample': [0.6, 0.7],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'gamma': [0, 0.05, 0.1],
    'reg_alpha': [0, 0.01, 0.1],
    'min_split_loss': [0, 5 ,10, 20,]
}

xgb = XGBClassifier()

# 5-fold cross-validated search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# The search is fit on the whole of df_train (not on the train_df split).
grid_search.fit(df_train.drop(columns=['target']), df_train['target'])
best_params = grid_search.best_params_

print("Best parameters found: ", best_params)

Fitting 5 folds for each of 8640 candidates, totalling 43200 fits


In [None]:
# XGBoost candidate with hand-entered hyperparameters (presumably the grid-search result — confirm).
# NOTE(review): gamma and min_split_loss are both passed; XGBoost documents them as aliases of one parameter.
model = XGBClassifier(colsample_bytree=0.6, gamma=0, learning_rate=0.1, max_depth=4, min_child_weight=5, min_split_loss=5, n_estimators=8, reg_alpha=0.1, subsample=0.6)

In [None]:
# 5-fold cross-validated score of the current model on the full training frame (estimator's default scorer).
score = cross_val_score(model,df_train.drop(columns=['target']),df_train['target'],cv=5)
score.mean()

In [None]:
# Feature matrix and label vector taken from the frame as it stands before NaN imputation.
X = df_train.drop('target',axis=1)
y = df_train['target']

In [None]:
# Snapshot of the engineered training frame taken before the in-place NaN imputation.
df_train_copy2 = df_train.copy()

In [None]:
# Replace every NaN with its column mean, in place (means are computed over the whole training frame).
df_train.fillna(df_train.mean(),inplace=True) # Testing whether NaN values should be left to the model or dealt with ourselves.

In [None]:
# Rebuild the feature matrix and label vector from the imputed frame.
X = df_train.drop('target',axis=1)
y = df_train['target']

In [None]:
# It is best if NaN values are filled beforehand.

In [None]:
model

In [None]:
# Rebind `model` to a LightGBM classifier; this is the estimator that is fitted and used for the predictions below.
# NOTE(review): no import of LGBMClassifier is visible in this part of the notebook — confirm it is imported earlier.
model = LGBMClassifier(n_estimators=9,reg_alpha=0,max_depth=4,colsample_bytree=0.6,min_split_gain=5,learning_rate=0.1)

In [None]:
# 5-fold cross-validated score of the current model on the imputed training frame (estimator's default scorer).
score = cross_val_score(model,df_train.drop(columns=['target']),df_train['target'],cv=5)
score.mean()

### Training the model

In [None]:
# Fit the current model on every training row, using all columns except 'target' as features.
model.fit(df_train.drop(columns=['target']),df_train['target'])

In [None]:
# In-sample class probabilities; column 1 is later used as the score for target == 1 (team1 wins).
train_preds = model.predict_proba(df_train.drop(columns=['target']))

In [None]:
train_preds

### **FORMATTING THE TEST SET**

In [None]:
# df_test=pd.read_csv('/content/6644a1e287df6_test_data_with_samplefeatures.csv')

In [None]:
# --- Test set: team1/team2 ratio of teamAvgRunsLastn over the last 15 games ---
df_test['team1only_avg_runs_last15'] = df_test.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team1_id'], x['match_dt'], 15), axis=1)
df_test['team2only_avg_runs_last15'] = df_test.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)
# 1 is added to both sides to guard against a zero denominator.
df_test['ratio_avg_runs_last_15'] = (df_test['team1only_avg_runs_last15']+1)/(df_test['team2only_avg_runs_last15']+1)
df_test.drop(columns=['team1only_avg_runs_last15','team2only_avg_runs_last15'], inplace=True) # dropping intermediate columns

# --- Test set: team1/team2 ratio of teamAvgStrikeRateLastn over the last 15 games ---
df_test['team1_avg_strike_rate_last_15'] = df_test.progress_apply(lambda x: \
                                  teamAvgStrikeRateLastn(x['team1_id'], x['match_dt'], 15), axis=1)
df_test['team2_avg_strike_rate_last_15'] = df_test.progress_apply(lambda x: \
                                  teamAvgStrikeRateLastn(x['team2_id'], x['match_dt'], 15), axis=1)
df_test['ratio_strike_rate'] = (df_test['team1_avg_strike_rate_last_15']+1)/(df_test['team2_avg_strike_rate_last_15']+1)
df_test.drop(columns=['team1_avg_strike_rate_last_15','team2_avg_strike_rate_last_15'], inplace=True) # dropping intermediate columns

# --- Test set: team1/team2 ratio of teamAvgRunsLastn over the last 5 games ---
df_test['team1only_avg_runs_last5'] = df_test.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team1_id'], x['match_dt'], 5), axis=1)
df_test['team2only_avg_runs_last5'] = df_test.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 5), axis=1)
df_test['ratio_avg_runs_last_5'] = (df_test['team1only_avg_runs_last5']+1)/(df_test['team2only_avg_runs_last5']+1)
df_test.drop(columns=['team1only_avg_runs_last5','team2only_avg_runs_last5'], inplace=True) # dropping intermediate columns
# --- Test set: no50sLastn over the last 15 games for team1's roster ---
df_test['team1_count_50runs_last15_'] = df_test.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Same for team2's roster.
df_test['team2_count_50runs_last15_'] = df_test.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# team1/team2 ratio; 1 is added to both sides to guard against a zero denominator.
df_test['team_count_50runs_last15'] = (df_test['team1_count_50runs_last15_']+1)/(df_test['team2_count_50runs_last15_']+1)
df_test.drop(columns=['team1_count_50runs_last15_','team2_count_50runs_last15_'], inplace=True) # dropping intermediate columns

# --- Test set: no30sLastn over the last 15 games for team1's roster ---
df_test['team1_count_30runs_last15_'] = df_test.progress_apply(lambda x: \
            no30sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Same for team2's roster (also a 15-game window).
df_test['team2_count_30runs_last15_'] = df_test.progress_apply(lambda x: \
            no30sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

df_test['team_count_30runs_last15'] = (df_test['team1_count_30runs_last15_']+1)/(df_test['team2_count_30runs_last15_']+1)
df_test.drop(columns=['team1_count_30runs_last15_','team2_count_30runs_last15_'], inplace=True) # dropping intermediate columns
# --- Test set: winpCrossLastn for team1 vs team2 over 15-game and 5-game windows ---
df_test['team1_winp_team2_last15'] = df_test.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)
df_test['team1_winp_team2_last5'] = df_test.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 5), axis=1)
# winpLastn for team1 over its last 5 games
df_test['team1_winp_last5'] = df_test.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 5), axis=1)
# winpLastn for team2 over its last 5 games
df_test['team2_winp_last5'] = df_test.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 5), axis=1)

# Per-roster outputs of topThreeBatsmenRuns / topThreeBowlersWickets (15-game window), kept as features,
# plus the team1/team2 ratio of each (1 added to both sides to guard against a zero denominator).
df_test['best_batsman1']=df_test.progress_apply(lambda x: topThreeBatsmenRuns(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_test['best_batsman2']=df_test.progress_apply(lambda x: topThreeBatsmenRuns(x['team2_roster_ids'], x['match_dt'], 15), axis=1)
df_test['best_bowler1']=df_test.progress_apply(lambda x: topThreeBowlersWickets(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_test['best_bowler2']=df_test.progress_apply(lambda x: topThreeBowlersWickets(x['team2_roster_ids'], x['match_dt'], 15), axis=1)
df_test['runsratio']=(df_test['best_batsman1']+1)/(df_test['best_batsman2']+1)
df_test['wicketsratio']=(df_test['best_bowler1']+1)/(df_test['best_bowler2']+1)


# Ratio of team1's to team2's winpLastn value over the last 5 games. Adding 1 to avoid divide by zero error
df_test['team_winp_last5_versus'] = (df_test['team1_winp_last5']+1)/(df_test['team2_winp_last5']+1)
df_test.drop(columns=['team1_winp_last5', 'team2_winp_last5'], inplace=True) # drop intermediate columns
# winpLastn for team1 over its last 15 games
df_test['team1_winp_last15'] = df_test.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 15), axis=1)
# winpLastn for team2 over its last 15 games
df_test['team2_winp_last15'] = df_test.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 15), axis=1)


# Ratio of team1's to team2's value over the last 15 games. Adding 1 to avoid divide by zero error
df_test['team_winp_last15'] = (df_test['team1_winp_last15']+1)/(df_test['team2_winp_last15']+1)
df_test.drop(columns=['team1_winp_last15', 'team2_winp_last15'], inplace=True) # drop intermediate columns
# winpLastn for team1 over its last 1 game
df_test['team1_winp_last1'] = df_test.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 1), axis=1)
# winpLastn for team2 over its last 1 game
df_test['team2_winp_last1'] = df_test.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 1), axis=1)


# Ratio of team1's to team2's value over the last 1 game. Adding 1 to avoid divide by zero error
df_test['team_winp_last1'] = (df_test['team1_winp_last1']+1)/(df_test['team2_winp_last1']+1)
df_test.drop(columns=['team1_winp_last1', 'team2_winp_last1'], inplace=True) # drop intermediate columns
## avgRunsGround for the match's ground over the last 5 games, for the test data.
df_test['ground_avg_runs_last5'] = df_test.progress_apply(lambda x: \
                                  avgRunsGround(x['ground_id'], x['match_dt'], 5), axis=1)

# Roster economy (avg_economy_last_n) over the last 15 games for team1, test dataset.
df_test['team1_economy'] = df_test.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Same for team2's roster.
df_test['team2_economy'] = df_test.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)


# Taking ratio of (economy in last 15 games for team1) to (economy in last 15 games for team2). Adding 1 to handle divide by zero exceptions.
df_test['team_avg_economy_last15'] = (df_test['team1_economy']+1)/(df_test['team2_economy']+1)
df_test.drop(columns=['team1_economy','team2_economy'], inplace=True) # dropping intermediate columns
# Roster economy over the last 5 games for team1, test dataset.
df_test['team1_economy'] = df_test.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team1_roster_ids'], date=x['match_dt'], n=5), axis=1)
# Same for team2's roster.
df_test['team2_economy'] = df_test.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team2_roster_ids'], date=x['match_dt'], n=5), axis=1)


# Taking ratio of (economy in last 5 games for team1) to (economy in last 5 games for team2). Adding 1 to handle divide by zero exceptions.
df_test['team_avg_economy_last5'] = (df_test['team1_economy']+1)/(df_test['team2_economy']+1)
df_test.drop(columns=['team1_economy','team2_economy'], inplace=True) # dropping intermediate columns
# NOTE(review): despite the '*_wickets' column names, both calls below use avg_economy_last_n (n=15), so this
# feature is built from economy, not wickets. The training-set code makes the same call; if wickets are
# intended, switch both places together and retrain.
df_test['team1_wickets'] = df_test.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Same computation for team2's roster.
df_test['team2_wickets'] = df_test.progress_apply(lambda x: \
            avg_economy_last_n(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)


# Ratio of team1's value to team2's value; 1 is added to both sides to guard against a zero denominator.
df_test['team_avg_wickets_last15'] = (df_test['team1_wickets']+1)/(df_test['team2_wickets']+1)
df_test.drop(columns=['team1_wickets','team2_wickets'], inplace=True) # dropping intermediate columns


In [None]:
df_test.columns

In [None]:
# Per-roster outputs of topThreeBatsmenStrikeRates / topThreeBowlersEconomy over a 15-game window, test dataset.
df_test['best_strikerate1']=df_test.progress_apply(lambda x: topThreeBatsmenStrikeRates(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_test['best_strikerate2']=df_test.progress_apply(lambda x: topThreeBatsmenStrikeRates(x['team2_roster_ids'], x['match_dt'], 15), axis=1)
df_test['best_economy1']=df_test.progress_apply(lambda x: topThreeBowlersEconomy(x['team1_roster_ids'], x['match_dt'], 15), axis=1)
df_test['best_economy2']=df_test.progress_apply(lambda x: topThreeBowlersEconomy(x['team2_roster_ids'], x['match_dt'], 15), axis=1)

In [None]:
df_train.columns

In [None]:
# Last-result flag per side from won_last_game: 1 = won the previous match, -1 = did not, 0 = no earlier match.
df_test['won_last_game_1'] = df_test.apply(lambda x: won_last_game(x['team1'], x['team1_id'],x['match_dt']) , axis=1)
df_test['won_last_game_2'] = df_test.apply(lambda x: won_last_game(x['team2'], x['team2_id'],x['match_dt']),axis =1)
# NOTE(review): the two '*_at_ground*' columns are also filled by won_last_game (no ground_id is passed), so they
# duplicate the two columns above. The training-set code does the same; change both places together if the
# at-ground variant is intended.
df_test['won_last_game_at_ground1'] = df_test.apply(lambda x: won_last_game(x['team1'], x['team1_id'],x['match_dt']) , axis=1)
df_test['won_last_game_at_ground2'] = df_test.apply(lambda x: won_last_game(x['team2'], x['team2_id'],x['match_dt']),axis =1)
# Remove roster strings, descriptive text and identifier columns from the test frame.
df_test.drop(
    columns=[
        'team1_roster_ids', 'team2_roster_ids', 'venue', 'city', 'match_dt',
        'lighting', 'series_name', 'ground_id', 'match id', 'season',
    ],
    inplace=True,
)
# 1 when the 'toss winner' value equals team1's name, else 0; derived before the name columns are dropped.
df_test['toss_winner'] = (df_test["toss winner"] == df_test["team1"]) * 1
df_test.drop(
    columns=['toss decision', 'team1', 'team2', 'team1_id', 'team2_id', 'toss winner'],
    inplace=True,
)
# average_team1only_avg_runs_last15 = np.mean(df_test.team1only_avg_runs_last15)
# average_ground_avg_runs_last15 = np.mean(df_test.ground_avg_runs_last15)
# df_test['team1only_avg_runs_last15'].fillna(average_team1only_avg_runs_last15 ,inplace = True)
# df_test['ground_avg_runs_last15'].fillna(average_ground_avg_runs_last15,inplace = True)



In [None]:
# Replace remaining NaNs with the test frame's own column means, in place.
# NOTE(review): the training frame was imputed with training-frame means — confirm that using test-set statistics here is intended.
df_test.fillna(df_test.mean(),inplace=True)

In [None]:
df_train.columns

In [None]:
df_test

In [None]:
# Feature columns, in the order in which they are passed to the model for prediction.
desired_columns = ['team_count_50runs_last15', 'team_winp_last5',
       'team1_winp_team2_last15', 'ground_avg_runs_last15',
       'best_batsman1', 'best_batsman2', 'best_bowler1', 'best_bowler2',
       'runsratio', 'wicketsratio', 'best_strikerate1', 'best_strikerate2',
       'best_economy1', 'best_economy2', 'ratio_avg_runs_last_15',
       'ratio_strike_rate', 'ratio_avg_runs_last_5',
       'team_count_30runs_last15', 'team1_winp_team2_last5',
       'team_winp_last5_versus', 'team_winp_last15', 'team_winp_last1',
       'ground_avg_runs_last5', 'team_avg_economy_last15',
       'team_avg_economy_last5', 'team_avg_wickets_last15', 'won_last_game_1',
       'won_last_game_2', 'won_last_game_at_ground1',
       'won_last_game_at_ground2', 'toss_winner']

# Keep only these columns of the test frame, in this order.
# NOTE(review): reindex silently creates an all-NaN column for any name missing from df_test,
# and this runs after the fillna step, so a misspelt or missing feature would reach the model as NaN.

df_test = df_test.reindex(columns=desired_columns)

In [None]:
# Class probabilities for the test rows; column 1 is later used as the score for team1 winning.
test_preds = model.predict_proba(df_test)

In [None]:
test_preds

In [None]:
# Empty submission table: ids, prediction, model metadata and seven feature slots; rows are appended below.
df_submission = pd.DataFrame(columns = ['match id','dataset_type','win_pred_team_id','win_pred_score(for win_pred_team_id)','train_algorithm','Ensemble?(if yes, then comma separated train_algo)','train_hps_trees','train_hps_depth','train_hps_lr','indep_feat_id1',
                                        'indep_feat_id2','indep_feat_id3','indep_feat_id4','indep_feat_id5','indep_feat_id6','indep_feat_id7'])

In [None]:
df_submission

In [None]:
# Append one row per test match under labels 0..len(test_preds)-1. The predicted team id is a placeholder here
# and the stored score is test_preds[i][1] (probability of class 1).
# NOTE(review): 'team_count_50runs_last15' fills both indep_feat_id3 and indep_feat_id5 — confirm whether a
# different feature was meant for one of the two slots.
for i in range(len(test_preds)):
  df_submission.loc[i] = [df_test_copy['match id'].iloc[i],'r1','placeholder',test_preds[i][1],'lightboost','no','9','4','0.1',df_test['won_last_game_1'].iloc[i],df_test['won_last_game_2'].iloc[i],
                          df_test['team_count_50runs_last15'].iloc[i],df_test['team_winp_last5'].iloc[i],df_test['team_count_50runs_last15'].iloc[i],df_test['team1_winp_team2_last15'].iloc[i],
                         df_test['ground_avg_runs_last15'].iloc[i] ]



In [None]:
# Turn the stored team1-win score into a predicted team id for every test row: team1 when the score exceeds
# 0.5, otherwise team2. The ids are written with one .loc assignment; the chained form
# df_submission[col][i] = ... may write to a temporary copy and does nothing under pandas copy-on-write.
# NOTE(review): the score column keeps the team1 probability even when team2 is predicted — confirm whether
# it should be the probability of the predicted team instead.
score_col = 'win_pred_score(for win_pred_team_id)'
n_test = len(test_preds)
test_rows = df_submission.index[:n_test]  # the test rows were appended first
team1_favoured = (df_submission.loc[test_rows, score_col].astype(float) > 0.5).to_numpy()
df_submission.loc[test_rows, 'win_pred_team_id'] = np.where(
    team1_favoured,
    df_test_copy['team1_id'].iloc[:n_test].to_numpy(),
    df_test_copy['team2_id'].iloc[:n_test].to_numpy(),
)

df_test['win_pred_team_id'] = df_submission['win_pred_team_id']

In [None]:
# Tag each frame with its dataset label, restore the match ids from the untouched copies of the raw data
# (assigned by index alignment), and attach the team1-win probability to the test rows.
df_train['dataset_type'] = 'train'
df_test['dataset_type'] = 'r1'
df_test['match id'] = df_test_copy['match id']
df_train['match id'] = df_train_copy['match id']
df_test['win_pred_score'] = test_preds[:,1]

In [None]:
# Append one row per training match under labels 271, 272, ...
# NOTE(review): 271 is hard-coded, presumably the number of test rows — confirm it equals len(test_preds),
# otherwise test rows are overwritten or a gap is left.
# NOTE(review): 'team_count_50runs_last15' fills two feature slots here as well.
for i in range(len(train_preds)):
  df_submission.loc[271+i] = [df_train_copy['match id'].iloc[i],'train','placeholder',train_preds[i][1],'lightboost','no','9','4','0.1',df_train['won_last_game_1'].iloc[i],df_train['won_last_game_2'].iloc[i],df_train['team_count_50runs_last15'].iloc[i],df_train['team_winp_last5'].iloc[i],df_train['team_count_50runs_last15'].iloc[i],df_train['team1_winp_team2_last15'].iloc[i],
                         df_train['ground_avg_runs_last15'].iloc[i]]




In [None]:
# Placeholder column, filled per row further down. It starts as integer 0, while the scores written into it are floats.
df_train['win_pred_score'] = 0

In [None]:
# Placeholder column for the predicted winner's team id, filled per row further down.
df_train['win_pred_team_id'] = 0

In [None]:
# Derive the predicted team id for every training row and copy id and score back onto df_train.
# The training rows sit in df_submission under labels 271, 272, ..., so their scores are read from those
# labels; the earlier loop read label i, which is a test row. Values are written with single .loc / column
# assignments, because chained df[col][i] = ... may write to a temporary copy.
score_col = 'win_pred_score(for win_pred_team_id)'
n_train = len(train_preds)
train_rows = [271 + i for i in range(n_train)]
train_scores = df_submission.loc[train_rows, score_col].astype(float).to_numpy()
predicted_ids = np.where(
    train_scores > 0.5,  # score is the team1-win probability
    df_train_copy['team1_id'].iloc[:n_train].to_numpy(),
    df_train_copy['team2_id'].iloc[:n_train].to_numpy(),
)
df_submission.loc[train_rows, 'win_pred_team_id'] = predicted_ids
df_train['win_pred_team_id'] = predicted_ids
df_train['win_pred_score'] = train_scores

In [None]:
model

In [None]:
# Refit the current model on the feature columns only: the bookkeeping columns added for the submission
# files are removed first, then 'target' is split off as the label.
_bookkeeping_cols = ['win_pred_team_id','win_pred_score','dataset_type','match id']
_train_frame = df_train.drop(_bookkeeping_cols, axis = 1)
model.fit(_train_frame.drop('target', axis = 1), _train_frame.target)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
df_train

In [None]:
# XGBoost model used for the feature-importance table. The minimum split gain is passed as
# min_split_loss (XGBoost's name, alias gamma); min_split_gain is LightGBM's name for this setting and is
# not an XGBoost parameter, so it was not applied.
model = XGBClassifier(n_estimators=9,reg_alpha=0,max_depth=4,colsample_bytree=0.6,min_split_loss=5,learning_rate=0.1)

In [None]:
# Fit on the feature columns only (bookkeeping columns and the label are excluded from X).
model.fit(df_train.drop(['win_pred_team_id','win_pred_score','dataset_type','match id','target'],axis=1),df_train.target)

In [None]:
# Pair every model input column with its importance and keep the ten most important, ranked from 0.
_feature_names = df_train.drop(['win_pred_team_id','win_pred_score','dataset_type','match id','target'],axis = 1).columns.tolist()
_importances = pd.DataFrame({'feat_name': _feature_names, 'model_feat_imp_train': model.feature_importances_})
df_feat_importance = (
    _importances
    .sort_values(by='model_feat_imp_train', ascending=False)
    .reset_index(drop=True)
    .head(10)
)
df_feat_importance

In [None]:
# Human-readable description for each feature name, mapped onto the feature-importance table.
# A feature without an entry here gets a NaN description there.
feature_desc = {
    'team_count_50runs_last15': "Ratio of number of 50s by players in team1 to number of 50s by players in team2 in last 15 games",
    'team_winp_last5': "Ratio of team1's win % to team2's win % in last 5 games",
    'ground_avg_runs_last15': "average runs scored in the ground in last 15 games",
    'team1_winp_team2_last15': "Team1's win percentage against Team2 in last 15 games",
    'team1only_avg_runs_last15': "team1's avg inning runs in last 15 games",
    'won_last_game_2': "Whether team2 won their last game",
    'won_last_game_1': "Whether team1 won their last game",
    'ratio_avg_runs_last_15': "Ratio of team1's avg runs to team2's avg runs in last 15 games",
    # the next two features are model inputs that previously had no description
    'ratio_avg_runs_last_5': "Ratio of team1's avg runs to team2's avg runs in last 5 games",
    'ratio_strike_rate': "Ratio of team1's avg strike rate to team2's avg strike rate in last 15 games",
    'team_winp_last15': "Ratio of team1's win % to team2's win % in last 15 games",
    'team_count_30runs_last15': "Ratio of number of 30s by players in team1 to number of 30s by players in team2 in last 15 games",
    'team_avg_wickets_last15': "Ratio of team1's avg wickets to team2's avg wickets in last 15 games",
    'ground_avg_runs_last5': "average runs scored in the ground in last 5 games",
    'team_avg_economy_last15': "Ratio of team1's avg economy to team2's avg economy in last 15 games",
    'toss_winner': "1 if team1 won the toss",
    'toss_winner_01': "Toss winner to numerical - 1 if team2 wins, else 0",
    'toss_decision_01': "Toss decision - categorical - 1 if winner bats, 0 otherwise",
    'beststrikerate1': "Best batsman's strike rate in last 15 games",
    'beststrikerate2': "Best bowler's strike rate in last 15 games",
    'boundary_runs_scored1': "Number of boundary runs scored by team1 in last 5 games",
    'boundary_runs_scored2': "Number of boundary runs scored by team2 in last 5 games",
    'wicketsratio': "Ratio of team1's wickets to team2's wickets in last 15 games",
    # the *1 / *2 pairs are computed from team1's and team2's roster respectively
    'best_batsman1': "Team1's best batsman's runs in last 15 games",
    'best_batsman2': "Team2's best batsman's runs in last 15 games",
    'runsratio': "ratio of team 1s runs in last 15 games to team2s runs",
    'boundary_runs_percentage1': "Percentage of boundary runs scored by team1 in last 5 games",
    'boundary_runs_percentage2': "Percentage of boundary runs scored by team2 in last 5 games",
    'best_strikerate1': "Team1's best batsman's strike rate in last 15 games",
    'best_strikerate2': "Team2's best batsman's strike rate in last 15 games",
    'boundary_runs_scoredratio_last15': "Ratio of team1's boundary runs scored to team2's boundary runs scored in last 15 games",
    'boundary_runs_percentage_ratio_last15': "Ratio of team1's boundary runs percentage to team2's boundary runs percentage in last 15 games",
    'best_bowler1': "Team1's best bowler's wickets in last 15 games",
    'best_bowler2': "Team2's best bowler's wickets in last 15 games",
    'runsratio_last20': "Ratio of team1's runs to team2's runs in last 20 games",
    'economy_ratio': "Ratio of team1's economy to team2's economy in last 15 games",
    'team1_economy': "team1's economy in last 15 games",
    'team2_economy': "team2's economy in last 15 games",
    'team1_wickets': "team1's wickets in last 15 games",
    'team2_wickets': "team2's wickets in last 15 games",
    'team_avg_economy_last5': "Ratio of team1's avg economy to team2's avg economy in last 5 games",
    'team1_winp_team2_last5': "Team1's win percentage against Team2 in last 5 games",
    'team_winp_last1': "Ratio of team1's win % to team2's win % in last 1 games",
    'boundary_runs_concededratio_last15': "Ratio of team1's boundary runs conceded to team2's boundary runs conceded in last 15 games",
    'boundary_runs_concededratio': "Ratio of team1's boundary runs conceded to team2's boundary runs conceded in last 5 games",
    'maidensratio': "Ratio of team1's maidens to team2's maidens in last 15 games",
    'maidensratio_last_15': "Ratio of team1's maidens to team2's maidens in last 15 games",
    'threewickethaulsratio': "Ratio of team1's three wicket haul to team2's three wicket haul in last 15 games",
    'boundary_runs_percentage_ratio': "Ratio of team1's boundary runs percentage to team2's boundary runs percentage in last 5 games",
    'strike_ratio': "Ratio of team1's strike rate to team2's strike rate in last 15 games",
    'team_winp_last5_versus': "Ratio of team1's win % to team2's win % in their last 5 games",
    'best_economy1': "Team1's best bowler's economy in last 15 games",
    'best_economy2': "Team2's best bowler's economy in last 15 games",
}

In [None]:
# Build the File2 table on top of the importance frame: a 1-based id and rank per feature (rows are already
# sorted by importance), indexed by feat_id, plus the description looked up by feature name.
df_file2 = df_feat_importance
_one_based = (df_file2.index + 1).tolist()
df_file2['feat_id'] = _one_based
df_file2['feat_rank_train'] = _one_based
df_file2 = df_file2.set_index('feat_id')
df_file2['feat_description'] = df_file2['feat_name'].map(feature_desc)

In [None]:
df_file2

In [None]:
# Turn feat_id back into a regular column so it is written out with the other columns.
df_file2.reset_index(inplace = True)

In [None]:
df_file2

In [None]:
# Write the feature-importance table to File2.csv (UTF-8, comma separated, without the row index).
df_file2.to_csv('File2.csv', sep=',', index=False, encoding='utf-8')

### file_1

In [None]:
# File 1: one row per match (test rows first, then training rows) with ids, prediction and the top features.
_id_cols = ['match id','dataset_type','win_pred_team_id','win_pred_score',]
_top_feats = list(df_feat_importance['feat_name'].head(10))
df_file1 = pd.concat([df_test[_id_cols + _top_feats], df_train[_id_cols + _top_feats]])

# Rename the feature columns to indep_feat_id1..k in importance order.
renaming_dict = {col: f'indep_feat_id{rank}' for rank, col in enumerate(_top_feats, start=1)}
df_file1.rename(columns=renaming_dict, inplace=True)

# Pad with empty columns when fewer than ten features are available.
for i in range(1,11):
    _slot = f'indep_feat_id{i}'
    if _slot not in df_file1.columns:
        df_file1[_slot] = np.nan

# Constant model metadata, repeated on every row.
df_file1['train_algorithm'] = 'lightboost'
df_file1['is_ensemble'] = 'no'
df_file1['train_hps_trees'] = 9
df_file1['train_hps_depth'] = 4
df_file1['train_hps_lr'] = 0.1

In [None]:
df_file1

In [None]:
df_file1.isna().sum()

In [None]:
# Replace every remaining NaN in the output table with 0, including any padded feature columns.
df_file1.fillna(0,inplace = True)

In [None]:
df_file1.isna().sum()

In [None]:
# Write the per-match prediction table to File_1.csv (UTF-8, comma separated, without the row index).
df_file1.to_csv('File_1.csv', sep=',', index=False, encoding='utf-8')