# Action Prediction model

## Load modules and Utility functions

In [None]:
import pandas as pd
import numpy as np
import gc
import warnings 
from sklearn.preprocessing import LabelEncoder
from IPython.display import display

warnings.filterwarnings('ignore')

## Load data & Preprocessing

### Basic preprocessing

In [None]:
from math import atan, sqrt

def reduce_memory_usage(df):
    '''
        Downcast every column of df to a smaller dtype and return df.

        df: data frame to shrink; its columns are overwritten in place

        * object / datetime64[ns] columns are converted to 'category'
        * columns that are already 'category' are left untouched, so the
          function can safely be applied twice
        * columns whose dtype name starts with 'int' get the narrowest signed
          integer type whose range contains the column's min and max
        * every other column is treated as floating point and cast to
          float16 or float32 when its min and max fit

        NOTE: for the float branch only the value *range* is checked, not the
        precision, so a cast to float16 can round the stored values.
    '''

    for col in df.columns:
        col_type = df[col].dtype

        # already categorical: .min()/.max() would raise on unordered categories
        if str(col_type) == 'category':
            continue

        if col_type in ['object', 'datetime64[ns]']:
            df[col] = df[col].astype('category')
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32, np.int64)]
        else:
            candidates = [(t, np.finfo(t)) for t in (np.float16, np.float32)]

        # bounds are inclusive: a column whose max is exactly 127 still fits int8.
        # An all-NaN / empty column compares False everywhere and is left as is.
        for target, limits in candidates:
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    return df



# load the four CSV files and drop unused columns from the game statistics
def preprocess_all(path, Goals=True):
    '''
        path: prefix of the data files; the file names are appended to it
              directly, so it has to end with a path separator
        Goals: bool, if True the Goals_scored and Goals_conceded columns are
               dropped from the train game statistics as well

        returns (Train, Test, train game statistics, test game statistics)
    '''

    # read the original data
    Train, Test, trgs, tsgs = (
        pd.read_csv(path + file_name)
        for file_name in (
            'Train.csv',
            'Test.csv',
            'train_game_statistics.csv',
            'test_game_statistics.csv',
        )
    )

    # columns that are found only in the train game statistics
    train_only = ["next_action"]
    if Goals:
        train_only += ["Goals_scored", "Goals_conceded"]
    trgs = trgs.drop(columns=train_only)

    # these columns are dropped from both game statistics tables
    shared_unused = [
        'next_player',
        'next_x',
        'next_y',
        'next_team',
        'next_event_id',
        'event_id',
        'xt_value',
    ]
    trgs = trgs.drop(columns=shared_unused)
    tsgs = tsgs.drop(columns=shared_unused)

    # shrink the dtypes of both game statistics tables
    trgs = reduce_memory_usage(trgs)
    tsgs = reduce_memory_usage(tsgs)

    return Train, Test, trgs, tsgs




# distance between a player's position and the goal
def distance_from_goal(xy, xg=0, yg=34):
    '''
        Euclidean distance between the position xy = (x, y) and the goal
        point (xg, yg); the goal point defaults to (0, 34)
    '''
    px, py = xy
    dx = px - xg
    dy = py - yg

    return (dx**2 + dy**2)**.5


# the angle under which a goal of width gs is seen from a position
def angle_width(xy, gs=7.32):
    '''
        Calculate atan(gs*x / (x**2 + y**2 - (gs/2)**2)) for xy = (x, y)
        by default the goal size gs is 7.32

        When the denominator is exactly zero the limit value is returned
        instead of raising ZeroDivisionError: +-pi/2 (sign of gs*x), or 0
        when gs*x is zero too.

        NOTE(review): the formula uses x and y as given, i.e. it measures from
        the origin, while distance_from_goal defaults to a goal at (0, 34) -
        confirm which reference point is intended. A negative denominator
        yields a negative angle; left as is so existing feature values do not
        change.
    '''
    x, y = xy

    numerator = gs*x
    denominator = x**2+y**2-(gs/2)**2

    if denominator == 0:
        # imported here because the surrounding file only imports atan and sqrt
        from math import atan2
        return atan2(numerator, 0.0)

    return atan(numerator/denominator)


# calculate the euclidean distance between two points
def calc_distance(xy1xy2):
    '''
        Distance from (x1, y1) to (x2, y2); the four coordinates are passed
        as one sequence (x1, y1, x2, y2)
    '''
    x1, y1, x2, y2 = xy1xy2
    dx = x2 - x1
    dy = y2 - y1
    return sqrt(dx**2 + dy**2)


# Function that adds features to the game statistics
def preprocess_fe(df):
    '''
        df : game statistics data frame
        returns a new, modified data frame (the argument is not changed)

        Steps: sort by Season / Game_ID / id, drop Manager and id, add
        position, neighbour (shifted), team, player, time and goal-distance
        features, fill the gaps left by shifting with -1, remove the rows
        where Shots == 1 or Passes == 1 and finally drop the pass / shot
        indicator columns.

        NOTE(review): every shift() runs over the whole sorted frame, not per
        game, so the first / last actions of a game take their "last" /
        "next" values from the neighbouring game - confirm this is intended.
    '''

    # sort the game statistics by season, then Game ID, then Action ID
    df = df.sort_values(['Season', 'Game_ID', 'id'])
    # Drop unnecessary columns: the manager and the Action ID
    df = df.drop(columns=['Manager', 'id'])


    # Feature Engineering

    # distance from goal and angle view from (X, Y)
    df['dist_from_goal'] = list(map(distance_from_goal, zip(df['X'], df['Y'])))
    df['angle_width'] = list(map(angle_width, zip(df['X'], df['Y'])))
    df['goal_prob'] = 1 / (1 + np.exp(-3.9 + 3.54*df['angle_width']))

    # X
    df['last x'] = df.X.shift(1)   # last action's X position
    df['last2 x'] = df.X.shift(2)  # X position two actions back
    df['next x'] = df.X.shift(-1)  # next action's X position
    df['next2 x'] = df.X.shift(-2)  # X position two actions ahead

    # Y
    df['last y'] = df.Y.shift(1)   # last action's Y position
    df['last2 y'] = df.Y.shift(2)  # Y position two actions back
    df['next y'] = df.Y.shift(-1)  # next action's Y position
    df['next2 y'] = df.Y.shift(-2)  # Y position two actions ahead

    # Team: 1 when the neighbouring action belongs to the same team, else 0
    df['prev_same_team'] = (df['Team'].shift(1) == df.Team).astype(int)
    df['next_same_team'] = (df['Team'].shift(-1) == df.Team).astype(int)

    # Player: 1 when the neighbouring action belongs to the same player, else 0
    df['prev_same_player'] = (df['Player_ID'].shift(1) == df.Player_ID).astype(int)
    df['next_same_player'] = (df['Player_ID'].shift(-1) == df.Player_ID).astype(int)

    # Time
    df['prev_time_diff'] = df['Start_minutes'] - df['Start_minutes'].shift(1) # start time difference to the previous action
    df['event_time'] = df.End_minutes - df.Start_minutes                      # how long the action took

    # Goal dist
    df['last_goal_dist'] = df['dist_from_goal'].shift(1)   # last action's distance from the goal
    df['next_goal_dist'] = df['dist_from_goal'].shift(-1)  # next action's distance from the goal


    # Missing values
    ncols = ['last x', 'last2 x', 'next2 x', 'next x', 'last y', 'last2 y',
        'next2 y', 'next y', 'prev_time_diff', 'last_goal_dist',
        'next_goal_dist']
    # the shifted values at the edges do not exist, fill them with -1.
    # Assigning the result back (instead of a chained inplace fillna) also
    # works when pandas Copy-on-Write is enabled.
    df[ncols] = df[ncols].fillna(-1)


    # the pass and shot actions are not needed as rows since those columns are given to us
    df = df[~((df.Shots == 1)|(df.Passes == 1))]

    # drop the unnecessary columns (returns a new frame, no inplace edit of a slice)
    df = df.drop(columns=['Accurate passes', 'Inaccurate passes', 'Shots', 'SoT', 'Passes'])

    return df



In [None]:
path = '/content/drive/MyDrive/Landuma/'  # change your path here

# bring the files: match tables plus both game statistics tables
Train, Test, train_gs, test_gs = preprocess_all(path)

# run the feature engineering on both game statistics tables (train first)
train_gs, test_gs = map(preprocess_fe, (train_gs, test_gs))

train_gs.shape, test_gs.shape

### Label Encoding

In [None]:
le = LabelEncoder()
game_map = None  # code -> original game id
team_map = None  # code -> original team

# stack train and test so both get the same codes; `train` marks the origin
df = pd.concat([train_gs.assign(train=1), test_gs.assign(train=0)])

# encode every string / categorical column except the target column
for c in df.select_dtypes(['object', 'category']).columns.drop('Action'):
    df[c] = le.fit_transform(df[c])
    if c == "Game_ID":
        game_map = dict(enumerate(le.classes_))
    elif c == 'Team':
        team_map = dict(enumerate(le.classes_))

# split back into train and test; the test part has no use for Action
train_gs = df[df.train.eq(1)].drop(columns='train')
test_gs = df[df.train.eq(0)].drop(columns=['train', 'Action'])

In [None]:
# Limit the action types

# merge the "with shots" variant into the plain positional attacks
train_gs['Action'] = train_gs.Action.str.replace("Positional attacks with shots", "Positional attacks")

# remove the rows of three action types from the training data
excluded_actions = ['Extra attacking pass', 'Inaccurate extra attacking pass', 'Not forced mistake']
train_gs = train_gs[~train_gs.Action.isin(excluded_actions)]

train_gs.Action.nunique()

In [None]:
# encode the target column; le is refitted, so le.classes_ now holds the action names
train_gs['Action'] = le.fit_transform(train_gs['Action'])
action_map = dict(enumerate(le.classes_))                          # code -> action name
reverse_map = {name: code for code, name in action_map.items()}   # action name -> code

def get_action_num(action):
    '''
        Return the integer code of an action name (KeyError if unknown)
    '''
    return reverse_map[action]

## Modeling & Prediction

### Separate the seasons

In [None]:
# restart both indexes at 0 so that row labels equal row positions
train_gs, test_gs = (frame.reset_index(drop=True) for frame in (train_gs, test_gs))

# one training frame per season; Season itself is not used as a feature
train_gs_s1 = train_gs.loc[train_gs['Season'] == 1].drop(columns='Season')
train_gs_s2 = train_gs.loc[train_gs['Season'] == 2].drop(columns='Season')
test_gs = test_gs.drop(columns='Season')
train_gs = train_gs.drop(columns='Season')

train_gs_s1.shape, train_gs_s2.shape, test_gs.shape

### Undersampling

In [None]:
import imblearn.under_sampling as us

def resample(X, y, ty='valid'):
    '''
        Randomly under-sample the frequent action classes.

        X : feature data frame
        y : encoded Action column belonging to X
        ty : unused, kept so existing calls keep working

        returns the resampled (X, y) with y encoded again

        y is decoded with action_map, capped per class and encoded back with
        reverse_map. A cap is never larger than the number of rows available
        and classes missing from y are skipped, so a season without e.g. an
        'Own goal' no longer fails with KeyError / ValueError.
    '''
    y = y.map(action_map)
    vc = y.value_counts()

    # upper limit of rows kept per class
    caps = {
        'Positional attacks': 5000,  'Counter-attacks': 2500,
        'Lost balls': 2500, 'Challenges lost': 2500,
        'Challenges won': 2500, 'Picking-ups': 1000,
        'Interceptions': 1000, 'Passes into the penalty box': 1000,
        'Corner attacks': 1000, 'Air challenges lost': 1000,
        'Air challenges won': 1000, 'Free-kick attacks': 1000,
        'Opp half pick-ups': 1000, 'Dribbling': 1000,
        'Successful tackles': 800, 'Opp half lost balls': 800,
        'Fouls': 800,  'Successful dribbles': 800,
        'Unsuccessful tackles': 800, 'Bad ball control': 800,
        'Inaccurate crosses': 800, 'Unsuccessful dribbles': 800,
        'Throw-in attacks': 800, 'Opp half interceptions': 800,
        'Goal kicks': 800, 'Wide shot (Goalkeepers)': 800,
        'Wide shot': 800, 'Errors': 800,
        'Inaccurate key passes': 600,  'Accurate crosses': 600,
        'Shot on target (saved)': 600, 'Shot on target': 600,
        'Supersaves': 600, 'Accurate key passes': 600,
        'Offsides': 600, 'Errors leading to goal': 600,
    }
    # classes whose rows are all kept
    keep_all = [
        'Goals conceded', 'Goals', 'Assists', 'Bar/Post shots',
        'Penalty', 'Penalty attack', 'Own goal',
    ]

    # same key order as before: capped classes first, then the fully kept ones
    stra = {}
    for name, cap in caps.items():
        available = int(vc.get(name, 0))
        if available:
            stra[name] = min(cap, available)
    for name in keep_all:
        available = int(vc.get(name, 0))
        if available:
            stra[name] = available

    sampler = us.RandomUnderSampler(random_state=21, sampling_strategy=stra)

    X, y = sampler.fit_resample(X, y)
    y = y.map(reverse_map)

    return X, y


### Model

In [None]:
from xgboost import XGBClassifier

def get_model():
    '''
        Return a new, unfitted XGBClassifier with 50 estimators and a fixed seed
    '''
    params = {'n_estimators': 50, 'random_state': 21}
    return XGBClassifier(**params)

### Training and prediction

In [None]:
# integer codes of the three goal related actions
goals_related = list(map(get_action_num, ('Goals', 'Goals conceded', 'Own goal')))

#### Training on season 1, prediction on season 2 & season 3

In [None]:
# training data: under-sampled season 1
X, y = resample(train_gs_s1.drop(columns='Action'), train_gs_s1['Action'], 'valid')

# prediction data: every season 2 row without the target
val = train_gs_s2.drop(columns='Action')

# evaluation data: only the goal related season 2 rows
gr_inx = train_gs_s2.index[train_gs_s2['Action'].isin(goals_related)]
eval_x = val.loc[gr_inx]
eval_y = train_gs_s2.loc[gr_inx, 'Action']

In [None]:
# Fit a fresh classifier on X, y; the eval_set holds eval_x / eval_y prepared
# in the previous cell.
model = get_model()
model.fit(X, y, eval_set=[(eval_x, eval_y)], verbose=0)
preds2 = model.predict_proba(val)  # class probabilities for every row of val
test_pred1 = model.predict_proba(test_gs.fillna(-1))  # test rows, missing values replaced by -1

In [None]:
# Drop the fitted model and run a garbage collection before the next fit.
del model
gc.collect()

#### Training on season 2, prediction on season 1 & season 3

In [None]:
# training data: under-sampled season 2
X, y = resample(train_gs_s2.drop(columns='Action'), train_gs_s2['Action'], 'valid')

# prediction data: every season 1 row without the target
val = train_gs_s1.drop(columns='Action')

# evaluation data: only the goal related season 1 rows
gr_inx = train_gs_s1.index[train_gs_s1['Action'].isin(goals_related)]
eval_x = val.loc[gr_inx]
eval_y = train_gs_s1.loc[gr_inx, 'Action']

In [None]:
# Fit a fresh classifier on X, y; the eval_set holds eval_x / eval_y prepared
# in the previous cell.
model = get_model()
model.fit(X, y, eval_set=[(eval_x, eval_y)], verbose=0)
preds1 = model.predict_proba(val)  # class probabilities for every row of val
test_pred2 = model.predict_proba(test_gs.fillna(-1))  # test rows, missing values replaced by -1

In [None]:
# Drop the fitted model and run a garbage collection.
del model
gc.collect()

## Post processing

In [None]:
from tqdm import tqdm

# short names for the four action labels used by the post processing
A, G, C, O = 'Assists', 'Goals', 'Goals conceded', 'Own goal'

# probability written for a corrected goal / goal conceded prediction
TH = 0.95

class PostProcess:

    '''
        Post process the predicted actions of the game statistics

        Only the rows predicted as Assists, Goals, Goals conceded or Own goal
        are kept. Rows whose index values are at most 3 apart are put into one
        group and the goal / goal conceded probabilities of a group are
        rewritten by a few fixed rules.

        #Parameters
        game_statics : the preprocessed game statistics data frame, must
                       contain the Game_ID and Team columns
        prediction : model predictions, a 2D array with one row per row of
                     game_statics and one column per action code
        action_map : dictionary mapping an action code to the action name
    '''

    def __init__(self, game_statics, prediction, action_map):
        # positions 0..n-1 as index so that it lines up with the prediction rows
        self.df = game_statics[['Game_ID', 'Team']].reset_index(drop=True)
        self.prediction = prediction
        self.action_map = action_map
        self._initialize()


    def _initialize(self):
        '''
            Add the predicted label and both probabilities to self.df and
            keep only the goal related predictions
        '''
        reverse_map = {name: code for code, name in self.action_map.items()}
        gl_inx = reverse_map['Goals']
        gc_inx = reverse_map['Goals conceded']
        self.df['pred'] = pd.Series(np.argmax(self.prediction, axis=1)).map(self.action_map)
        self.df['goal_prob'] = self.prediction[:, gl_inx]
        self.df['conced_prob'] = self.prediction[:, gc_inx]

        # fetch only where the model predicts Assists, Goals, Goals conceded or Own goal;
        # copy so that the later .loc writes go to an independent frame
        self.df = self.df[self.df.pred.isin([A, G, C, O])].copy()


    def postprocess(self, show_unabel=True):

        '''
            Run the 1, 2, 3 group prediction checks to improve the model's performance

            show_unabel : if True print how many groups could not be fixed
                          followed by those groups
            returns self.df without the pred column
        '''
        group_indices = self._get_groups() # fetch closer goal related predictions
        group_indices2 = {} # groups that are not modified, keyed by the type returned by _is_okay
        for val in tqdm(group_indices.values()):
            flag, ptype = self._is_okay(val)
            if not flag:
                group_indices2.setdefault(ptype, []).append(val)
        if show_unabel:
            tot = self._count_groups(group_indices2)
            if tot == 0:
                print("All are corrected")
            else:
                print(f"{tot} number of groups unfixed")
            print(group_indices2)

        return self.df.drop(columns='pred')


    @staticmethod
    def _count_groups(groups_by_type):
        '''
            Total number of groups stored in a {type: [group, ...]} dictionary
        '''
        return sum(len(groups) for groups in groups_by_type.values())


    def _get_groups(self): # depending on their index in the data fetch the closer ones
        '''
            Returns {group number: [(index, pred, goal_prob, conced_prob, team), ...]}
            A row joins the current group when its index is at most 3 away
            from the index of the previous row, otherwise it opens a new group.
        '''
        group_indices = {}
        num = -1
        curr_ind = -4 # more than 3 away from index 0, so the first row opens group 0
        rows = zip(
            self.df.index, self.df['pred'], self.df['goal_prob'],
            self.df['conced_prob'], self.df['Team']
        )
        for val in rows:
            ind = val[0]
            if abs(curr_ind - ind) <= 3:
                group_indices[num].append(val)
            else:
                num += 1
                group_indices[num] = [val]
            curr_ind = ind

        return group_indices


    def _corr_3probs(self, preds, teams):
        '''
            (goal_prob, conced_prob) pairs for the three rows of a group
            The two rows of the same team get (0, 0) and (TH, 0), the row of
            the other team gets (0, TH)

            preds is not used
            raises Exception when all three teams are different
        '''
        if teams[0] == teams[1]:
            return [(0, 0), (TH, 0), (0, TH)]
        elif teams[1] == teams[2]:
            return [(0, TH), (TH, 0), (0, 0)]
        elif teams[0] == teams[2]:
            return [(0, 0), (0, TH), (TH, 0)]
        else:
            raise Exception("Atleast two teams must be similar")


    def _corr_2probs(self, gprobs, cprobs):
        '''
            (goal_prob, conced_prob) pairs for the two rows of a group
            The pairing "first row scores, second row concedes" is chosen when
            the sum of its probabilities is at least as high as the sum of the
            opposite pairing; every chosen probability is set to TH/2
        '''

        fr_inx = gprobs[0] + cprobs[1]
        sc_inx = gprobs[1] + cprobs[0]

        if fr_inx >= sc_inx:
            return [(TH/2, 0), (0, TH/2)]
        else:
            return [(0, TH/2), (TH/2, 0)]


    def _is_okay(self, conn_ind):
        '''
            Check whether the predictions of one group are consistent and
            rewrite the probabilities in self.df where a rule applies

            conn_ind : list of (index, pred, goal_prob, conced_prob, team)
            returns (True, None) when the group is accepted or was fixed,
            otherwise (False, group type); the group type is the group size
            for 2, 3 and 4 rows and False for bigger groups

            The comparisons with sorted(preds) rely on the alphabetical order
            of the labels: Assists < Goals < Goals conceded < Own goal
        '''
        indces = [i[0] for i in conn_ind] # indices
        preds = [i[1] for i in conn_ind] # predictions
        gprobs = [i[2] for i in conn_ind] # goal probabilities
        cprobs = [i[3] for i in conn_ind] # goals conceded probabilities
        teams = [i[-1] for i in conn_ind] # team

        if len(conn_ind) == 1: # single group
            # a single prediction: both probabilities are set to 0
            self.df.loc[indces[0], "goal_prob"] = 0
            self.df.loc[indces[0], "conced_prob"] = 0

            return True, None

        elif len(conn_ind) == 2: # two group
            if sorted(preds) == [G, C]:
                # one goal and one goal conceded: kept as predicted
                return True, None

            elif C in preds and teams[0] != teams[1]:
                # the other row gets the probabilities of the conceded row, swapped
                n = preds.index(C)
                self.df.loc[indces[int(not n)], "goal_prob"] =  cprobs[n]
                self.df.loc[indces[int(not n)], "conced_prob"] = gprobs[n]

                return True, None

            elif teams[0] != teams[1]:
                for i, (gl, cn) in enumerate(self._corr_2probs(gprobs, cprobs)):
                    self.df.loc[indces[i], "goal_prob"] = gl
                    self.df.loc[indces[i], "conced_prob"] = cn

                return True, None

            else:
                return False, 2

        elif len(conn_ind) == 3: # three group
            if sorted(preds) == [A, G, C]:
                return True, None

            elif (preds.count(C) > 1 or preds.count(G) > 1 \
                or sorted(preds) == [G, C, O] or sorted(preds)== [A, A, C]\
                or sorted(preds) == [G, O, O]) and len(set(teams))==2:
                # exactly two different teams, so _corr_3probs can not raise here
                for i, (gl, cn) in enumerate(self._corr_3probs(preds, teams)):
                    self.df.loc[indces[i], "goal_prob"] = gl
                    self.df.loc[indces[i], "conced_prob"] = cn

                return True, None
            else:
                return False, 3
        elif len(conn_ind) == 4:
            return False, 4
        else:
            return False, False



In [None]:
# from postprocess import PostProcess

# out-of-fold predictions: the season 1 rows first, then the season 2 rows,
# the same row order train_gs has after it was sorted by Season
train_pred = np.concatenate((preds1, preds2))
# the test prediction is the equally weighted mean of both models
test_pred = 0.5 * test_pred1 + 0.5 * test_pred2

tr_pp = PostProcess(train_gs, train_pred, action_map)
ts_pp = PostProcess(test_gs, test_pred, action_map)

train_df, test_df = (pp.postprocess(show_unabel=False) for pp in (tr_pp, ts_pp))

train_df.shape, test_df.shape

### Restore the original Game ID and Team values

In [None]:
# turn the integer codes back into the original game ids and team names
for frame in (train_df, test_df):
    frame['Game_ID'] = frame['Game_ID'].map(game_map)
for frame in (train_df, test_df):
    frame['Team'] = frame['Team'].map(team_map)

train_df

### Aggregation of the probabilities by sum

In [None]:
# sum both probabilities per game and team (grouped by Game ID, then Team)
trgpby = train_df.groupby(['Game_ID', 'Team']).agg(
    goal_prob_sum=pd.NamedAgg(column='goal_prob', aggfunc='sum'),
    goal_prob_conced_sum=pd.NamedAgg(column='conced_prob', aggfunc='sum'),
)
train_goal_prob = trgpby['goal_prob_sum']
train_goal_conced_prob = trgpby['goal_prob_conced_sum']
train_goal_prob

In [None]:
# sum both probabilities per game and team (grouped by Game ID, then Team)
tsgpby = test_df.groupby(['Game_ID', 'Team']).agg(
    goal_prob_sum=pd.NamedAgg(column='goal_prob', aggfunc='sum'),
    goal_prob_conced_sum=pd.NamedAgg(column='conced_prob', aggfunc='sum'),
)
test_goal_prob = tsgpby['goal_prob_sum']
test_goal_conced_prob = tsgpby['goal_prob_conced_sum']
test_goal_prob

### Add the aggregation to the Train and Test data

In [None]:
# function to get the probability sum values from the aggregated values
def add_prob(df, gp, gcp):
    '''
        Add the aggregated goal probabilities of the home and the away team
        to every row of df

        df : data frame with the Game_ID, Home Team and Away Team columns;
             the new columns are written into df itself
        gp : Series of summed goal probabilities indexed by (Game_ID, Team)
        gcp : Series of summed goal conceded probabilities indexed by
              (Game_ID, Team)

        returns df

        A (game, team) pair without an entry in gp / gcp gets 0. The lookup
        goes through plain dictionaries, so no exception has to be swallowed
        and no row has to be fetched with .loc.
    '''
    gp_lookup = gp.to_dict()    # {(game id, team): probability sum}
    gcp_lookup = gcp.to_dict()

    home_keys = list(zip(df['Game_ID'], df['Home Team']))
    away_keys = list(zip(df['Game_ID'], df['Away Team']))

    df['Home_goal_prob'] = [gp_lookup.get(key, 0) for key in home_keys]
    df['Home_goal_conced_prob'] = [gcp_lookup.get(key, 0) for key in home_keys]
    df['Away_goal_prob'] = [gp_lookup.get(key, 0) for key in away_keys]
    df['Away_goal_conced_prob'] = [gcp_lookup.get(key, 0) for key in away_keys]

    # home minus away differences and their sum
    df['Diff_goal_prob'] = df['Home_goal_prob'] - df['Away_goal_prob']
    df['Diff_goal_conced_prob'] = df['Home_goal_conced_prob'] - df['Away_goal_conced_prob']
    df['Diff_goalpr_conced'] = df['Diff_goal_prob'] + df['Diff_goal_conced_prob']

    return df

# attach the aggregated probabilities to the match tables
Train = add_prob(Train, gp=train_goal_prob, gcp=train_goal_conced_prob)
Test = add_prob(Test, gp=test_goal_prob, gcp=test_goal_conced_prob)

display(Train.head(5), Test.head(2))

### Export the new data

In [None]:
# write both extended tables next to the input files, without the index column
for frame, file_name in ((Train, 'Train_modified.csv'), (Test, 'Test_modified.csv')):
    frame.to_csv(path + file_name, index=False)