In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [64]:
# Load the featured-matches table; the cells below add their feature columns
# to this same frame in place.
df_original = pd.read_csv(
    filepath_or_buffer='../data/Wimbledon_featured_matches_modified.csv',
)

In [65]:
# Hyperparameters read as globals by add_feature and the fatigue cell below.
# Number of preceding rows aggregated into each rolling feature.
last_n = 5
# Per-position weights for that window: add_feature reads weights_last_n[j],
# where j = 0 is the oldest row of the window and j = last_n - 1 the most recent.
# NOTE(review): these weights sum to 0.7 rather than 1 — confirm that is intended.
weights_last_n = pd.Series([0.05, 0.05, 0.1, 0.2, 0.3])

In [66]:
def add_feature(name, df, weights, normalized=None, window=None, window_weights=None):
    """Add a recency-weighted rolling feature column ``name`` to ``df`` in place.

    A per-row score ``sum(df[col] * w for col, w in weights.items())`` is formed
    first. The feature at row ``i`` is then the weighted sum of the scores of
    the ``window`` rows immediately before it (row ``i`` itself is excluded),
    with ``window_weights[0]`` applied to the oldest of those rows and
    ``window_weights[window - 1]`` to the most recent one. The first ``window``
    rows have no complete history and are left at 0.0, also after normalisation.

    Normalisation is applied once, after every raw value has been computed, and
    only to the rows that have a full window. (Normalising inside the row loop
    mixes already-normalised rows, the current raw row and not-yet-computed
    zeros in the ``min_max`` / ``z_score`` statistics.)

    Parameters
    ----------
    name : str
        Column to create (or overwrite) on ``df``.
    df : pandas.DataFrame
        Frame that is modified in place; rows are processed positionally.
    weights : mapping of column name -> number
        Columns that make up the per-row score and their weights. Every listed
        column must be numeric, including those with weight 0.
    normalized : {None, 'sigmoid', 'tanh', 'min_max', 'z_score'}, optional
        Transformation applied to the computed rows. ``z_score`` uses the
        sample standard deviation (ddof=1), as ``pandas.Series.std`` does.
    window : int, optional
        Number of preceding rows to aggregate. Defaults to the module-level
        ``last_n``.
    window_weights : sequence of float, optional
        At least ``window`` weights, oldest row first. Defaults to the
        module-level ``weights_last_n``.

    Raises
    ------
    ValueError
        If ``normalized`` is not a supported mode, ``window`` is negative, or
        fewer than ``window`` weights are supplied.

    Notes
    -----
    NOTE(review): windows run over consecutive rows of ``df`` as given; nothing
    here restarts them between groups of rows (for example between matches) —
    confirm that is acceptable for the data being passed in.
    """
    # Fall back to the notebook-level hyperparameters when not given explicitly.
    if window is None:
        window = last_n
    if window_weights is None:
        window_weights = weights_last_n

    if normalized not in (None, 'sigmoid', 'tanh', 'min_max', 'z_score'):
        raise ValueError(f"unknown normalisation mode: {normalized!r}")
    if window < 0:
        raise ValueError("window must be non-negative")
    recency = np.asarray(window_weights, dtype=float)
    if len(recency) < window:
        raise ValueError(
            f"need at least {window} window weights, got {len(recency)}"
        )

    # Per-row weighted score, accumulated in the iteration order of `weights`.
    rows = len(df)
    score = np.zeros(rows, dtype=float)
    for column, weight in weights.items():
        score = score + df[column].to_numpy(dtype=float) * weight

    # Float from the start, so no int -> float upcast happens on assignment.
    feature = np.zeros(rows, dtype=float)
    computed = feature[window:]  # view on the rows that have a full window
    if computed.size:
        for j in range(window):
            # Row i receives recency[j] * score[i - window + j]; j = 0 is the oldest row.
            computed += recency[j] * score[j:rows - window + j]

        if normalized == 'sigmoid':
            computed[:] = 1 / (1 + np.exp(-computed))
        elif normalized == 'tanh':
            computed[:] = np.tanh(computed)
        elif normalized is not None:
            stats = pd.Series(computed)  # pandas reductions skip NaN
            if normalized == 'min_max':
                offset = stats.min()
                scale = stats.max() - offset
            else:  # 'z_score'
                offset = stats.mean()
                scale = stats.std()
            if scale > 0:
                computed[:] = (computed - offset) / scale
            else:
                # Constant values or a single row: no spread to scale by, so
                # map finite values to 0 and keep NaN as NaN.
                computed[:] = np.where(np.isnan(computed), np.nan, 0.0)

    df[name] = feature



## critical points

In [67]:
# Per-column weights for the 'critical_points_p1' score: player-1 events and
# player-2 events carry opposite signs of equal magnitude.
weights_critical_points_p1 = dict(
    p1_break_pt_won=10,
    p2_break_pt_won=-10,
    p1_ace=3,
    p2_ace=-3,
    p1_winner=2,
    p2_winner=-2,
    # Weight 0 adds nothing to the score, but add_feature still multiplies the
    # column, so it has to be numeric.
    winner_shot_type=0,
    p1_break_pt_missed=-2,
    p2_break_pt_missed=2,
    p1_net_pt_won=1,
    p2_net_pt_won=-1,
)

In [68]:
# Adds the 'critical_points_p1' column to df_original in place.
add_feature(
    name='critical_points_p1',
    df=df_original,
    weights=weights_critical_points_p1,
    normalized='z_score',
)

## faults

In [70]:
# Per-column weights for the 'faults_p1' score: player-1 errors count
# positively, player-2 errors negatively.
weights_faults_p1 = dict(
    p1_double_fault=1,
    p2_double_fault=-1,
    # Weight 0 adds nothing to the score; the column is still read by add_feature.
    serve_no=0,
    p1_unf_err=1,
    p2_unf_err=-1,
)

# Player-2 view: the same columns in the same order with every sign flipped.
weights_faults_p2 = {column: -weight for column, weight in weights_faults_p1.items()}

In [71]:
# Adds the 'faults_p1' column to df_original in place.
add_feature(
    name='faults_p1',
    df=df_original,
    weights=weights_faults_p1,
    normalized='z_score',
)

In [72]:
# add_feature('faults_p2', df_original, weights_faults_p2, 'sigmoid')

## points diff p1 - p2

In [73]:
# Per-row score is p1_points_won minus p2_points_won.
weights_points_diff = dict(
    p1_points_won=1,
    p2_points_won=-1,
)

In [74]:
# Adds the 'points_diff' column to df_original in place, squashed with a sigmoid.
add_feature(
    name='points_diff',
    df=df_original,
    weights=weights_points_diff,
    normalized='sigmoid',
)

## fatigue p1 / p2

In [75]:
# List the columns, dtypes and non-null counts available before the fatigue
# inputs are derived.
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7284 entries, 0 to 7283
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   match_id            7284 non-null   int64  
 1   player1             7284 non-null   object 
 2   player2             7284 non-null   object 
 3   elapsed_time        7284 non-null   object 
 4   set_no              7284 non-null   int64  
 5   game_no             7284 non-null   int64  
 6   point_no            7284 non-null   int64  
 7   p1_sets             7284 non-null   int64  
 8   p2_sets             7284 non-null   int64  
 9   p1_games            7284 non-null   int64  
 10  p2_games            7284 non-null   int64  
 11  p1_score            7284 non-null   int64  
 12  p2_score            7284 non-null   int64  
 13  server              7284 non-null   int64  
 14  serve_no            7284 non-null   int64  
 15  point_victor        7284 non-null   int64  
 16  p1_poi

In [85]:
# Derive the distance-based input columns used by the fatigue features and
# standardise them. Everything is written onto df_original in place.

def _zscore(column):
    """Standardise a Series with its own mean and sample standard deviation."""
    return (column - column.mean()) / column.std()


# Distance per shot. The +1 in the divisor is presumably there so that rows
# with rally_count == 0 do not divide by zero — TODO confirm.
df_original['p1_ave_distance_run'] = df_original['p1_distance_run'] / (df_original['rally_count'] + 1)
df_original['p2_ave_distance_run'] = df_original['p2_distance_run'] / (df_original['rally_count'] + 1)
df_original['distance_diff_p1_p2'] = df_original['p1_ave_distance_run'] - df_original['p2_ave_distance_run']
df_original['distance_diff_p2_p1'] = df_original['p2_ave_distance_run'] - df_original['p1_ave_distance_run']

# Total distance run over the last_n rows preceding each row (current row
# excluded). Built from shifted copies of the column instead of a Python loop
# over every row, and kept as float throughout so no int column has to be
# upcast on assignment. The first last_n rows have no full history and stay 0.
for player in ('p1', 'p2'):
    run = df_original[player + '_distance_run']
    trailing = pd.Series(0.0, index=df_original.index)
    for lag in range(1, last_n + 1):
        trailing = trailing + run.shift(lag)
    trailing.iloc[:last_n] = 0.0
    df_original[player + '_distance_last_n'] = trailing

# NOTE(review): the two lines below replace the per-shot averages computed
# above with the z-score of the raw p*_distance_run columns, so the
# '*_ave_distance_run' columns end up holding standardised total distance.
# The distance_diff_* columns were already derived from the averages and are
# not affected. Confirm this is intended rather than a copy-paste slip.
df_original['p1_ave_distance_run'] = _zscore(df_original['p1_distance_run'])
df_original['p2_ave_distance_run'] = _zscore(df_original['p2_distance_run'])
# Disabled experiments, kept for reference:
# df_original['distance_diff_p1_p2'] = (df_original['distance_diff_p1_p2'] - df_original['distance_diff_p1_p2'].mean()) / df_original['distance_diff_p1_p2'].std()
# df_original['distance_diff_p2_p1'] = (df_original['distance_diff_p2_p1'] - df_original['distance_diff_p2_p1'].mean()) / df_original['distance_diff_p2_p1'].std()
# df_original['distance_ratio'] = (df_original['p1_ave_distance_run'] - df_original['p1_ave_distance_run'].mean()) / df_original['p1_ave_distance_run'].std()
df_original['p1_distance_last_n'] = _zscore(df_original['p1_distance_last_n'])
df_original['p2_distance_last_n'] = _zscore(df_original['p2_distance_last_n'])
df_original['speed_mph'] = _zscore(df_original['speed_mph'])


In [86]:
# Re-check the column list, dtypes and non-null counts now that the
# distance-based columns have been added.
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7284 entries, 0 to 7283
Data columns (total 58 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   match_id             7284 non-null   int64  
 1   player1              7284 non-null   object 
 2   player2              7284 non-null   object 
 3   elapsed_time         7284 non-null   object 
 4   set_no               7284 non-null   int64  
 5   game_no              7284 non-null   int64  
 6   point_no             7284 non-null   int64  
 7   p1_sets              7284 non-null   int64  
 8   p2_sets              7284 non-null   int64  
 9   p1_games             7284 non-null   int64  
 10  p2_games             7284 non-null   int64  
 11  p1_score             7284 non-null   int64  
 12  p2_score             7284 non-null   int64  
 13  server               7284 non-null   int64  
 14  serve_no             7284 non-null   int64  
 15  point_victor         7284 non-null   i

In [79]:
# Print the dtype / non-null summary of each fatigue input column in turn.
# Series.info() prints and returns None, so the cell value is a tuple of None.
# NOTE(review): this cell carries execution count 79 while the cells above it
# are numbered 85 and 86, so its saved output may come from an earlier state
# of df_original.
fatigue_input_columns = [
    'p1_ave_distance_run',
    'p2_ave_distance_run',
    'distance_diff_p1_p2',
    'distance_diff_p2_p1',
    'p1_distance_last_n',
    'p2_distance_last_n',
    'speed_mph',
]
tuple(df_original[column].info() for column in fatigue_input_columns)

<class 'pandas.core.series.Series'>
RangeIndex: 7284 entries, 0 to 7283
Series name: p1_ave_distance_run
Non-Null Count  Dtype  
--------------  -----  
7284 non-null   float64
dtypes: float64(1)
memory usage: 57.0 KB
<class 'pandas.core.series.Series'>
RangeIndex: 7284 entries, 0 to 7283
Series name: p2_ave_distance_run
Non-Null Count  Dtype  
--------------  -----  
7284 non-null   float64
dtypes: float64(1)
memory usage: 57.0 KB
<class 'pandas.core.series.Series'>
RangeIndex: 7284 entries, 0 to 7283
Series name: distance_diff_p1_p2
Non-Null Count  Dtype  
--------------  -----  
6553 non-null   float64
dtypes: float64(1)
memory usage: 57.0 KB
<class 'pandas.core.series.Series'>
RangeIndex: 7284 entries, 0 to 7283
Series name: distance_diff_p2_p1
Non-Null Count  Dtype  
--------------  -----  
6553 non-null   float64
dtypes: float64(1)
memory usage: 57.0 KB
<class 'pandas.core.series.Series'>
RangeIndex: 7284 entries, 0 to 7283
Series name: p1_distance_last_n
Non-Null Count  Dtype  


(None, None, None, None, None, None, None)

In [80]:
# Per-column weights for the 'fatigue_p1' score: player-1 distance columns
# count positively, player-2 distance columns negatively.
weights_fatigue_p1 = dict(
    p1_ave_distance_run=1,
    p2_ave_distance_run=-1,
    distance_diff_p1_p2=1,
    p1_distance_last_n=1,
    p2_distance_last_n=-1,
    # speed_mph=1,
)

# Player-2 counterpart: signs flipped on the per-player columns, and the
# difference column swapped for distance_diff_p2_p1 (still weighted +1).
weights_fatigue_p2 = dict(
    p1_ave_distance_run=-1,
    p2_ave_distance_run=1,
    distance_diff_p2_p1=1,
    p1_distance_last_n=-1,
    p2_distance_last_n=1,
    # speed_mph=1,
)

In [87]:
# Adds the 'fatigue_p1' column to df_original in place.
add_feature(
    name='fatigue_p1',
    df=df_original,
    weights=weights_fatigue_p1,
    normalized='z_score',
)

In [88]:
# Check the dtype and non-null count of the new feature; NaNs in any input
# column would propagate into it.
df_original['fatigue_p1'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7284 entries, 0 to 7283
Series name: fatigue_p1
Non-Null Count  Dtype  
--------------  -----  
7284 non-null   float64
dtypes: float64(1)
memory usage: 57.0 KB


In [83]:
# add_feature('fatigue_p2', df_original, weights_fatigue_p2, 'sigmoid')

In [89]:
# Write the frame, including every feature column added above, to a new CSV
# without the index column.
df_original.to_csv(
    path_or_buf='../data/Wimbledon_featured_matches_modified_add_features.csv',
    index=False,
)