In [1]:
import pandas as pd
import tqdm
import datetime
import numpy as np
from dateutil.relativedelta import relativedelta

In [2]:
df = pd.DataFrame()

for i in range(2001,2020):
    filename = f'atp_matches_{i}.csv'
    curr = pd.read_csv(filename)
    df = pd.concat([df, curr])

df = df.reset_index()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [3]:
print(df.shape)
print(df.columns)

(58192, 50)
Index(['index', 'best_of', 'draw_size', 'l_1stIn', 'l_1stWon', 'l_2ndWon',
       'l_SvGms', 'l_ace', 'l_bpFaced', 'l_bpSaved', 'l_df', 'l_svpt',
       'loser_age', 'loser_entry', 'loser_hand', 'loser_ht', 'loser_id',
       'loser_ioc', 'loser_name', 'loser_rank', 'loser_rank_points',
       'loser_seed', 'match_num', 'minutes', 'round', 'score', 'surface',
       'tourney_date', 'tourney_id', 'tourney_level', 'tourney_name',
       'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_ace', 'w_bpFaced',
       'w_bpSaved', 'w_df', 'w_svpt', 'winner_age', 'winner_entry',
       'winner_hand', 'winner_ht', 'winner_id', 'winner_ioc', 'winner_name',
       'winner_rank', 'winner_rank_points', 'winner_seed'],
      dtype='object')


Preprocessing
Keep relevant features
Clean and categorize data


In [4]:
print(df.isna().sum())

# Several matches do not have rankings for players.  I will assume these players were amateurs given wildcards or 
# or other low ranked players and assign a rank of 1000

index                     0
best_of                   0
draw_size             55411
l_1stIn                5353
l_1stWon               5353
l_2ndWon               5353
l_SvGms                5353
l_ace                  5353
l_bpFaced              5353
l_bpSaved              5353
l_df                   5353
l_svpt                 5353
loser_age                31
loser_entry           46747
loser_hand               46
loser_ht               7981
loser_id                  0
loser_ioc                 0
loser_name                0
loser_rank             1214
loser_rank_points      1214
loser_seed            44894
match_num                 0
minutes                6656
round                     0
score                     1
surface                 118
tourney_date              0
tourney_id                0
tourney_level             0
tourney_name              0
w_1stIn                5353
w_1stWon               5353
w_2ndWon               5353
w_SvGms                5353
w_ace               

In [5]:
#handle missing values for player specific data
df['loser_age'].fillna((df['loser_age'].mean()), inplace=True)
df['loser_ht'].fillna((df['loser_ht'].mean()), inplace=True)
df['loser_hand'].fillna('R', inplace=True)
df['loser_entry'].fillna('REG', inplace=True)
df['loser_rank'].fillna(1000, inplace=True)

df['winner_age'].fillna((df['loser_age'].mean()), inplace=True)
df['winner_ht'].fillna((df['winner_ht'].mean()), inplace=True)
df['winner_hand'].fillna('R', inplace=True)
df['winner_entry'].fillna('REG', inplace=True)
df['winner_rank'].fillna(1000, inplace=True)

In [6]:
#remove retirement and walkover matches for more accurate data
df = df[df['score'] != 'W/O']
df = df[~df.score.str.contains("RET", na=False)]
#Remove Davis Cup matches due to lack of point level data
df = df[df['tourney_level'] != 'D']

In [7]:
drop_cols = ['winner_rank_points', 'winner_seed', 'minutes', 'draw_size', 'loser_rank_points', 'loser_seed', 
             'winner_ioc', 'loser_ioc',  ]
df = df.drop(drop_cols, axis = 1)
df = df.dropna()

In [8]:
df['tourney_date'] = df['tourney_date'].astype(str)
df['tourney_date'] = pd.to_datetime(df['tourney_date'])

In [9]:
df.loc[(df['round'] == 'R128') | (df['round'] == 'RR') | (df['round'] == 'BR'), ['round']] = 1
df.loc[df['round'] == 'R64', ['round']] = 2
df.loc[df['round'] == 'R32', ['round']] = 3
df.loc[df['round'] == 'R16', ['round']] = 4
df.loc[df['round'] == 'QF', ['round']] = 5
df.loc[df['round'] == 'SF', ['round']] = 6
df.loc[df['round'] == 'F', ['round']] = 7

In [10]:
df.loc[df['loser_hand'] == 'L', ['loser_hand']] = 1
df.loc[df['loser_hand'] == 'R', ['loser_hand']] = 0

df.loc[df['winner_hand'] == 'L', ['winner_hand']] = 1
df.loc[df['winner_hand'] == 'R', ['winner_hand']] = 0

In [11]:
df['w_wildcard'] = 0
df['w_qualifier'] = 0

df['l_wildcard'] = 0
df['l_qualifier'] = 0

df.loc[df['winner_entry'] == 'WC', ['w_wildcard']] = 1
df.loc[(df['winner_entry'] == 'Q') | (df['winner_entry'] == 'LL'), ['w_qualifier']] = 1

df.loc[df['loser_entry'] == 'WC', ['l_wildcard']] = 1
df.loc[(df['loser_entry'] == 'Q') | (df['loser_entry'] == 'LL'), ['l_qualifier']] = 1


In [12]:
df['is_clay'] = 0
df['is_hard'] = 0
df['is_grass'] = 0
df['is_carpet'] = 0

df.loc[df['surface'] == 'Hard', ['is_hard']] = 1
df.loc[df['surface'] == 'Grass', ['is_grass']] = 1
df.loc[df['surface'] == 'Clay', ['is_clay']] = 1
df.loc[df['surface'] == 'Carpet', ['is_carpet']] = 1

In [13]:
df.loc[df['tourney_level'] == 'G', ['tourney_level']] = 2000
df.loc[df['tourney_level'] == 'F', ['tourney_level']] = 1500
df.loc[df['tourney_level'] == 'M', ['tourney_level']] = 1000
df.loc[df['tourney_level'] == 'A', ['tourney_level']] = 500

In [14]:
df.loc[df['best_of'] == 3, ['best_of']] = 0
df.loc[df['best_of'] == 5, ['best_of']] = 1

In [15]:
df.shape

(50395, 50)

In [16]:
df.describe()

Unnamed: 0,index,best_of,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_ace,l_bpFaced,l_bpSaved,l_df,...,winner_id,winner_rank,w_wildcard,w_qualifier,l_wildcard,l_qualifier,is_clay,is_hard,is_grass,is_carpet
count,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,...,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0,50395.0
mean,1550.303205,0.184681,48.767775,32.594444,15.171584,12.405973,5.170751,8.653517,4.788729,3.384383,...,104869.506876,58.672468,0.043119,0.085663,0.075365,0.142732,0.323564,0.538764,0.112968,0.024705
std,889.305383,0.388042,18.789162,14.1213,7.163326,4.105031,4.899582,4.089673,3.244931,2.50373,...,6370.736413,74.94373,0.203128,0.279869,0.263981,0.349803,0.46784,0.4985,0.316556,0.155226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100644.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,798.0,0.0,35.0,22.0,10.0,9.0,2.0,6.0,2.0,2.0,...,103484.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1557.0,0.0,45.0,30.0,14.0,11.0,4.0,8.0,4.0,3.0,...,104198.0,41.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2291.0,0.0,59.0,40.0,19.0,15.0,7.0,11.0,7.0,5.0,...,104918.0,76.5,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,3298.0,1.0,328.0,284.0,101.0,91.0,103.0,34.0,25.0,23.0,...,206173.0,1890.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Adding Features
- Need to use previous player performance to predict matchups
1. Previous Career High Rank


In [17]:
#TODO fix print order for loser first case

def addHeadToHead(tdf):
    matchtracker = {}
    
    for id, row in tqdm.tqdm_notebook(tdf.iterrows(), total=tdf.shape[0]):
        winner = row['winner_id']
        loser = row['loser_id']
        
        if (winner, loser) in matchtracker:
            tdf.loc[id, 'w_wins_vs_opp'] = matchtracker[(winner, loser)][0]
            tdf.loc[id, 'l_wins_vs_opp'] = matchtracker[(winner, loser)][1]
            matchtracker[(winner, loser)][0]+=1
        elif (loser, winner) in matchtracker:
            tdf.loc[id, 'w_wins_vs_opp'] = matchtracker[(loser, winner)][1]
            tdf.loc[id, 'l_wins_vs_opp'] = matchtracker[(loser, winner)][0]
            matchtracker[(loser, winner)][1]+=1
        else:
            matchtracker[(winner, loser)] = [1,0]
            tdf.loc[id, 'w_wins_vs_opp'] = 0
            tdf.loc[id, 'l_wins_vs_opp'] = 0
        
addHeadToHead(df)
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=50395.0), HTML(value='')))




In [18]:
# Window stats
# look at average performance metrics for past year


def addPlayerYearRecords(tdf):
    for (idx, date, winner_id, loser_id, surface) in tqdm.tqdm_notebook(tdf[['tourney_date', 'winner_id', 'loser_id', 'surface']].itertuples(), total=tdf.shape[0], leave=True):
        one_year_ago = date - relativedelta(years=1)
        one_year_df =  tdf[(tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < date)]
        
        tdf.loc[idx, 'w_1yr_wins'] = len(one_year_df[(one_year_df['winner_id'].values == winner_id)]['winner_id'])
        tdf.loc[idx, 'w_1yr_losses'] = len(one_year_df[(one_year_df['loser_id'].values == winner_id)]['winner_id'])
        
        tdf.loc[idx, 'l_1yr_wins'] = len(one_year_df[(one_year_df['winner_id'].values == loser_id)]['winner_id'])
        tdf.loc[idx, 'l_1yr_losses'] = len(one_year_df[(one_year_df['loser_id'].values == loser_id)]['winner_id'])
        
        tdf.loc[idx, 'w_1yr_wins_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['winner_id'].values == winner_id)]['winner_id'])
        tdf.loc[idx, 'w_1yr_losses_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['loser_id'].values == winner_id)]['winner_id'])
        
        tdf.loc[idx, 'l_1yr_wins_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['winner_id'].values == loser_id)]['winner_id'])
        tdf.loc[idx, 'l_1yr_losses_on_surface'] = len(one_year_df[(one_year_df['surface'].values ==surface) & (one_year_df['loser_id'].values == loser_id)]['winner_id'])
        
addPlayerYearRecords(df)

# def addWindowStats(tdf):
#     tdf['winner_1yr_wins'] = -1
#     tdf['winner_1yr_loses'] = -1
#     for idx, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):
#         one_year_ago = (row['tourney_date'] - relativedelta(years=1))
#         #print(one_year_ago)
#         tdf.loc[idx, 'winner_1yr_wins'] = tdf[(tdf['winner_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]['winner_id'].count()
#         #print(test)
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=50395.0), HTML(value='')))




In [19]:
df['w_year_win_pct'] = df['w_1yr_wins'] / (df['w_1yr_wins'] + df['w_1yr_losses'])
df['w_year_surface_win_pct'] = df['w_1yr_wins_on_surface'] / (df['w_1yr_wins_on_surface'] + df['w_1yr_losses_on_surface'])

df['l_year_win_pct'] = df['l_1yr_wins'] / (df['l_1yr_wins'] + df['l_1yr_losses'])
df['l_year_surface_win_pct'] = df['l_1yr_wins_on_surface'] / (df['l_1yr_wins_on_surface'] + df['l_1yr_losses_on_surface'])

In [20]:
#add return stats
# df['w_break_points_conversion_percentage'] = (df['l_bpFaced'] - df['l_bpSaved']) / df['l_bpFaced']
# df['w_return_points_won_percentage'] = (df['l_svpt'] - (df['l_1stWon'] + df['l_2ndWon']))/ df['l_svpt']

# df['l_break_points_conversion_percentage'] = (df['w_bpFaced'] - df['w_bpSaved']) / df['w_bpFaced']
# df['l_return_points_won_percentage'] = (df['l_svpt'] - (df['w_1stWon'] + df['w_2ndWon']))/ df['w_svpt']

def addPlayerYearReturn(tdf):
    for idx, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):
        
        one_year_ago = row['tourney_date'] - relativedelta(years=1)
        
        #compute serve stats for the winner's past year
        wwin_df = tdf[(tdf['winner_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        wlost_df = tdf[(tdf['loser_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        
        winner_bp_played = wwin_df.l_bpFaced.sum() + wlost_df.w_bpFaced.sum()
        winner_bp_not_converted = wwin_df.l_bpSaved.sum() + wlost_df.w_bpSaved.sum()
        
        winner_rtpt = wwin_df.l_svpt.sum() + wlost_df.w_svpt.sum()
        winner_1st_return_lost = wwin_df.l_1stWon.sum() + wlost_df.w_1stWon.sum()
        winner_2nd_return_lost = wwin_df.l_2ndWon.sum() + wlost_df.w_2ndWon.sum()
        
        tdf.loc[idx, 'w_1yr_break_pct'] = (winner_bp_played - winner_bp_not_converted) / winner_bp_played
        tdf.loc[idx, 'w_1yr_total_ret_pts_won_pct'] = (winner_rtpt - winner_1st_return_lost - winner_2nd_return_lost)  / winner_rtpt
        
        #compute serve stats for the loser's last year
        lwin_df = tdf[(tdf['winner_id'] == row['loser_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        llost_df = tdf[(tdf['loser_id'] == row['loser_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        
        loser_bp_played = lwin_df.l_bpFaced.sum() + llost_df.w_bpFaced.sum()
        loser_bp_not_converted = lwin_df.l_bpSaved.sum() + llost_df.w_bpSaved.sum()
        
        loser_rtpt = lwin_df.l_svpt.sum() + llost_df.w_svpt.sum()
        loser_1st_return_lost = lwin_df.l_1stWon.sum() + llost_df.w_1stWon.sum()
        loser_2nd_return_lost = lwin_df.l_2ndWon.sum() + llost_df.w_2ndWon.sum()
        
        tdf.loc[idx, 'l_1yr_break_pct'] = (loser_bp_played - loser_bp_not_converted) / loser_bp_played
        tdf.loc[idx, 'l_1yr_total_ret_pts_won_pct'] = (loser_rtpt - loser_1st_return_lost - loser_2nd_return_lost)  / loser_rtpt
        
addPlayerYearReturn(df)

100%|██████████| 50395/50395 [07:50<00:00, 107.09it/s]


In [21]:
def addPlayerYearServe(tdf):
    for idx, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):
        
        one_year_ago = row['tourney_date'] - relativedelta(years=1)
        
        #compute serve stats for the winner's past year
        wwin_df = tdf[(tdf['winner_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        wlost_df = tdf[(tdf['loser_id'] == row['winner_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        
        winner_firstin = wwin_df.w_1stIn.sum() + wlost_df.l_1stIn.sum()
        winner_svpt = wwin_df.w_svpt.sum() + wlost_df.l_svpt.sum()
        winner_ace = wwin_df.w_ace.sum() + wlost_df.l_ace.sum()
        winner_bp_faced = wwin_df.w_bpFaced.sum() + wlost_df.l_bpFaced.sum()
        winner_bp_saved = wwin_df.w_bpSaved.sum() + wlost_df.l_bpSaved.sum()
        winner_1stWon = wwin_df.w_1stWon.sum() + wlost_df.l_1stWon.sum()
        winner_2ndWon = wwin_df.w_2ndWon.sum() + wlost_df.l_2ndWon.sum()
        
        tdf.loc[idx, 'w_1yr_serve_pct'] = winner_firstin / winner_svpt
        tdf.loc[idx, 'w_1yr_ace_pct'] = winner_ace / winner_svpt
        tdf.loc[idx, 'w_1yr_bp_save_pct'] = winner_bp_saved / winner_bp_faced
        tdf.loc[idx, 'w_1yr_total_serve_pts_won_pct'] = (winner_1stWon + winner_2ndWon)/ winner_svpt
        
        #compute serve stats for the loser's last year
        lwin_df = tdf[(tdf['winner_id'] == row['loser_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        llost_df = tdf[(tdf['loser_id'] == row['loser_id']) & (tdf['tourney_date'] > one_year_ago) & (tdf['tourney_date'] < row['tourney_date'])]
        
        loser_firstin = lwin_df.w_1stIn.sum() + llost_df.l_1stIn.sum()
        loser_svpt = lwin_df.w_svpt.sum() + llost_df.l_svpt.sum()
        loser_ace = lwin_df.w_ace.sum() + llost_df.l_ace.sum()
        loser_bp_faced = lwin_df.w_bpFaced.sum() + llost_df.l_bpFaced.sum()
        loser_bp_saved = lwin_df.w_bpSaved.sum() + llost_df.l_bpSaved.sum()
        loser_1stWon = lwin_df.w_1stWon.sum() + llost_df.l_1stWon.sum()
        loser_2ndWon = lwin_df.w_2ndWon.sum() + llost_df.l_2ndWon.sum()
        
        tdf.loc[idx, 'l_1yr_serve_pct'] = loser_firstin / loser_svpt
        tdf.loc[idx, 'l_1yr_ace_pct'] = loser_ace / loser_svpt
        tdf.loc[idx, 'l_1yr_bp_save_pct'] = loser_bp_saved / loser_bp_faced
        tdf.loc[idx, 'l_1yr_total_serve_pts_won_pct'] = (loser_1stWon + loser_2ndWon)/ loser_svpt
        
addPlayerYearServe(df)

100%|██████████| 50395/50395 [09:36<00:00, 87.47it/s]


In [64]:
test_df = df[df['winner_name'] == 'Roger Federer']

def addCareerHighRank(tdf):
    tdf['winner_high_rank'] = 1001
    tdf['loser_high_rank'] = 1001
    
    col_loc = tdf.columns.get_loc("winner_high_rank")
    
    for id, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):
        curr_df = tdf.iloc[0:id]
        curr_id = row['winner_id']
        curr_df = curr_df[curr_df['winner_id'] == curr_id]
        max_winner_rank = curr_df['winner_rank'].max()
        tdf.iat[id, col_loc] = max_winner_rank
        
for id, row in tqdm.tqdm(tdf.iterrows(), total=tdf.shape[0]):

SyntaxError: unexpected EOF while parsing (<ipython-input-64-e6cac5ec4ae6>, line 16)

In [53]:
df[df['tourney_date']>= '2010-01-01'][['w_1yr_break_pct', 'l_1yr_break_pct', 'w_1yr_total_ret_pts_won_pct', 'l_1yr_total_ret_pts_won_pct']].describe()

Unnamed: 0,w_1yr_break_pct,l_1yr_break_pct,w_1yr_total_ret_pts_won_pct,l_1yr_total_ret_pts_won_pct
count,25901.0,25212.0,25917.0,25258.0
mean,0.394136,0.387098,0.369079,0.360947
std,0.055792,0.071424,0.032716,0.032589
min,0.0,0.0,0.162791,0.096774
25%,0.368263,0.358974,0.351144,0.344645
50%,0.398496,0.391304,0.37013,0.36339
75%,0.424,0.418994,0.389154,0.380363
max,1.0,1.0,0.677778,0.75


In [54]:
df[df['tourney_date']>= '2010-01-01'][['w_1yr_serve_pct', 'w_1yr_ace_pct', 'w_1yr_bp_save_pct', 'w_1yr_total_serve_pts_won_pct', 'l_1yr_serve_pct', 'l_1yr_ace_pct', 'l_1yr_bp_save_pct', 'w_1yr_total_serve_pts_won_pct']].describe()

Unnamed: 0,w_1yr_serve_pct,w_1yr_ace_pct,w_1yr_bp_save_pct,w_1yr_total_serve_pts_won_pct,l_1yr_serve_pct,l_1yr_ace_pct,l_1yr_bp_save_pct,w_1yr_total_serve_pts_won_pct.1
count,25917.0,25917.0,25914.0,25917.0,25258.0,25258.0,25249.0,25917.0
mean,0.612219,0.081606,0.617462,0.642185,0.608461,0.074884,0.602597,0.642185
std,0.041681,0.042241,0.055156,0.03645,0.04329,0.040235,0.066235,0.03645
min,0.403846,0.0,0.0,0.391304,0.294118,0.0,0.0,0.391304
25%,0.584423,0.050663,0.588235,0.619442,0.580818,0.046909,0.574468,0.619442
50%,0.611448,0.076536,0.619171,0.640751,0.606651,0.067601,0.607306,0.640751
75%,0.639323,0.101456,0.649757,0.665116,0.635896,0.094545,0.638122,0.665116
max,0.866667,0.34375,1.0,0.846154,0.846154,0.34375,1.0,0.846154


In [22]:
df = df.rename(columns={"winner_name": "w_name", "winner_rank": "w_rank", "winner_id": "w_id", "winner_age": "w_age",
                   "winner_hand": "w_hand", "winner_ht": "w_ht"})

df = df.rename(columns={"loser_name": "l_name", "loser_rank": "l_rank", "loser_id": "l_id", "loser_age": "l_age",
                   "loser_hand": "l_hand", "loser_ht": "l_ht"})

In [23]:
def ShuffleCols(tdf):
    tdf['P1_win'] = 0
    tdf.loc[tdf.index.isin(tdf.sample(frac=0.5).index), ['P1_win']] = 1
    
    swap_cols = ['rank', 'age', 'ht', 'hand', '1yr_serve_pct', '1yr_ace_pct', '1yr_bp_save_pct', '1yr_total_serve_pts_won_pct',
                '1yr_break_pct', '1yr_total_ret_pts_won_pct', '1yr_win_pct', '1yr_win_surface_pct',
                '1yr_losses_on_surface', 'wins_vs_opp', 'wildcard', 'qualifier']
    
    swap_colss = ['rank', 'age', 'name', 'ht', 'hand', 'wildcard', 'qualifier']
    
    for col in swap_colss:
        l_col = 'l_' + col
        w_col = 'w_' + col
        
        df[w_col], df[l_col] = np.where(df['P1_win'] == 0, [df[l_col], df[w_col]], [df[w_col], df[l_col]])
        
        df.rename(columns={w_col: "P1_"+ col, l_col: "P2_"+ col}, inplace=True)

ShuffleCols(df)

In [26]:
df.to_csv('match_data_cleaned.csv')