In [1]:
# Import dependencies
import pandas as pd
# For data base creation
from sqlalchemy import create_engine
from config import db_password

## Initial Clean up of sourced data
* We want to combine the winner and loser columns into one player column and add an extra column to distinguish whether that specific player won the match or not.
    * First we need to separate the losers from the winners and drop the winner/loser prefixes from their column titles so that they are the same.
    * Next we need to add a column called 'result' and place values for each player in the form of 1 and 0 to represent whether they won or not.
    * Now combine them back together.
* Now we want to add a column that holds the value of the year the match was played.
    * We will use the title of the specific csv file to collect that information.
* Finally we will create a final df with sampled data from each year for our database.

In [2]:
# Folder that holds the source CSV files, defined once so the read calls can reuse it
file_dir = "Data/Resources/"

In [3]:
# Load one dataframe per season (1998, 1999, 2000) from the ATP match CSV files.
# The files are read in the listed order and unpacked into their per-year names.
tennis_stats_98_df, tennis_stats_99_df, tennis_stats_00_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        f'{file_dir}atp_matches_1998.csv',
        f'{file_dir}atp_matches_1999.csv',
        f'{file_dir}atp_matches_2000.csv',
    )
)

In [4]:
# Tag every match row with the season it came from, so the year is still known
# once the per-season frames are combined
tennis_stats_98_df, tennis_stats_99_df, tennis_stats_00_df = (
    season_df.assign(year=season)
    for season_df, season in (
        (tennis_stats_98_df, 1998),
        (tennis_stats_99_df, 1999),
        (tennis_stats_00_df, 2000),
    )
)

In [5]:
# Stack the 1998, 1999 and 2000 seasons into a single match-level dataframe
season_frames = [
    tennis_stats_98_df,
    tennis_stats_99_df,
    tennis_stats_00_df,
]
tennis_stats_df = pd.concat(season_frames)

In [6]:
# Show every column name in alphabetical order to decide what to keep
sorted(tennis_stats_df.columns)

['best_of',
 'draw_size',
 'l_1stIn',
 'l_1stWon',
 'l_2ndWon',
 'l_SvGms',
 'l_ace',
 'l_bpFaced',
 'l_bpSaved',
 'l_df',
 'l_svpt',
 'loser_age',
 'loser_entry',
 'loser_hand',
 'loser_ht',
 'loser_id',
 'loser_ioc',
 'loser_name',
 'loser_rank',
 'loser_rank_points',
 'loser_seed',
 'match_num',
 'minutes',
 'round',
 'score',
 'surface',
 'tourney_date',
 'tourney_id',
 'tourney_level',
 'tourney_name',
 'w_1stIn',
 'w_1stWon',
 'w_2ndWon',
 'w_SvGms',
 'w_ace',
 'w_bpFaced',
 'w_bpSaved',
 'w_df',
 'w_svpt',
 'winner_age',
 'winner_entry',
 'winner_hand',
 'winner_ht',
 'winner_id',
 'winner_ioc',
 'winner_name',
 'winner_rank',
 'winner_rank_points',
 'winner_seed',
 'year']

In [7]:
# Discard everything except the player ids, ages, per-player match statistics
# (the w_* / l_* columns) and the year. The dropped columns are grouped by what
# they describe to make the list easier to audit.
match_detail_columns = ['best_of', 'draw_size', 'match_num', 'minutes',
                        'round', 'score', 'surface']
tourney_columns = ['tourney_date', 'tourney_id', 'tourney_level', 'tourney_name']
loser_profile_columns = ['loser_entry', 'loser_hand', 'loser_ht', 'loser_ioc',
                         'loser_name', 'loser_rank', 'loser_rank_points', 'loser_seed']
winner_profile_columns = ['winner_entry', 'winner_hand', 'winner_ht', 'winner_ioc',
                          'winner_name', 'winner_rank', 'winner_rank_points', 'winner_seed']

tennis_stats_df = tennis_stats_df.drop(
    columns=(match_detail_columns
             + tourney_columns
             + loser_profile_columns
             + winner_profile_columns)
)
tennis_stats_df.head()

Unnamed: 0,winner_id,winner_age,loser_id,loser_age,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,...,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,year
0,102035,25.7,101917,26.6,7.0,1.0,56.0,29.0,22.0,17.0,...,2.0,4.0,54.0,29.0,18.0,11.0,9.0,2.0,5.0,1998
1,101727,27.7,102548,22.8,6.0,0.0,56.0,30.0,23.0,15.0,...,1.0,7.0,64.0,32.0,19.0,11.0,8.0,9.0,13.0,1998
2,102765,21.7,102491,23.0,12.0,0.0,51.0,28.0,22.0,9.0,...,3.0,2.0,60.0,38.0,18.0,7.0,9.0,5.0,12.0,1998
3,102563,22.7,101647,28.2,6.0,1.0,63.0,30.0,20.0,19.0,...,3.0,2.0,56.0,37.0,21.0,10.0,11.0,1.0,6.0,1998
4,102796,21.6,102104,25.3,9.0,6.0,80.0,27.0,22.0,23.0,...,5.0,5.0,99.0,57.0,33.0,20.0,13.0,12.0,17.0,1998


In [8]:
# Work on an independent copy of tennis_stats_df so building the winners table
# leaves the combined frame untouched
winners_df = tennis_stats_df.copy(deep=True)

In [9]:
# Keep only the winner's side of each match by removing every loser column
loser_only_columns = [
    'loser_id', 'loser_age',
    'l_ace', 'l_df', 'l_svpt',
    'l_1stIn', 'l_1stWon', 'l_2ndWon',
    'l_SvGms', 'l_bpSaved', 'l_bpFaced',
]
winners_df = winners_df.drop(columns=loser_only_columns)
winners_df.head()

Unnamed: 0,winner_id,winner_age,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,year
0,102035,25.7,7.0,1.0,56.0,29.0,22.0,17.0,9.0,1.0,1.0,1998
1,101727,27.7,6.0,0.0,56.0,30.0,23.0,15.0,9.0,1.0,2.0,1998
2,102765,21.7,12.0,0.0,51.0,28.0,22.0,9.0,9.0,4.0,7.0,1998
3,102563,22.7,6.0,1.0,63.0,30.0,20.0,19.0,10.0,3.0,5.0,1998
4,102796,21.6,9.0,6.0,80.0,27.0,22.0,23.0,13.0,3.0,8.0,1998


In [10]:
# Flag every row of winners_df as a win: result = 1
winners_df = winners_df.assign(result=1)
winners_df.head()

Unnamed: 0,winner_id,winner_age,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,year,result
0,102035,25.7,7.0,1.0,56.0,29.0,22.0,17.0,9.0,1.0,1.0,1998,1
1,101727,27.7,6.0,0.0,56.0,30.0,23.0,15.0,9.0,1.0,2.0,1998,1
2,102765,21.7,12.0,0.0,51.0,28.0,22.0,9.0,9.0,4.0,7.0,1998,1
3,102563,22.7,6.0,1.0,63.0,30.0,20.0,19.0,10.0,3.0,5.0,1998,1
4,102796,21.6,9.0,6.0,80.0,27.0,22.0,23.0,13.0,3.0,8.0,1998,1


In [11]:
# Map the winner-specific column names onto the neutral names shared with losers_df
winner_to_player_names = {
    "winner_id": "player_id",
    "winner_age": "age",
    "w_ace": "ace",
    "w_df": "df",
    "w_svpt": "svpt",
    "w_1stIn": "1stIn",
    "w_1stWon": "1stWon",
    "w_2ndWon": "2ndWon",
    "w_SvGms": "svGms",
    "w_bpSaved": "bpSaved",
    "w_bpFaced": "bpFaced",
}
winners_df = winners_df.rename(columns=winner_to_player_names)
winners_df.head()

Unnamed: 0,player_id,age,ace,df,svpt,1stIn,1stWon,2ndWon,svGms,bpSaved,bpFaced,year,result
0,102035,25.7,7.0,1.0,56.0,29.0,22.0,17.0,9.0,1.0,1.0,1998,1
1,101727,27.7,6.0,0.0,56.0,30.0,23.0,15.0,9.0,1.0,2.0,1998,1
2,102765,21.7,12.0,0.0,51.0,28.0,22.0,9.0,9.0,4.0,7.0,1998,1
3,102563,22.7,6.0,1.0,63.0,30.0,20.0,19.0,10.0,3.0,5.0,1998,1
4,102796,21.6,9.0,6.0,80.0,27.0,22.0,23.0,13.0,3.0,8.0,1998,1


In [12]:
# Work on a second independent copy of tennis_stats_df to build the losers table
losers_df = tennis_stats_df.copy(deep=True)

In [13]:
# Keep only the loser's side of each match by removing every winner column
winner_only_columns = [
    'winner_id', 'winner_age',
    'w_ace', 'w_df', 'w_svpt',
    'w_1stIn', 'w_1stWon', 'w_2ndWon',
    'w_SvGms', 'w_bpSaved', 'w_bpFaced',
]
losers_df = losers_df.drop(columns=winner_only_columns)
losers_df.head()

Unnamed: 0,loser_id,loser_age,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,year
0,101917,26.6,2.0,4.0,54.0,29.0,18.0,11.0,9.0,2.0,5.0,1998
1,102548,22.8,1.0,7.0,64.0,32.0,19.0,11.0,8.0,9.0,13.0,1998
2,102491,23.0,3.0,2.0,60.0,38.0,18.0,7.0,9.0,5.0,12.0,1998
3,101647,28.2,3.0,2.0,56.0,37.0,21.0,10.0,11.0,1.0,6.0,1998
4,102104,25.3,5.0,5.0,99.0,57.0,33.0,20.0,13.0,12.0,17.0,1998


In [14]:
# Flag every row of losers_df as a loss: result = 0
losers_df = losers_df.assign(result=0)
losers_df.head()

Unnamed: 0,loser_id,loser_age,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,year,result
0,101917,26.6,2.0,4.0,54.0,29.0,18.0,11.0,9.0,2.0,5.0,1998,0
1,102548,22.8,1.0,7.0,64.0,32.0,19.0,11.0,8.0,9.0,13.0,1998,0
2,102491,23.0,3.0,2.0,60.0,38.0,18.0,7.0,9.0,5.0,12.0,1998,0
3,101647,28.2,3.0,2.0,56.0,37.0,21.0,10.0,11.0,1.0,6.0,1998,0
4,102104,25.3,5.0,5.0,99.0,57.0,33.0,20.0,13.0,12.0,17.0,1998,0


In [15]:
# Map the loser-specific column names onto the neutral names shared with winners_df
loser_to_player_names = {
    "loser_id": "player_id",
    "loser_age": "age",
    "l_ace": "ace",
    "l_df": "df",
    "l_svpt": "svpt",
    "l_1stIn": "1stIn",
    "l_1stWon": "1stWon",
    "l_2ndWon": "2ndWon",
    "l_SvGms": "svGms",
    "l_bpSaved": "bpSaved",
    "l_bpFaced": "bpFaced",
}
losers_df = losers_df.rename(columns=loser_to_player_names)
losers_df.head()

Unnamed: 0,player_id,age,ace,df,svpt,1stIn,1stWon,2ndWon,svGms,bpSaved,bpFaced,year,result
0,101917,26.6,2.0,4.0,54.0,29.0,18.0,11.0,9.0,2.0,5.0,1998,0
1,102548,22.8,1.0,7.0,64.0,32.0,19.0,11.0,8.0,9.0,13.0,1998,0
2,102491,23.0,3.0,2.0,60.0,38.0,18.0,7.0,9.0,5.0,12.0,1998,0
3,101647,28.2,3.0,2.0,56.0,37.0,21.0,10.0,11.0,1.0,6.0,1998,0
4,102104,25.3,5.0,5.0,99.0,57.0,33.0,20.0,13.0,12.0,17.0,1998,0


In [16]:
# Put the winner rows and the loser rows into one player-level table,
# then keep only the rows that have a value in every column
player_frames = [winners_df, losers_df]
match_stats_df = pd.concat(player_frames)
cleaned_match_stats = match_stats_df.dropna(axis=0, how='any')
cleaned_match_stats.head()

Unnamed: 0,player_id,age,ace,df,svpt,1stIn,1stWon,2ndWon,svGms,bpSaved,bpFaced,year,result
0,102035,25.7,7.0,1.0,56.0,29.0,22.0,17.0,9.0,1.0,1.0,1998,1
1,101727,27.7,6.0,0.0,56.0,30.0,23.0,15.0,9.0,1.0,2.0,1998,1
2,102765,21.7,12.0,0.0,51.0,28.0,22.0,9.0,9.0,4.0,7.0,1998,1
3,102563,22.7,6.0,1.0,63.0,30.0,20.0,19.0,10.0,3.0,5.0,1998,1
4,102796,21.6,9.0,6.0,80.0,27.0,22.0,23.0,13.0,3.0,8.0,1998,1


In [17]:
# Number of rows remaining after the rows with missing values were dropped
cleaned_match_stats.shape[0]

18220

## Creating the Database

In [18]:
 # Create the match_stats table in the tennis_data PostgreSQL database
# Write the cleaned frame (rows with missing values removed) instead of the raw
# concatenation: cleaned_match_stats is the frame that was built and counted
# above, and writing match_stats_df would put the incomplete rows into the table.
# if_exists='replace' drops and recreates the table on every run.
# NOTE(review): the DataFrame index is still written as an `index` column (the
# to_sql default); its labels repeat because the frames were concatenated without
# resetting the index. Pass index=False if no consumer relies on that column.
db_string = f'postgresql://postgres:{db_password}@127.0.0.1:5432/tennis_data'
engine = create_engine(db_string)
cleaned_match_stats.to_sql(name='match_stats', con=engine, if_exists='replace')