### Initial filling of the empty database tables with scraped data

In [1]:
import psycopg2
import numpy as np
import pandas as pd
import db_utilities as dbu # custom module


### Table COUNTRIES

In [260]:
# check table
df = dbu.select_query("SELECT * FROM countries;")
df.head()

Unnamed: 0,code,name
0,GER,Germany
1,FRA,France
2,ITA,Italy
3,ESP,Spain
4,ENG,England


In [7]:
# inserts
values = [('GER', 'Germany'), ('FRA', 'France'), ('ITA', 'Italy'), ('ESP', 'Spain'), ('ENG', 'England')]
for v in values:
    query_str = f"""INSERT INTO countries (code, name) VALUES ('{v[0]}', '{v[1]}');"""
    response = dbu.manipulating_query(query_str)
    print(response)


INSERT 0 1
INSERT 0 1
INSERT 0 1
INSERT 0 1
INSERT 0 1


### Table LEAGUES

In [10]:
# read in csv file
df = pd.read_csv('../data/scraped/fbref/leagues/fbref_leagues_initial.csv')
df.head()

Unnamed: 0,league_id,name,country,season_str_format
0,9,Premier League,ENG,yyyy-yyyy
1,12,La Liga,ESP,yyyy-yyyy
2,13,Ligue 1,FRA,yyyy-yyyy
3,20,Bundesliga,GER,yyyy-yyyy
4,11,Serie A,ITA,yyyy-yyyy


In [14]:
# check table is empty (and check column names)
result_df = dbu.select_query("Select * from leagues;")
result_df

Unnamed: 0,id,fbref_id,name,country
0,1,9,Premier League,ENG
1,2,12,La Liga,ESP
2,3,13,Ligue 1,FRA
3,4,20,Bundesliga,GER
4,5,11,Serie A,ITA


In [13]:
# insert 
conn = dbu.get_conn(type='DB_su')
# one insert statement per df row
for i, r in df.iterrows():
        query_str = f"""INSERT INTO leagues (fbref_id, name, country) 
                        VALUES ({r['league_id']}, '{r['name']}', '{r['country']}');"""
        print(query_str)
        response = dbu.manipulating_query(query_str, conn=conn)
        print(response)
conn.close() # close connection (note: cursor does not need to be closed when using 'with'-block)


INSERT INTO leagues (fbref_id, name, country) 
                        VALUES (9, 'Premier League', 'ENG');
INSERT 0 1
INSERT INTO leagues (fbref_id, name, country) 
                        VALUES (12, 'La Liga', 'ESP');
INSERT 0 1
INSERT INTO leagues (fbref_id, name, country) 
                        VALUES (13, 'Ligue 1', 'FRA');
INSERT 0 1
INSERT INTO leagues (fbref_id, name, country) 
                        VALUES (20, 'Bundesliga', 'GER');
INSERT 0 1
INSERT INTO leagues (fbref_id, name, country) 
                        VALUES (11, 'Serie A', 'ITA');
INSERT 0 1


### Table TEAMS

In [108]:
path = "../data/scraped/fbref/match"
# read in all files in path
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
print(files)

['league11_ssy2017_cols170_rows760.csv', 'league11_ssy2018_cols170_rows760.csv', 'league11_ssy2019_cols170_rows760.csv', 'league11_ssy2020_cols170_rows760.csv', 'league11_ssy2021_cols170_rows760.csv', 'league11_ssy2022_cols170_rows760.csv', 'league12_ssy2017_cols170_rows760.csv', 'league12_ssy2018_cols170_rows760.csv', 'league12_ssy2019_cols170_rows760.csv', 'league12_ssy2020_cols170_rows760.csv', 'league12_ssy2021_cols170_rows760.csv', 'league12_ssy2022_cols170_rows760.csv', 'league13_ssy2017_cols170_rows762.csv', 'league13_ssy2018_cols170_rows762.csv', 'league13_ssy2019_cols170_rows760.csv', 'league13_ssy2020_cols170_rows762.csv', 'league13_ssy2021_cols170_rows762.csv', 'league13_ssy2022_cols170_rows760.csv', 'league20_ssy2017_cols170_rows614.csv', 'league20_ssy2018_cols170_rows614.csv', 'league20_ssy2019_cols170_rows614.csv', 'league20_ssy2020_cols170_rows614.csv', 'league20_ssy2021_cols170_rows614.csv', 'league20_ssy2022_cols170_rows612.csv', 'league9_ssy2017_cols170_rows760.csv', 

In [109]:
# read in all individual files
df_list = []
for f in files:
    df_list.append(pd.read_csv(os.path.join(path, f), sep=';'))
# concatenate all files
df = pd.concat(df_list, ignore_index=True)
print(df.shape)

(21930, 170)


In [74]:
# match fbref ids with names and league ids (we get the country via the league id)
df = df.groupby('schedule_fbref_opponent_id').first().sort_values('schedule_fbref_league_id')[['schedule_Opponent', 'schedule_fbref_league_id']]

In [82]:
# get leagues table for country matching
leagues_df = dbu.select_query("SELECT * FROM leagues;")
# get matching dict
leagues_dict = dict(zip(leagues_df['fbref_id'].astype(int), leagues_df['country']))
# add country column
df['country'] = df['schedule_fbref_league_id'].map(leagues_dict)
df.head()

Unnamed: 0_level_0,schedule_Opponent,schedule_fbref_league_id,country
schedule_fbref_opponent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
822bd0ba,Liverpool,9,ENG
60c6b05f,West Brom,9,ENG
b2b47a98,Newcastle Utd,9,ENG
a2d435b3,Leicester City,9,ENG
943e8050,Burnley,9,ENG


In [86]:
# replace apostrophes in names with double apostrophes (SQL syntax)
df['schedule_Opponent'] = df['schedule_Opponent'].str.replace("'", "''")

In [90]:
# inserts
conn = dbu.get_conn(type='DB_su')
# one insert statement per df row
flag = False
for i, r in df.iterrows():
    query_str = f"""INSERT INTO teams (fbref_id, name, country) VALUES ('{i}', '{r['schedule_Opponent']}', '{r['country']}');""" 
    print(query_str)
    response = dbu.manipulating_query(query_str, conn=conn)
    print(response)
conn.close()

INSERT INTO teams (fbref_id, name, country) VALUES ('d07537b9', 'Brighton', 'ENG');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('17892952', 'Stoke City', 'ENG');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('d3fd31cc', 'Everton', 'ENG');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('5bfb9659', 'Leeds United', 'ENG');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('cff3d9bb', 'Chelsea', 'ENG');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('19538871', 'Manchester Utd', 'ENG');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('18bb7c10', 'Arsenal', 'ENG');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('658bf2de', 'Genoa', 'ITA');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('68449f6d', 'Spezia', 'ITA');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) VALUES ('6a7ad59d', 'Frosinone', 'ITA');
INSERT 0 1
INSERT INTO teams (fbref_id, name, country) V

In [92]:
# check table
teams_df = dbu.select_query("SELECT * FROM teams;")
teams_df

Unnamed: 0,id,fbref_id,name,country
0,1,822bd0ba,Liverpool,ENG
1,2,60c6b05f,West Brom,ENG
2,3,b2b47a98,Newcastle Utd,ENG
3,4,a2d435b3,Leicester City,ENG
4,5,943e8050,Burnley,ENG
...,...,...,...,...
138,139,12192a4c,Greuther Fürth,GER
139,140,a224b06a,Mainz 05,GER
140,141,d9f93f02,Paderborn 07,GER
141,142,acbb6a5b,RB Leipzig,GER


### Table MATCHES

In [148]:
# get data from individual csvs 
path = "../data/scraped/fbref/match"
# read in all files in path
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
print(files)

# read in all individual files
df_list = []
for f in files:
    df_list.append(pd.read_csv(os.path.join(path, f), sep=';'))
# concatenate all files
df = pd.concat(df_list, ignore_index=True)
print(df.shape)

['league11_ssy2017_cols170_rows760.csv', 'league11_ssy2018_cols170_rows760.csv', 'league11_ssy2019_cols170_rows760.csv', 'league11_ssy2020_cols170_rows760.csv', 'league11_ssy2021_cols170_rows760.csv', 'league11_ssy2022_cols170_rows760.csv', 'league12_ssy2017_cols170_rows760.csv', 'league12_ssy2018_cols170_rows760.csv', 'league12_ssy2019_cols170_rows760.csv', 'league12_ssy2020_cols170_rows760.csv', 'league12_ssy2021_cols170_rows760.csv', 'league12_ssy2022_cols170_rows760.csv', 'league13_ssy2017_cols170_rows762.csv', 'league13_ssy2018_cols170_rows762.csv', 'league13_ssy2019_cols170_rows760.csv', 'league13_ssy2020_cols170_rows762.csv', 'league13_ssy2021_cols170_rows762.csv', 'league13_ssy2022_cols170_rows760.csv', 'league20_ssy2017_cols170_rows614.csv', 'league20_ssy2018_cols170_rows614.csv', 'league20_ssy2019_cols170_rows614.csv', 'league20_ssy2020_cols170_rows614.csv', 'league20_ssy2021_cols170_rows614.csv', 'league20_ssy2022_cols170_rows612.csv', 'league9_ssy2017_cols170_rows760.csv', 

In [149]:
# filter only rows from home perspective (caution: if tournaments (i.e. world cups) are in the data this might not work!)
df = df[df['schedule_Venue']=='Home']

In [150]:
# filter relevant data
df_prep = df[['schedule_fbref_match_id', 'schedule_fbref_league_id', 'schedule_fbref_squad_id', 'schedule_fbref_opponent_id', 'schedule_Date', 'schedule_Time', 'schedule_Round', 'schedule_Day']]

In [151]:
# filter out matches set in the future (i.e. no schedule data yet)
df_prep = df_prep[df_prep['schedule_fbref_match_id']!='matchup']
df_prep.shape

(10769, 8)

In [152]:
# get leagues table for league id and teams table for team ids
leagues_df = dbu.select_query("SELECT * FROM leagues;")
leagues_dict = dict(zip(leagues_df['fbref_id'].astype(int), leagues_df['id']))
teams_df = dbu.select_query("SELECT * FROM teams;")
teams_dict = dict(zip(teams_df['fbref_id'], teams_df['id'].astype(int)))


In [153]:
# add id columns to df_prep
df_prep['league_id'] = df_prep['schedule_fbref_league_id'].map(leagues_dict)
df_prep['home_team_id'] = df_prep['schedule_fbref_squad_id'].map(teams_dict)
df_prep['away_team_id'] = df_prep['schedule_fbref_opponent_id'].map(teams_dict)

In [154]:
# sort 
df_prep = df_prep.sort_values('schedule_Date')

In [160]:
# get rid of rows with missing match id
df_prep = df_prep[df_prep['schedule_fbref_match_id'].isna()==False]
df_prep.shape

(10760, 11)

In [159]:
df_prep.head()

Unnamed: 0,schedule_fbref_match_id,schedule_fbref_league_id,schedule_fbref_squad_id,schedule_fbref_opponent_id,schedule_Date,schedule_Time,schedule_Round,schedule_Day,league_id,home_team_id,away_team_id
9158,a68e623d,13,fd6114db,3f8c4b5f,2017-08-04,20:45,Matchweek 1,Fri,3,89,109
9120,37f2c25f,13,e2d8892c,25622401,2017-08-05,17:15,Matchweek 1,Sat,3,92,111
9806,bf730e30,13,54195385,b3072e00,2017-08-05,20:00,Matchweek 1,Sat,3,114,104
9462,b2829d08,13,281b0e73,74229020,2017-08-05,20:00,Matchweek 1,Sat,3,108,110
9348,68b9eea2,13,d298ef2c,132ebc33,2017-08-05,20:00,Matchweek 1,Sat,3,103,97


In [223]:
# convert to string columns
df_prep['schedule_Date'] = df_prep['schedule_Date'].astype(str)
df_prep['schedule_Time'] = df_prep['schedule_Time'].astype(str)
df_prep['schedule_Round'] = df_prep['schedule_Round'].astype(str)
df_prep['schedule_Day'] = df_prep['schedule_Day'].astype(str)

In [224]:
# inserts (since we have many we reuse the cursor)
conn = dbu.get_conn(type='DB_su')
cursor = conn.cursor()
# iterate over df rows
counter = 0
query_str = ""
for i, r in df_prep.iterrows():
    counter += 1
    date_str = 'NULL' if r['schedule_Date']=='nan' else "'" + r['schedule_Date'] + "'" 
    time_str = 'NULL' if r['schedule_Time']=='nan' else "'" + r['schedule_Time'] + "'"
    round_str = 'NULL' if r['schedule_Round']=='nan' else "'" + r['schedule_Round'] + "'"
    day_str = 'NULL' if r['schedule_Day']=='nan' else "'" + r['schedule_Day'] + "'"
    query_str += f"""INSERT INTO matches (fbref_id, league_id, home_team_id, away_team_id, schedule_date, schedule_time, schedule_round, schedule_day) 
                    VALUES ('{r['schedule_fbref_match_id']}', {r['league_id']}, {r['home_team_id']}, {r['away_team_id']}, 
                    {date_str}, {time_str}, {round_str}, {day_str});"""
    #print(query_str)
    if counter % 100 == 0: # send in batches to avoid sending too many requests to the server
        cursor.execute(query_str if query_str.endswith(';') else query_str + ';')
        status_msg = cursor.statusmessage # get response
        conn.commit() # commit changes
        query_str = "" # reset query string
        print(status_msg)
        print(f"progress: {counter}/{df_prep.shape[0]}")
        
# insert remaining rows
cursor.execute(query_str if query_str.endswith(';') else query_str + ';')
status_msg = cursor.statusmessage # get response
conn.commit() # commit changes
print(status_msg)

cursor.close()
conn.close()


INSERT 0 1
progress: 100/10760
INSERT 0 1
progress: 200/10760
INSERT 0 1
progress: 300/10760
INSERT 0 1
progress: 400/10760
INSERT 0 1
progress: 500/10760
INSERT 0 1
progress: 600/10760
INSERT 0 1
progress: 700/10760
INSERT 0 1
progress: 800/10760
INSERT 0 1
progress: 900/10760
INSERT 0 1
progress: 1000/10760
INSERT 0 1
progress: 1100/10760
INSERT 0 1
progress: 1200/10760
INSERT 0 1
progress: 1300/10760
INSERT 0 1
progress: 1400/10760
INSERT 0 1
progress: 1500/10760
INSERT 0 1
progress: 1600/10760
INSERT 0 1
progress: 1700/10760
INSERT 0 1
progress: 1800/10760
INSERT 0 1
progress: 1900/10760
INSERT 0 1
progress: 2000/10760
INSERT 0 1
progress: 2100/10760
INSERT 0 1
progress: 2200/10760
INSERT 0 1
progress: 2300/10760
INSERT 0 1
progress: 2400/10760
INSERT 0 1
progress: 2500/10760
INSERT 0 1
progress: 2600/10760
INSERT 0 1
progress: 2700/10760
INSERT 0 1
progress: 2800/10760
INSERT 0 1
progress: 2900/10760
INSERT 0 1
progress: 3000/10760
INSERT 0 1
progress: 3100/10760
INSERT 0 1
progre

In [227]:
# check table
matches_df = dbu.select_query("SELECT * FROM matches;")
matches_df.head()

Unnamed: 0,id,fbref_id,league_id,home_team_id,away_team_id,schedule_date,schedule_time,schedule_round,schedule_day
0,1,a68e623d,3,89,109,2017-08-04,20:45:00,Matchweek 1,Fri
1,2,37f2c25f,3,92,111,2017-08-05,17:15:00,Matchweek 1,Sat
2,3,bf730e30,3,114,104,2017-08-05,20:00:00,Matchweek 1,Sat
3,4,b2829d08,3,108,110,2017-08-05,20:00:00,Matchweek 1,Sat
4,5,68b9eea2,3,103,97,2017-08-05,20:00:00,Matchweek 1,Sat


In [229]:
matches_df.dtypes

id                 int64
fbref_id          object
league_id          int64
home_team_id       int64
away_team_id       int64
schedule_date     object
schedule_time     object
schedule_round    object
schedule_day      object
dtype: object

### Table TEAMWAGES

In [234]:
# check table
teamwages_df = dbu.select_query("SELECT * FROM teamwages;")
teamwages_df.head()

Unnamed: 0,team_id,season_str,n_players,pct_estimated,weekly_wages_eur,weekly_wages_gbp,weekly_wages_usd,annual_wages_eur,annual_wages_gbp,annual_wages_usd


In [235]:
# read in data
df = pd.read_csv("../data/scraped/fbref/wages/team_wages_big_five_2017_to_2022.csv", sep=';')
df.head()

Unnamed: 0,squad_name,n_players,pct_estimated,squad_id,season_str,weekly_wages_eur,weekly_wages_gbp,weekly_wages_usd,annual_wages_eur,annual_wages_gbp,annual_wages_usd
0,Manchester Utd,35,100%,19538871,2017-2018,3810955,3195596,3883356,198169670,166171000,201934520
1,Arsenal,45,100%,18bb7c10,2017-2018,3629433,3043385,3698385,188730521,158256000,192316043
2,Manchester City,32,100%,b8fd03ef,2017-2018,3474446,2913423,3540453,180671167,151498000,184103578
3,Chelsea,45,100%,cff3d9bb,2017-2018,2790189,2339654,2843197,145089806,121662000,147846240
4,Liverpool,34,100%,822bd0ba,2017-2018,2194158,1839865,2235843,114096241,95673000,116263857


In [237]:
# clean and prepare df for inserts

# fix pct_estimated column
df['pct_estimated'] = df['pct_estimated'].str.replace('%', '').astype(float)

In [241]:
# check for missing values
df.isna().sum()

squad_name          0
n_players           0
pct_estimated       0
squad_id            0
season_str          0
weekly_wages_eur    0
weekly_wages_gbp    0
weekly_wages_usd    0
annual_wages_eur    0
annual_wages_gbp    0
annual_wages_usd    0
team_id             0
dtype: int64

In [239]:
# get id column via fbref ids
teams_df = dbu.select_query("SELECT * FROM teams;")
teams_dict = dict(zip(teams_df['fbref_id'], teams_df['id'].astype(int)))
df['team_id'] = df['squad_id'].map(teams_dict)
df.head()

Unnamed: 0,squad_name,n_players,pct_estimated,squad_id,season_str,weekly_wages_eur,weekly_wages_gbp,weekly_wages_usd,annual_wages_eur,annual_wages_gbp,annual_wages_usd,team_id
0,Manchester Utd,35,100.0,19538871,2017-2018,3810955,3195596,3883356,198169670,166171000,201934520,28
1,Arsenal,45,100.0,18bb7c10,2017-2018,3629433,3043385,3698385,188730521,158256000,192316043,29
2,Manchester City,32,100.0,b8fd03ef,2017-2018,3474446,2913423,3540453,180671167,151498000,184103578,18
3,Chelsea,45,100.0,cff3d9bb,2017-2018,2790189,2339654,2843197,145089806,121662000,147846240,27
4,Liverpool,34,100.0,822bd0ba,2017-2018,2194158,1839865,2235843,114096241,95673000,116263857,1


In [251]:
conn = dbu.get_conn(type='DB_su')
cursor = conn.cursor()
# iterate over df rows
query_str = ""
for counter, (i, r) in enumerate(df.iterrows()):
    query_str += f"""INSERT INTO teamwages (team_id, season_str, n_players, pct_estimated, weekly_wages_eur, weekly_wages_gbp, weekly_wages_usd, annual_wages_eur, annual_wages_gbp, annual_wages_usd)
                     VALUES ({r['team_id']}, '{r['season_str']}', {r['n_players']}, {r['pct_estimated']}, {r['weekly_wages_eur']}, {r['weekly_wages_gbp']}, {r['weekly_wages_usd']}, {r['annual_wages_eur']}, {r['annual_wages_gbp']}, {r['annual_wages_usd']});"""
    if counter % 50 == 0: # send in batches to avoid sending too many requests to the server
        cursor.execute(query_str if query_str.endswith(';') else query_str + ';')
        status_msg = cursor.statusmessage # get response
        conn.commit() # commit changes
        query_str = "" # reset query string
        print(status_msg)
        print(f"progress: {counter}/{df.shape[0]}")
        
# insert remaining rows
cursor.execute(query_str if query_str.endswith(';') else query_str + ';')
status_msg = cursor.statusmessage
conn.commit()
print(status_msg)

cursor.close()
conn.close()

INSERT 0 1
progress: 0/584
INSERT 0 1
progress: 50/584
INSERT 0 1
progress: 100/584
INSERT 0 1
progress: 150/584
INSERT 0 1
progress: 200/584
INSERT 0 1
progress: 250/584
INSERT 0 1
progress: 300/584
INSERT 0 1
progress: 350/584
INSERT 0 1
progress: 400/584
INSERT 0 1
progress: 450/584
INSERT 0 1
progress: 500/584
INSERT 0 1
progress: 550/584
INSERT 0 1


In [256]:

teamwages_df = dbu.select_query("SELECT * FROM teamwages;")
print(teamwages_df.shape)
teamwages_df.head()

(584, 10)


Unnamed: 0,team_id,season_str,n_players,pct_estimated,weekly_wages_eur,weekly_wages_gbp,weekly_wages_usd,annual_wages_eur,annual_wages_gbp,annual_wages_usd
0,28,2017-2018,35,100.0,3810955,3195596,3883356,198169670,166171000,201934520
1,29,2017-2018,45,100.0,3629433,3043385,3698385,188730521,158256000,192316043
2,18,2017-2018,32,100.0,3474446,2913423,3540453,180671167,151498000,184103578
3,27,2017-2018,45,100.0,2790189,2339654,2843197,145089806,121662000,147846240
4,1,2017-2018,34,100.0,2194158,1839865,2235843,114096241,95673000,116263857


### Table MATCHSTATS

In [6]:
# cleaner setup
# make cleaning/data_cleaning_v2.py available
import sys
sys.path.append('../cleaning')

import cleaning.data_cleaning_v2 as data_cleaning_v2
import os
# instantiate data cleaning object
cleaner = data_cleaning_v2.DataCleaning()

# read in data
path = "./data/scraped/fbref/match"
# read in all files in path
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
print(files)

# read in all individual files
df_list = []
for f in files:
    df_list.append(pd.read_csv(os.path.join(path, f), sep=';'))
# concatenate all files
df = pd.concat(df_list, ignore_index=True)
print(df.shape)


ModuleNotFoundError: No module named 'cleaning'