# This notebook creates dataframes to be used for my own personal analysis and data visualisations.

Dataframes to create include:
* Countries
* Leagues
* Seasons
* Results
* Lineup and bench
* Events (goals, cards, corners, substitutions)
* Odds (1x2, total goals) — planned, but not covered in this notebook

# Imports

In [1]:
import json
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from clean import (
    clean_fixture_data
)
from fetch import (
    fetch_fixtures_data
)
from sportmonks_config import (
    TOP_FIVE_LEAGUE_IDS,
    YEARS_OF_INTEREST
)
from sportmonks_utility import (
    read_json
)
# Lift the column display limit so wide frames are shown without elided columns.
pd.set_option('display.max_columns', None)

# Fetch seasons data of interest

In [2]:
# Raw fixture payloads for the configured leagues and years.
# The result is used below as a mapping whose keys look like '8_2021_2022' and
# whose values are iterables of fixture dicts carrying 'lineup', 'bench',
# 'events' and 'corners' entries.
season_fixtures_data_dict = fetch_fixtures_data(
    league_ids_list=TOP_FIVE_LEAGUE_IDS,
    years_list=YEARS_OF_INTEREST,
    fixture_data='events',
)

# Get data into dataframe format

### Countries data

Data already in .csv form

In [3]:
# Countries reference table, read from the CSV that is already on disk.
countries_df = pd.read_csv('sportmonks_data/countries.csv')

In [4]:
# Preview the first three countries.
countries_df.head(3)

Unnamed: 0,id,name,image_path,extra.continent,extra.sub_region,extra.world_region,extra.fifa,extra.iso,extra.iso2,extra.longitude,extra.latitude,extra.flag,extra
0,2,Poland,https://cdn.sportmonks.com/images/countries/pn...,Europe,Eastern Europe,EMEA,POL,POL,PL,19.37776,52.14785,"<svg xmlns=""http://www.w3.org/2000/svg"" width=...",
1,5,Brazil,https://cdn.sportmonks.com/images/countries/pn...,Americas,South America,AMER,BRA,BRA,BR,-52.973118,-10.810452,"<svg version=""1"" xmlns=""http://www.w3.org/2000...",
2,11,Germany,https://cdn.sportmonks.com/images/countries/pn...,Europe,Western Europe,EMEA,GER,DEU,DE,10.382203,51.202465,"<svg xmlns=""http://www.w3.org/2000/svg"" width=...",


### Leagues

Data already in .csv form

In [27]:
# Leagues reference table, read from the CSV that is already on disk.
leagues_df = pd.read_csv('sportmonks_data/leagues.csv')

In [28]:
# The whole frame is displayed here rather than a head() preview.
leagues_df

Unnamed: 0,id,active,type,legacy_id,country_id,logo_path,name,is_cup,is_friendly,current_season_id,current_round_id,current_stage_id,live_standings,coverage.predictions,coverage.topscorer_goals,coverage.topscorer_assists,coverage.topscorer_cards
0,8,True,domestic,29,462,https://cdn.sportmonks.com/images/soccer/leagu...,Premier League,False,False,19734,274675.0,77457864.0,True,True,True,True,True
1,72,True,domestic,1,38,https://cdn.sportmonks.com/images/soccer/leagu...,Eredivisie,False,False,19726,274589.0,77457832.0,True,True,True,True,True
2,82,True,domestic,4,11,https://cdn.sportmonks.com/images/soccer/leagu...,Bundesliga,False,False,19744,274912.0,77457883.0,True,True,True,True,True
3,271,True,domestic,43,320,https://cdn.sportmonks.com/images/soccer/leagu...,Superliga,False,False,19686,274241.0,77457696.0,True,True,True,True,True
4,301,True,domestic,47,17,https://cdn.sportmonks.com/images//soccer/leag...,Ligue 1,False,False,19745,274942.0,77457884.0,True,True,True,True,True
5,384,True,domestic,22,251,https://cdn.sportmonks.com/images//soccer/leag...,Serie A,False,False,19806,276004.0,77458057.0,True,True,True,True,True
6,462,True,domestic,16,20,https://cdn.sportmonks.com/images//soccer/leag...,Liga Portugal,False,False,19896,277459.0,77458262.0,True,True,True,True,True
7,501,True,domestic,66,1161,https://cdn.sportmonks.com/images/soccer/leagu...,Premiership,False,False,19735,274721.0,77457866.0,True,True,True,True,True
8,564,True,domestic,19,32,https://cdn.sportmonks.com/images/soccer/leagu...,La Liga,False,False,19799,275889.0,77458033.0,True,True,True,True,True
9,600,True,domestic,46,404,https://cdn.sportmonks.com/images/soccer/leagu...,Super Lig,False,False,19900,277530.0,77458273.0,True,True,True,True,True


### Seasons

Load json file

In [7]:
# Load the raw seasons file. The result is iterated below as a collection of
# season dicts that each carry 'rounds' and 'stages' entries.
seasons_json = read_json('sportmonks_data/seasons.json')

Get today's date (as a `YYYY-MM-DD` string) so that seasons which have not finished yet can be identified in the next step.

In [8]:
# Today's date as an ISO 'YYYY-MM-DD' string; compared against round end dates below.
now = datetime.now().date().isoformat()

Add `season_start` and `season_end` fields to each season's JSON record.

The start date is the start of the first round and the end date is the end of the last round.

If a season has not finished yet (its last round ends today or later), `season_end` is set to NaN.

In [9]:
# Attach 'season_start' / 'season_end' to every season record and drop the
# nested 'rounds' / 'stages' payloads so the records flatten into a DataFrame.
for season in seasons_json:
    # pop() with a default keeps this cell safe to re-run: on a second pass the
    # keys are already gone and the dates computed earlier are left untouched.
    rounds = season.pop('rounds', None)
    season.pop('stages', None)

    if not rounds:
        # No round data (or record already processed): keep any existing values,
        # otherwise fall back to NaN so both keys are always present.
        season.setdefault('season_start', np.nan)
        season.setdefault('season_end', np.nan)
        continue

    # Rounds are taken in the order supplied: the first one opens the season,
    # the last one closes it.
    season['season_start'] = rounds[0]['start']
    season_end = rounds[-1]['end']

    # Dates are 'YYYY-MM-DD' strings, so string comparison is chronological.
    # A missing (non-string) end date, or one that is today or later, becomes NaN.
    if isinstance(season_end, str) and season_end < now:
        season['season_end'] = season_end
    else:
        season['season_end'] = np.nan

In [10]:
# One row per season record, including the season_start / season_end fields.
seasons_df = pd.DataFrame(seasons_json)

# Preview the first three seasons.
seasons_df.head(3)

Unnamed: 0,id,name,league_id,is_current_season,current_round_id,current_stage_id,season_start,season_end
0,1586,2005/2006,8,False,,,2005-08-13,2006-05-07
1,8,2006/2007,8,False,,,2006-08-19,2007-05-13
2,14,2007/2008,8,False,,,2007-08-11,2008-05-11


### Fixtures (results data)

Build the fixtures (results) dataframe from the raw fixture data.

Using the season fixtures data loaded above, each fixture's JSON is cleaned and appended to a list, from which the dataframe is then created.

In [11]:
# Flatten every season's fixtures into a single list of cleaned records, each
# tagged with the season it belongs to.
clean_fixtures_list = []
for season_key, season_fixtures in season_fixtures_data_dict.items():
    # The season label is the trailing nine characters of the key,
    # e.g. '8_2021_2022' -> '2021_2022'.
    season_label = season_key[-9:]
    for raw_fixture in season_fixtures:
        cleaned_fixture = clean_fixture_data(raw_fixture)
        cleaned_fixture['season_name'] = season_label
        clean_fixtures_list.append(cleaned_fixture)

In [12]:
# One row per cleaned fixture; preview the first three.
fixtures_df = pd.DataFrame(clean_fixtures_list)
fixtures_df.head(3)

Unnamed: 0,fixture_id,league_id,season_id,home_id,away_id,home_score,away_score,ht_score,date_time,home_name,away_name,season_name
0,1710802,8,6397,19,42,4,3,2-2,2017-08-11 19:45:00,Arsenal,Leicester City,2017_2018
1,1710810,8,6397,25,8,3,3,2-1,2017-08-12 12:30:00,Watford,Liverpool,2017_2018
2,1710804,8,6397,18,27,2,3,0-3,2017-08-12 15:00:00,Chelsea,Burnley,2017_2018


### Lineup and bench information

Using the season fixtures data loaded above, the lineup and bench entries of every fixture are collected into a single list, from which the dataframe is then created.

In [13]:
# Collect one record per listed player per fixture, flagging starters
# ('starting' = True) versus substitutes ('starting' = False).
lineup_bench_data_list = []

for season_event_data in season_fixtures_data_dict.values():
    for fixture_json in season_event_data:
        # Build new dicts instead of writing 'starting' into the fetched payload,
        # so the raw fixture data is left unmodified for the other cells.
        lineup_bench_data_list.extend(
            {**starter, 'starting': True} for starter in fixture_json['lineup']
        )
        lineup_bench_data_list.extend(
            {**substitute, 'starting': False} for substitute in fixture_json['bench']
        )

In [14]:
# json_normalize flattens nested dicts into dotted column names
# (the preview below shows columns such as 'stats.shots.shots_total').
lineup_bench_df = pd.json_normalize(lineup_bench_data_list)

In [15]:
# Preview the first three player records.
lineup_bench_df.head(3)

Unnamed: 0,team_id,fixture_id,player_id,player_name,number,position,additional_position,formation_position,posx,posy,captain,type,starting,stats.shots.shots_total,stats.shots.shots_on_goal,stats.goals.scored,stats.goals.assists,stats.goals.conceded,stats.goals.owngoals,stats.goals.team_conceded,stats.fouls.drawn,stats.fouls.committed,stats.cards.yellowcards,stats.cards.redcards,stats.cards.yellowredcards,stats.passing.total_crosses,stats.passing.crosses_accuracy,stats.passing.passes,stats.passing.accurate_passes,stats.passing.passes_accuracy,stats.passing.key_passes,stats.dribbles.attempts,stats.dribbles.success,stats.dribbles.dribbled_past,stats.duels.total,stats.duels.won,stats.other.aerials_won,stats.other.punches,stats.other.offsides,stats.other.saves,stats.other.inside_box_saves,stats.other.pen_scored,stats.other.pen_missed,stats.other.pen_saved,stats.other.pen_committed,stats.other.pen_won,stats.other.hit_woodwork,stats.other.tackles,stats.other.blocks,stats.other.interceptions,stats.other.clearances,stats.other.dispossesed,stats.other.minutes_played,stats.rating
0,19,1710802,84.0,Petr Čech,33,G,,1.0,,,True,lineup,True,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,16.0,80.0,0.0,0.0,0.0,0.0,1.0,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,90.0,6.0
1,19,1710802,1901.0,Rob Holding,16,D,,2.0,,,False,lineup,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,59.0,48.0,83.0,0.0,0.0,0.0,0.0,8.0,5.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,10.0,0.0,67.0,6.7
2,19,1710802,1303.0,Nacho Monreal,18,D,,3.0,,,False,lineup,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,53.0,94.0,0.0,0.0,0.0,0.0,8.0,5.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,5.0,7.0,0.0,90.0,7.0


### Goals, cards, subs, corners data

All match events (goals, cards, substitutions) plus corners, which are tagged with `type = 'corner'`, are combined into one dataframe.

In [16]:
# Gather every event and every corner from all fetched fixtures into one list,
# then build a single events DataFrame from it.
all_event_list = []
for season_event_data in season_fixtures_data_dict.values():
    for fixture_json in season_event_data:
        all_event_list.extend(fixture_json['events'])
        # Corners are tagged with type 'corner' so they can be told apart from
        # the other events. Copies are used so the fetched payload is not
        # modified.
        all_event_list.extend(
            {**corner, 'type': 'corner'} for corner in fixture_json['corners']
        )
all_event_df = pd.DataFrame(all_event_list)

In [17]:
# Preview the first three events.
all_event_df.head(3)

Unnamed: 0,id,team_id,type,var_result,fixture_id,player_id,player_name,related_player_id,related_player_name,minute,extra_minute,reason,injuried,result,on_pitch,comment
0,1710802001,19,goal,,1710802,95334.0,A. Lacazette,1891.0,Mohamed Elneny,2,,,,1-0,True,
1,1710802002,42,goal,,1710802,1212.0,S. Okazaki,2884.0,H. Maguire,5,,,,1-1,True,
2,1710802003,42,goal,,1710802,1182.0,J. Vardy,174.0,M. Albrighton,29,,,,1-2,True,


# Save .csv files

In [21]:
# Write every dataframe to sportmonks_data/cleaned/<name>.csv without the index.
# to_csv does not create missing directories, so make sure the target exists.
cleaned_dir = Path('sportmonks_data/cleaned')
cleaned_dir.mkdir(parents=True, exist_ok=True)

frames_to_save = {
    'countries': countries_df,
    'leagues': leagues_df,
    'seasons': seasons_df,
    'fixtures': fixtures_df,
    'lineup_bench': lineup_bench_df,
    'events': all_event_df,
}
for file_stem, frame in frames_to_save.items():
    frame.to_csv(cleaned_dir / f'{file_stem}.csv', index=False)

# Conclusion

This notebook has created dataframes for:
* countries
* leagues
* seasons
* fixtures
* lineup and bench data
* event data per fixture

This notebook does not include cleaning odds data. (Sportmonks only has odds data from 2017/2018 onwards)

In [29]:
### Reloading the data saved above
# Kept commented out on purpose: uncomment these lines to load the cleaned CSVs
# from disk.

# countries_df = pd.read_csv('sportmonks_data/cleaned/countries.csv')
# leagues_df = pd.read_csv('sportmonks_data/cleaned/leagues.csv')
# seasons_df = pd.read_csv('sportmonks_data/cleaned/seasons.csv')
# fixtures_df = pd.read_csv('sportmonks_data/cleaned/fixtures.csv')
# lineup_bench_df = pd.read_csv('sportmonks_data/cleaned/lineup_bench.csv')
# all_event_df = pd.read_csv('sportmonks_data/cleaned/events.csv')