# La Liga Graph Analysis

In [3]:
import json
import os

import networkx as nx
import pandas as pd
import requests

### Competitions

We need to obtain all of the seasons available for La Liga, together with their IDs.

In [4]:
# Download the StatsBomb open-data competition index. `competitions` is kept
# as the raw `requests.Response` because the next cell calls
# `competitions.json()` on it.
competitions = requests.get(
    'https://raw.githubusercontent.com/statsbomb/open-data/master/data/competitions.json',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Surface HTTP errors (404, rate limiting, ...) here rather than as a
# confusing JSON decode error further down.
competitions.raise_for_status()

In [5]:
# Map every La Liga season id to its display name (e.g. 37 -> '2004/2005').
seasons = {
    entry['season_id']: entry['season_name']
    for entry in competitions.json()
    if entry['competition_name'] == 'La Liga'
}

Get all of the matches for each season to build a table of the data available per game. This table does not include the events data.

In [58]:
# Competition id used in the StatsBomb `matches/<competition>/<season>.json`
# path. NOTE(review): presumably La Liga's `competition_id` in
# competitions.json -- confirm before reusing for another league.
LA_LIGA_COMPETITION_ID = 11

# match_id -> flat dict of match-level metadata (no event data).
matches = {}

for num_season in seasons:

    url = (
        'https://raw.githubusercontent.com/statsbomb/open-data/master/data/matches/'
        f'{LA_LIGA_COMPETITION_ID}/{num_season}.json'
    )
    season = requests.get(url, timeout=30)
    # Fail loudly on a missing season file instead of on a JSON decode error.
    season.raise_for_status()

    for match in season.json():
        # The referee object may be absent for some matches; store None for
        # its fields rather than aborting the whole download with a KeyError.
        referee = match.get('referee') or {}

        matches[match['match_id']] = {
            'date': match['match_date'],
            'season_id': match['season']['season_id'],
            'season_name': match['season']['season_name'],
            'home_team_id': match['home_team']['home_team_id'],
            'home_team_name': match['home_team']['home_team_name'],
            'away_team_id': match['away_team']['away_team_id'],
            'away_team_name': match['away_team']['away_team_name'],
            'home_score': match['home_score'],
            'away_score': match['away_score'],
            'match_week': match['match_week'],
            'ref_id': referee.get('id'),
            'ref_name': referee.get('name'),
            # Location of the event-level data for this match.
            'match_url': f'https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/{match["match_id"]}.json',
        }

2015/2016
2014/2015
2013/2014
2012/2013
2011/2012
2010/2011
2009/2010
2008/2009
2007/2008
2006/2007
2005/2006
2004/2005


In [61]:
# One row per match, indexed by match_id. `from_dict(orient='index')` builds
# the frame in that orientation directly, so numeric columns keep a numeric
# dtype instead of being coerced to `object` by a transpose.
# `sort_index()` orders the rows by match_id.
matches_df = pd.DataFrame.from_dict(matches, orient='index').sort_index()

In [62]:
# Preview the first five matches to sanity-check the table.
matches_df.head(n=5)

Unnamed: 0,date,season_id,season_name,home_team_id,home_team_name,away_team_id,away_team_name,home_score,away_score,match_week,ref_id,ref_name,match_url
68313,2004-10-24,37,2004/2005,217,Barcelona,422,Osasuna,3,0,8,994,Vicente José Lizondo Cortés,https://raw.githubusercontent.com/statsbomb/op...
68314,2004-12-04,37,2004/2005,217,Barcelona,223,Málaga,4,0,14,993,José Omar Losantos,https://raw.githubusercontent.com/statsbomb/op...
68315,2004-12-21,37,2004/2005,217,Barcelona,221,Levante,2,1,17,222,David Fernández,https://raw.githubusercontent.com/statsbomb/op...
68316,2005-05-01,37,2004/2005,217,Barcelona,608,Albacete,2,0,34,407,Carlos Velasco Carballo,https://raw.githubusercontent.com/statsbomb/op...
68317,2005-10-01,38,2005/2006,217,Barcelona,395,Real Zaragoza,2,2,6,996,Carlos Megía Dávila,https://raw.githubusercontent.com/statsbomb/op...


Build a function to extract all of the pass information for a given game.

In [7]:
def pass_extracter(game_json):
    """
    Extract the completed passes of one match from StatsBomb event data.

    Parameters
    ----------
    game_json : response-like object or list of dict
        Either an object whose ``.json()`` method returns the list of event
        dicts for a match (e.g. a ``requests.Response``), or that
        already-parsed list itself.

    Returns
    -------
    dict
        Maps each pass event ``id`` to a flat dict with the keys
        ``timestamp``, ``minute``, ``second``, ``team_id``, ``team_name``,
        ``player_id_made_pass``, ``player_name_made_pass``,
        ``x_loc_made_pass``, ``y_loc_made_pass``,
        ``player_id_received_pass``, ``player_name_received_pass``,
        ``x_loc_received_pass``, ``y_loc_received_pass`` and ``length_pass``.

    Notes
    -----
    * Passes that carry an ``outcome`` key are skipped; in the StatsBomb
      format ``outcome`` is only present on passes that were not completed.
    * Passes without a ``recipient`` are skipped as well, since they cannot
      form an edge between two players.
    """

    type_id = 30  # event type id that identifies a pass
    # Accept both a response-like object and an already-decoded event list.
    events = game_json.json() if hasattr(game_json, 'json') else game_json

    passes = {}

    for play in events:
        if play['type']['id'] != type_id:
            continue

        pass_info = play['pass']
        if 'outcome' in pass_info:
            continue

        recipient = pass_info.get('recipient')
        if recipient is None:
            continue

        # Attribute the pass to the team of the event itself. `possession_team`
        # describes the possession sequence and can differ from the team of
        # the player who made the pass; it is only a fallback for event
        # dicts that do not carry a `team` entry.
        team = play.get('team') or play['possession_team']

        passes[play['id']] = {
            'timestamp': play['timestamp'],
            'minute': play['minute'],
            'second': play['second'],
            'team_id': team['id'],
            'team_name': team['name'],
            'player_id_made_pass': play['player']['id'],
            'player_name_made_pass': play['player']['name'],
            'x_loc_made_pass': play['location'][0],
            'y_loc_made_pass': play['location'][1],
            'player_id_received_pass': recipient['id'],
            'player_name_received_pass': recipient['name'],
            'x_loc_received_pass': pass_info['end_location'][0],
            'y_loc_received_pass': pass_info['end_location'][1],
            'length_pass': pass_info['length'],
        }

    return passes

In [181]:
# player_id -> player_name for every player seen in a completed pass.
players = {}

# Create the output folders up front so the (slow) download loop does not
# fail on its first write on a fresh checkout.
os.makedirs('data/passes', exist_ok=True)
os.makedirs('data/graphs', exist_ok=True)

for match_id, match_row in matches_df.iterrows():

    url = match_row['match_url']
    home_team_id = match_row['home_team_id']
    away_team_id = match_row['away_team_id']

    game_json = requests.get(url, timeout=30)
    # Stop on a missing/failed events file instead of on a JSON decode error.
    game_json.raise_for_status()

    passes = pass_extracter(game_json)

    # One pass network per team and match.
    # NOTE(review): nx.Graph is undirected and keeps a single edge per player
    # pair, so repeated passes between two players overwrite the edge and the
    # stored weight is the length of the last such pass -- confirm this is
    # the intended network definition.
    home_graph = nx.Graph()
    away_graph = nx.Graph()

    for play in passes.values():
        passer_id = play['player_id_made_pass']
        receiver_id = play['player_id_received_pass']

        if play['team_id'] == home_team_id:
            team_graph = home_graph
        elif play['team_id'] == away_team_id:
            team_graph = away_graph
        else:
            team_graph = None

        if team_graph is not None:
            # The attribute must be spelled `weight`: that is the key
            # NetworkX algorithms look up for weighted computations.
            team_graph.add_edge(passer_id, receiver_id, weight=play['length_pass'])

        # Keep the first name seen for each player id.
        players.setdefault(passer_id, play['player_name_made_pass'])
        players.setdefault(receiver_id, play['player_name_received_pass'])

    with open(f'data/passes/{match_id}_passes.json', 'w') as json_file:
        json.dump(passes, json_file)

    nx.write_gml(home_graph, f'data/graphs/{match_id}_{home_team_id}.gml')
    nx.write_gml(away_graph, f'data/graphs/{match_id}_{away_team_id}.gml')

with open('data/players.json', 'w') as json_file:
    json.dump(players, json_file)

In [183]:
# Persist the match table next to the other outputs. Create the folder first
# so this cell also works when `data/` does not exist yet.
os.makedirs('data', exist_ok=True)
matches_df.to_csv('data/matches_df.csv')