In [1]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd

def collect_ratings(soup, team):
    """Collect the rating rows for one side of a match page.

    Parameters
    ----------
    soup
        Parsed match page; only its ``find_all`` method is used.
    team : str
        Either ``'home'`` or ``'away'``.

    Returns
    -------
    list
        One ``parse_ratings`` result per player: every link of the side's
        lineup grid first, followed by the side's substitutes.

    Raises
    ------
    ValueError
        If ``team`` is neither ``'home'`` nor ``'away'``.
    """
    grid_offset = {'home': 0, 'away': 1}
    if team not in grid_offset:
        # Previously an unknown side left the index unbound and surfaced as
        # an UnboundLocalError further down; fail with a clear message instead.
        raise ValueError("team must be 'home' or 'away', got {!r}".format(team))
    i = grid_offset[team]

    # Scan the page once; the lineup and substitution grids are both taken
    # from this single result list.
    grids = soup.find_all('div', {'class': 'kick__data-grid__main'})
    lineup_grid = grids[i]
    # The substitution grid is read four grids after the lineup grid
    # (presumably a fixed page layout -- TODO confirm against the site).
    subs_grid = grids[i + 4]

    lineup = lineup_grid.find_all('a')
    # Only every second link / time entry is kept, starting with the first
    # (presumably each substitution lists two players and the first is the
    # one coming on -- TODO confirm).
    subs = subs_grid.find_all('a')[0::2]
    subs_time = subs_grid.find_all('div', {'class': 'kick__substitutions__time'})[0::2]

    ratings = [parse_ratings(player) for player in lineup]
    ratings.extend(parse_ratings(player, time) for player, time in zip(subs, subs_time))

    return ratings

def parse_ratings(player, time=0):
    """Turn one player link into a ``[name, rating, start]`` row.

    Parameters
    ----------
    player
        Element exposing ``get('href')`` and ``get_text()``.
    time
        ``0`` (the default) or an element exposing ``get_text()`` whose text
        starts with the minute, e.g. ``"46'"``.

    Returns
    -------
    list
        ``[name, rating, start]`` where ``name`` is the second
        ``'/'``-separated piece of the link's href, ``rating`` is the last
        three characters of the stripped link text (or ``''`` when the text
        contains no digit), and ``start`` is ``time`` itself when it equals
        ``0``, otherwise the integer before the first space of the time text
        with apostrophes removed.
    """
    name = player.get('href').split('/')[1]

    label = player.get_text().strip()
    # A label without any digit carries no rating at all.
    rating = label[-3:] if any(map(str.isdigit, label)) else ''

    # No time element supplied: hand the zero-like value back unchanged.
    if time == 0:
        return [name, rating, time]

    minute_text = time.get_text().replace("'", "")
    return [name, rating, int(minute_text.split(' ')[0])]


In [2]:
### Find match links
# Builds `urls`: season slug (e.g. '2010-11') -> list of relative links to the
# 'schema' page of every match found on that season's fixture page.

# First and last season start years to scrape (inclusive).
FIRST_SEASON_YEAR = 2010
LAST_SEASON_YEAR = 2020

urls = dict()

# One session serves every season page and is closed by the context manager;
# opening a fresh session per season left each of them unclosed.
with HTMLSession() as session:
    for year in range(FIRST_SEASON_YEAR, LAST_SEASON_YEAR + 1):

        # Season slug: start year plus the last two digits of the next year.
        season = str(year) + '-' + str(year+1)[-2:]

        # NOTE(review): the trailing '/-1' presumably selects all matchdays
        # of the season at once -- confirm against the site.
        url = 'https://www.kicker.de/bundesliga/spieltag/' + season + '/-1'
        resp = session.get(url)
        soup = BeautifulSoup(resp.html.html, features='lxml')

        fixture = soup.find_all('div', {'class': 'kick__v100-gameList__gameRow'})
        # Take the first scoreboard link of each game row and point it at the
        # 'schema' page instead of the 'analyse' page.
        urls[season] = [
            row.find_all('a', {'class': 'kick__v100-scoreBoard kick__v100-scoreBoard--standard'})[0]
               .get('href')
               .replace('analyse', 'schema')
            for row in fixture
        ]


In [3]:
### Get ratings
# Scrapes every match page listed in `urls`, builds one row per rated player
# and exports the combined table plus the list of distinct team names.

# Per-match frames are gathered here and concatenated once after the loop.
# DataFrame.append no longer exists in pandas 2.0+, and growing `df_all`
# inside the loop re-copied every previously scraped row on each match.
match_frames = []

# One session is reused for all match pages and closed when scraping ends,
# instead of opening a new, never-closed session per match.
with HTMLSession() as session:

    for season, season_urls in urls.items():

        print(season)

        for url in season_urls:

            ### Scrape data

            resp = session.get('https://www.kicker.de'+url)
            soup = BeautifulSoup(resp.html.html, features='lxml')


            ### Parse data

            # First link of the scoreboard info box, with '/spieltag' and the
            # leading '/' removed, split into competition / season / round.
            info_href = soup.find_all('div', {'class': 'kick__v100-scoreboardInfo'})[0].find_all('a')[0].get('href')
            match_info = info_href.replace('/spieltag', '')[1:].split('/')

            # Score text split on ':' -> [home score, away score].
            scores = soup.find_all('div', {'class': 'kick__v100-scoreBoard__scoreHolder'})[0].get_text().replace('\n', '').split(':')

            # Team slug = second '/'-separated piece of each team link;
            # index 0 is used as the home side, index 1 as the away side.
            teams = [link.get('href').split('/')[1] for link in soup.find_all('a', {'class': 'kick__v100-gameCell__team'})]


            ### Create dataframe

            side_frames = []
            for idx, side in enumerate(['home', 'away']):
                side_df = pd.DataFrame(collect_ratings(soup, side), columns=['player_name', 'rating', 'start'])
                side_df['team'] = teams[idx]
                side_df['score'] = scores[idx]
                side_df['home'] = 1 if side == 'home' else 0
                side_frames.append(side_df)

            df_both = pd.concat(side_frames, ignore_index=True)

            df_both['competition'] = match_info[0]
            df_both['season'] = match_info[1]
            df_both['round'] = match_info[2]

            # Season slug used to request the page, kept next to the season
            # parsed from the page itself.
            df_both['season2'] = season

            match_frames.append(df_both)

# pd.concat rejects an empty list, so fall back to the empty frame the
# original accumulator started from when nothing was scraped.
df_all = pd.concat(match_frames, ignore_index=True) if match_frames else pd.DataFrame()


### Export data
# NOTE: to_csv does not create missing directories; 'input/' must already exist.
df_all.to_csv('input/ratings_bundesliga.csv', index=False)
df_all[['team']].rename(columns={'team': 'team_ratings'}).groupby('team_ratings').count().to_csv('input/team_names_ratings.csv')


2010-11
2011-12
2012-13
2013-14
2014-15
2015-16
2016-17
2017-18
2018-19
2019-20
2020-21
