In [2]:
# utility functions for accessing transfermarkt.com

from contextlib import closing
from urllib2 import urlopen, Request

from bs4 import BeautifulSoup
from pandas import DataFrame

# transfermarkt blocks the default user agent, so requests present a browser one
useragent = ('Mozilla/5.0 (Windows NT 6.1) '
             'AppleWebKit/537.36 (KHTML, like Gecko) '
             'Chrome/41.0.2228.0 Safari/537.36')
# the "lol" path segment can be anything - transfermarkt ignores it / only uses
# it as the human-readable (semantic) part of the url
urlprefix = 'http://www.transfermarkt.com/lol/'

def getUrlByLeagueId(id, season=2015):
    """Build the overview-page URL of a league for one season.

    Args:
        id: league identifier used in the URL path (e.g. 'GB1'). Non-string
            values such as integers are accepted and formatted into the path.
        season: season identifier placed after 'saison_id/'; defaults to 2015.

    Returns:
        The URL as a string, starting with the module-level ``urlprefix``.
    """
    # %s formatting (rather than bare concatenation) lets a non-string id work
    # the same way the club and player URL builders accept one, instead of
    # raising TypeError on `str + int`.
    return urlprefix + 'startseite/wettbewerb/%s/saison_id/%s' % (id, season)

def getUrlByClubId(id, season=2015):
    """Build the overview-page URL of a club for one season.

    Args:
        id: club identifier; converted with ``str`` before being put in the path.
        season: season identifier placed after 'saison_id/'; defaults to 2015.

    Returns:
        The URL as a string, starting with the module-level ``urlprefix``.
    """
    club_path = 'startseite/verein/' + str(id)
    season_path = '/saison_id/' + str(season)
    return urlprefix + club_path + season_path

def getUrlByPlayerId(id):
    """Build the profile-page URL of a player.

    Args:
        id: player identifier; converted with ``str`` before being put in the path.

    Returns:
        The URL as a string, starting with the module-level ``urlprefix``.
    """
    player_path = 'profil/spieler/' + str(id)
    return urlprefix + player_path

def getClubsByLeagueId(id, season=2015):
    """Fetch a league page and list the clubs found in its 'yw1' element.

    Args:
        id: league identifier passed to ``getUrlByLeagueId``.
        season: season identifier passed to ``getUrlByLeagueId``; defaults to 2015.

    Returns:
        A list of dicts, one per matching table cell, with keys:
          'clubId' - the ``id`` attribute of the first link in the cell
          'name'   - the text content of the cell

    Raises:
        AttributeError: if the page has no element with id 'yw1'.
    """
    request = Request(getUrlByLeagueId(id, season), headers={'User-Agent': useragent})
    # urllib2 responses are not context managers; closing() makes sure the
    # connection is released even when parsing raises.
    with closing(urlopen(request)) as response:
        bs = BeautifulSoup(response)
    elements = bs.find(id='yw1').find_all(
        "td", class_="hauptlink no-border-links hide-for-small hide-for-pad")
    return [{'clubId': e.find("a")["id"], 'name': e.getText()} for e in elements]

def getPlayersByClubId(id, season=2015):
    """Fetch a club page and list the players found in its 'yw1' element.

    Args:
        id: club identifier passed to ``getUrlByClubId``.
        season: season identifier passed to ``getUrlByClubId``; defaults to 2015.

    Returns:
        A list of dicts, one per matching span that contains a player-profile
        link, with keys:
          'playerId' - the ``id`` attribute of that link
          'name'     - the text content of the span

    Raises:
        AttributeError: if the page has no element with id 'yw1'.
    """
    request = Request(getUrlByClubId(id, season), headers={'User-Agent': useragent})
    # urllib2 responses are not context managers; closing() makes sure the
    # connection is released even when parsing raises.
    with closing(urlopen(request)) as response:
        bs = BeautifulSoup(response)
    elements = bs.find(id='yw1').find_all("span", class_="hide-for-small")
    players = []
    for e in elements:
        # Look the profile link up once and reuse it for both the filter and
        # the id; spans without such a link are skipped.
        link = e.find("a", class_="spielprofil_tooltip")
        if link:
            players.append({'playerId': link["id"], 'name': e.getText()})
    return players

def getTransfersByPlayerId(id):
    """Fetch a player profile page and list the rows of its transfer history.

    Args:
        id: player identifier passed to ``getUrlByPlayerId``.

    Returns:
        A list of dicts, one per 'zeile-transfer' table row, with keys:
          'season', 'date'            - text of the row's first and second cell
          'mv'                        - text of the 'zelle-mw' cell
          'fee'                       - text of the 'zelle-abloese' cell
          'fromTeamId', 'toTeamId'    - ``id`` attribute of the link in the
                                        first / second team-name cell
          'fromTeamName', 'toTeamName'- text of those cells, left-stripped

    Raises:
        AttributeError: if the page has no 'transferhistorie' element, or a row
            lacks the 'zelle-mw' / 'zelle-abloese' cell.
        IndexError: if a row has fewer than two cells.
        KeyError: if a row has fewer than two team-name cells.
    """
    request = Request(getUrlByPlayerId(id), headers={'User-Agent': useragent})
    # urllib2 responses are not context managers; closing() makes sure the
    # connection is released even when parsing raises.
    with closing(urlopen(request)) as response:
        bs = BeautifulSoup(response)
    rows = bs.find(class_="transferhistorie").find_all("tr", class_="zeile-transfer")

    transfers = []
    for row in rows:
        # Read season and date straight from their cells instead of joining
        # them with a separator and splitting again, which would mis-split if
        # either text happened to contain that separator.
        cells = row.findAll("td")
        season = cells[0].getText()
        date = cells[1].getText()

        # The first team-name cell is the selling club, the second the buying one.
        team_cells = row.find_all(
            "td", class_="hauptlink no-border-links hide-for-small vereinsname")
        teams = dict(zip(['from', 'to'], team_cells))
        from_cell = teams['from']
        to_cell = teams['to']

        transfers.append({
            'season': season,
            'date': date,
            'mv': row.find("td", class_="zelle-mw").getText(),
            'fee': row.find("td", class_="zelle-abloese").getText(),
            'fromTeamId': from_cell.find("a")["id"],
            'fromTeamName': from_cell.getText().lstrip(),
            'toTeamId': to_cell.find("a")["id"],
            'toTeamName': to_cell.getText().lstrip(),
        })
    return transfers



In [3]:
# Transfer history for player id 121483, displayed as a table with one row per transfer.
DataFrame(getTransfersByPlayerId(121483))

Unnamed: 0,date,fee,fromTeamId,fromTeamName,mv,season,toTeamId,toTeamName
0,"Jul 16, 2014","16,00 Mill. €",294,Benfica,"4,00 Mill. €",14/15,13,Atlético Madrid
1,"Jun 30, 2013",End of loan,2425,Rio Ave FC,"2,40 Mill. €",12/13,294,Benfica
2,"Jul 1, 2012",Loan,294,Benfica,"1,10 Mill. €",12/13,2425,Rio Ave FC
3,"Jun 30, 2012",End of loan,2639,Leiria,"1,10 Mill. €",11/12,294,Benfica
4,"Aug 1, 2011",Loan,294,Benfica,700 Th. €,11/12,2639,Leiria
5,"Jun 30, 2011",End of loan,4750,Olhanense,750 Th. €,10/11,294,Benfica
6,"Jan 1, 2011",Loan,294,Benfica,750 Th. €,10/11,4750,Olhanense
7,"Dec 31, 2010",End of loan,1436,Beira-Mar,750 Th. €,10/11,294,Benfica
8,"Aug 1, 2010",Loan,294,Benfica,750 Th. €,10/11,1436,Beira-Mar
9,"Jul 1, 2010","1,70 Mill. €",4772,NK Olimpija,750 Th. €,10/11,294,Benfica


In [4]:
# Clubs listed for league code 'GB1' in the 2010 season, one row per club.
# To order the rows, chain .sort_values(by="clubId") onto the DataFrame; clubId is
# read from an HTML attribute, so it is presumably a string and would sort
# lexicographically rather than numerically - verify before relying on the order.
DataFrame(getClubsByLeagueId('GB1',2010))

Unnamed: 0,clubId,name
0,631,Chelsea FC
1,985,Manchester United
2,281,Manchester City
3,31,Liverpool FC
4,11,Arsenal FC
5,148,Tottenham Hotspur
6,405,Aston Villa
7,29,Everton FC
8,379,West Ham United
9,289,Sunderland AFC


In [5]:
# Players found on the page of club id 418 for the 2005 season, one row per player.
DataFrame(getPlayersByClubId(418,2005))

Unnamed: 0,name,playerId
0,Iker Casillas,3979
1,Diego López,34370
2,Jonathan Woodgate,3224
3,Iván Helguera,7514
4,Sergio Ramos,25557
5,Francisco Pavón,7517
6,Álvaro Mejía,16634
7,Roberto Carlos,7518
8,Raúl Bravo,3771
9,Míchel Salgado,7515
