In [1]:
# Dependencies
import requests
import os
from os.path  import basename
from bs4 import BeautifulSoup
import pandas as pd
from glob import glob
import datetime
from datetime import datetime
import time
from random import randint
import unicodedata


In [2]:
# Load the league catalogue that drives the scrape and list its column dtypes
# (league_ID, country, tier, league_name, league_link — see the output below).
leagues_df = pd.read_csv("data/leagues_data.csv")
leagues_df.dtypes

league_ID       int64
country        object
tier            int64
league_name    object
league_link    object
dtype: object

In [3]:
# Display the league catalogue: one row per league, with the URL that gets scraped.
leagues_df

Unnamed: 0,league_ID,country,tier,league_name,league_link
0,1,BRAZIL,1,brasileiro_serie_a,https://www.transfermarkt.com/campeonato-brasi...
1,2,BRAZIL,2,brasileiro_serie_b,https://www.transfermarkt.com/campeonato-brasi...
2,3,UNITED_STATES,1,major_league_soccer,https://www.transfermarkt.com/major-league-soc...
3,4,UNITED_STATES,2,USL_CHAMPIONSHIP,https://www.transfermarkt.com/usl-pro/startsei...


In [4]:
# Request headers sent with every requests.get call in this notebook.
# Per the original author, the site cannot be scraped without a browser-like 'User-Agent'.
# More info about user agents: 'https://webscraping.com/blog/User-agents/'
# You can find your own User-Agent at 'http://whatsmyuseragent.com/'
headers = {'User-Agent': 
           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}

In [5]:
# Look up the EUR -> USD exchange rate from the Google currency widget and keep it,
# rounded to 2 decimals, in the module-level `curr_value` used by get_value_us.
convert_url = "https://www.google.com/search?q=euro+to+dollar&oq=eur&aqs=chrome.1.69i57j35i39j0j46j0l2j69i61l2.2373j1j4&sourceid=chrome&ie=UTF-8"
# A timeout keeps the cell from hanging forever on a stalled connection.
html = requests.get(convert_url, headers=headers, timeout=30)
soup = BeautifulSoup(html.content, 'html.parser')
# The widget is located by a generated CSS class, so fail with a clear message
# (instead of "'NoneType' object is not subscriptable") when the markup differs.
rate_tag = soup.find('div', class_= "b1hJbf")
if rate_tag is None or not rate_tag.has_attr("data-exchange-rate"):
    raise RuntimeError(
        "could not find the exchange-rate element in the search result page "
        f"(HTTP {html.status_code}); the page markup may have changed"
    )
curr_value = round(float(rate_tag["data-exchange-rate"]),2)

def get_value_us(x, rate=None):
    """Convert a market-value string such as "€49.20m" into millions of US dollars.

    The first character is treated as the currency symbol and dropped; the
    trailing unit decides the scale:

    * ``"bn"`` -> billions  (multiplied by 1000 to express millions)
    * ``"m"``  -> millions
    * ``"k"``  -> thousands (divided by 1000)
    * anything else is handled as a three-character thousands unit such as
      ``"Th."`` (divided by 1000).

    Parameters
    ----------
    x : str
        Market value text; surrounding whitespace is ignored.
    rate : float, optional
        Multiplier applied to the parsed number. Defaults to the module-level
        ``curr_value`` exchange rate.

    Returns
    -------
    float
        Value in millions, rounded to 2 decimals.

    Raises
    ------
    ValueError
        If the text is empty or its numeric part cannot be parsed.
    """
    if rate is None:
        rate = curr_value

    text = x.strip()
    if not text:
        raise ValueError("empty market value")

    # "bn" must be tested before the generic fallback, otherwise a billions
    # value would be sliced and scaled as if it were thousands.
    if text.endswith("bn"):
        return round(float(text[1:-2]) * rate * 1000, 2)
    if text.endswith("m"):
        return round(float(text[1:-1]) * rate, 2)
    if text.endswith("k"):
        return round((float(text[1:-1]) * rate) / 1000, 2)
    # Three-character thousands unit, e.g. "Th."
    return round((float(text[1:-3]) * rate) / 1000, 2)


In [6]:
def strip_accents(text):
    """Return `text` with accents removed and surrounding whitespace trimmed.

    The text is decomposed with Unicode NFD normalisation and then encoded to
    ASCII with errors ignored, so combining marks (and any other non-ASCII
    character) are dropped from the result.
    """
    # Python 2 compatibility: `unicode` does not exist on Python 3, where the
    # NameError is swallowed and the text is used unchanged.
    try:
        text = unicode(text, 'utf-8')
    except NameError:
        pass

    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    cleaned = ascii_only.decode("utf-8")
    return str(cleaned.strip())

In [12]:
def fix_heights(df):
    """Fill missing player heights in place with the mean height of comparable players.

    For each row without a usable height, the first available of these means
    (each rounded to 2 decimals, computed from the known heights only) is used:

    1. players with the same nationality and position,
    2. players with the same nationality and field position,
    3. players with the same position.

    Rows for which none of the groups has a known height stay missing (NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain height, nat, position and field_position columns. Column
        names are matched case-insensitively, so both ``height``/``nat`` and the
        ``Height``/``Nat`` headers produced by scrape_team_data are accepted.
        Non-numeric heights (for example the empty string the scraper stores
        when parsing fails) are treated as missing.

    Returns
    -------
    None
        The height column of `df` is overwritten with the numeric, filled values.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    def _column(name):
        # Prefer an exact match, then fall back to a case-insensitive one.
        if name in df.columns:
            return name
        for col in df.columns:
            if str(col).lower() == name:
                return col
        raise KeyError(name)

    height_col = _column("height")
    nat_col = _column("nat")
    position_col = _column("position")
    field_col = _column("field_position")

    # Coerce so that "" and other unparsable entries count as missing.
    known = pd.to_numeric(df[height_col], errors="coerce")
    filled = known.copy()

    fallbacks = (
        [nat_col, position_col],
        [nat_col, field_col],
        [position_col],
    )
    for keys in fallbacks:
        missing = filled.isna()
        if not missing.any():
            break
        # Group means always come from the originally known heights, never
        # from values filled in by an earlier fallback.
        group_mean = known.groupby([df[key] for key in keys]).transform("mean").round(2)
        filled = filled.where(~missing, group_mean)

    df[height_col] = filled

In [7]:
def scrape_league_data(leagues_df):
    """Scrape the club table of every league and export all teams to one CSV.

    For each row of `leagues_df` the page at ``league_link`` is downloaded (using
    the module-level ``headers``), the ``<table class="items">`` rows with class
    ``odd``/``even`` are parsed, and one record per club is collected. Team ids
    are assigned sequentially starting at 1 across all leagues.

    Parameters
    ----------
    leagues_df : pandas.DataFrame
        Needs the columns ``league_link``, ``league_name``, ``tier``,
        ``country`` and ``league_ID``.

    Returns
    -------
    None
        The result is written to ``data/teams_trmk.csv`` with the columns
        team_ID, league_ID, club, squad, foreigners, avg_market_value_m,
        total_MV_m, Logo_img and link_page.

    Raises
    ------
    ValueError
        If a league page has no ``items`` table, or a market value cannot be
        parsed by get_value_us.
    """
    columns = ["team_ID", "league_ID", "club", "squad", "foreigners",
               "avg_market_value_m", "total_MV_m", 'Logo_img', "link_page"]
    records = []
    team_id = 1

    for _, row in leagues_df.iterrows():
        url = row["league_link"]
        league_name = row["league_name"]
        tier = row["tier"]
        country = row["country"]
        league_id = row["league_ID"]
        print(f"scraping: {country}_{tier}_{league_name}")

        html = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(html.content, 'html.parser')
        htmltable = soup.find('table', class_="items")
        if htmltable is None:
            # Fail with the URL instead of an AttributeError on None.
            raise ValueError(f"no 'items' table found at {url} (HTTP {html.status_code})")

        for result in htmltable.find_all("tr", class_=["odd", "even"]):
            features = result.find_all("td")

            # Turn the club's overview link into its detailed squad ("kader") page.
            href = result.find("a", href=True)["href"]
            link = ("https://www.transfermarkt.com" + href + "/plus/1").replace("startseite", "kader")

            # Drop the cache-busting query string and swap the tiny crest for the header-sized one.
            logo = result.find("img", src=True)["src"].split("?")[0].replace("tiny", "header")

            total_MV = get_value_us(features[6].text)
            avg_MV = get_value_us(features[7].text)

            records.append({
                "team_ID": team_id,
                "league_ID": league_id,
                "club": strip_accents(features[1].text),
                "squad": features[3].text,
                "foreigners": features[5].text,
                "avg_market_value_m": avg_MV,
                "total_MV_m": total_MV,
                "Logo_img": logo,
                "link_page": link,
            })
            team_id += 1

        # Pause between league requests; parsing table rows makes no request,
        # so sleeping per row only slowed the run down.
        time.sleep(2)

    # Create a DataFrame and export to a .csv file
    df = pd.DataFrame(records, columns=columns)
    df['league_ID'] = df['league_ID'].astype(int)
    df['team_ID'] = df['team_ID'].astype(int)

    df.to_csv('data/teams_trmk.csv', index=False)




In [14]:
def scrape_team_data(teams_df):
    """Scrape the squad page of every team in a teams CSV and export all players.

    Parameters
    ----------
    teams_df : str or path-like
        Despite the name, this is the *path* of a CSV file (it is passed to
        ``pd.read_csv``). The file must provide ``team_ID`` and ``link_page``
        columns, as written by scrape_league_data.

    Returns
    -------
    None
        Players are written to ``data/players_trmk.csv`` with the columns
        players_ID, team_ID, name, position, field_position, Age, Nat, Height,
        foot, dt_joined, prev_team, contract_expires, market_value and
        player_page. Player ids are sequential, starting at 1, across all teams.

    Raises
    ------
    ValueError
        If a squad page has no ``items`` table.

    Notes
    -----
    Fields that cannot be parsed fall back to a placeholder instead of aborting
    the scrape: ``""`` for age, height and both dates, ``"N.A."`` for the
    previous team and ``0`` for the market value.
    """
    # Detailed position -> coarse field position; anything not listed is "GLK".
    field_by_position = {
        "Centre-Back": "DEF", "Left-Back": "DEF", "Right-Back": "DEF", "Defender": "DEF",
        "Defensive Midfield": "MID", "Central Midfield": "MID", "Right Midfield": "MID",
        "Left Midfield": "MID", "Attacking Midfield": "MID", "Midfield": "MID",
        "Left Winger": "ATT", "Right Winger": "ATT", "Centre-Forward": "ATT",
        "Second Striker": "ATT", "Forward": "ATT",
    }
    columns = ["players_ID", "team_ID", "name", "position", "field_position", "Age", "Nat",
               "Height", "foot", 'dt_joined', "prev_team", "contract_expires",
               "market_value", "player_page"]
    records = []
    player_id = 1

    df_league = pd.read_csv(teams_df)
    for _, row in df_league.iterrows():
        team_id = row["team_ID"]
        team = row["link_page"]
        # link_page ends in /<slug>/kader/verein/<id>/saison_id/<year>/plus/1,
        # so the club slug is the 8th path segment from the end.
        team_name = team.split('/')[-8]
        print(team_name)

        html = requests.get(team, headers=headers, timeout=30)
        soup = BeautifulSoup(html.content, 'html.parser')
        htmltable = soup.find('table', class_="items")
        if htmltable is None:
            # Fail with the URL instead of an AttributeError on None.
            raise ValueError(f"no 'items' table found at {team} (HTTP {html.status_code})")

        for result in htmltable.find_all("tr", class_=["odd", "even"]):
            features = result.find_all("td")

            name_1 = features[2].find("img", alt=True)["alt"]
            page = "https://www.transfermarkt.com" + features[3].find("a", href=True)["href"]

            position_1 = features[4].text
            field_posit = field_by_position.get(position_1, "GLK")

            # `except Exception` keeps the best-effort fallbacks but, unlike a
            # bare `except:`, still lets KeyboardInterrupt/SystemExit stop the run.
            try:
                # The age is the number inside the trailing parentheses of the cell.
                age_1 = int((features[5].text.split("(",)[-1])[:-1])
            except Exception:
                age_1 = ""

            nat = features[6].img["alt"]

            try:
                # First whitespace-separated token, with a decimal comma.
                Height_1 = float((features[7].text.split(" ")[0]).replace(",", "."))
            except Exception:
                Height_1 = ""

            foot_1 = features[8].text

            try:
                dt_joined_1 = datetime.strptime(features[9].text, '%b %d, %Y').date()
            except Exception:
                dt_joined_1 = ""

            try:
                prev_team_1 = features[10].img["alt"]
            except Exception:
                prev_team_1 = "N.A."

            try:
                contract_expires_1 = datetime.strptime(features[11].text, '%d.%m.%Y').date()
            except Exception:
                contract_expires_1 = ""

            try:
                # The last two characters of the cell text are dropped before parsing.
                market_value_1 = get_value_us(features[12].text[:-2])
            except Exception:
                market_value_1 = 0

            records.append({
                "players_ID": player_id,
                "team_ID": team_id,
                "name": strip_accents(name_1),
                "position": position_1,
                "field_position": field_posit,
                "Age": age_1,
                "Nat": nat,
                "Height": Height_1,
                "foot": foot_1,
                "dt_joined": dt_joined_1,
                "prev_team": strip_accents(prev_team_1),
                "contract_expires": contract_expires_1,
                "market_value": market_value_1,
                "player_page": page,
            })
            player_id += 1

        # Random pause between team requests.
        time.sleep(randint(3,5))

    df = pd.DataFrame(records, columns=columns)
    df['players_ID'] = df['players_ID'].astype(int)
    df['team_ID'] = df['team_ID'].astype(int)
    df['dt_joined'] = pd.to_datetime(df['dt_joined'])
    df['contract_expires'] = pd.to_datetime(df['contract_expires'])

    fix_heights(df)

    df.to_csv("data/players_trmk.csv", index=False)


In [9]:
# Scrape the club table of every league in leagues_df; writes data/teams_trmk.csv
# and prints one "scraping: ..." line per league.
scrape_league_data(leagues_df)

scraping: BRAZIL_1_brasileiro_serie_a
scraping: BRAZIL_2_brasileiro_serie_b
scraping: UNITED_STATES_1_major_league_soccer
scraping: UNITED_STATES_2_USL_CHAMPIONSHIP


In [10]:
# Path of the teams CSV written by scrape_league_data (a path string, not a DataFrame).
teams_df = "data/teams_trmk.csv"

# Scrape every squad listed in that CSV; prints each club slug as it is requested.
scrape_team_data(teams_df)

flamengo-rio-de-janeiro
se-palmeiras-sao-paulo
gremio-foot-ball-porto-alegrense
fc-sao-paulo
corinthians-sao-paulo
sc-internacional-porto-alegre
fc-santos
atletico-mineiro
fluminense-football-club
vasco-da-gama-rio-de-janeiro
clube-atletico-paranaense
ec-bahia
clube-atletico-bragantino-sp-
botafogo-fr-rio-de-janeiro
coritiba-fc
goias-esporte-clube
sport-club-do-recife
ceara-sporting-club-ce-
fortaleza-esporte-clube
atletico-goianiense
ec-cruzeiro-belo-horizonte
avai-futebol-clube-sc-
centro-sportivo-alagoano-al-
clube-de-regatas-brasil-al-
associacao-atletica-ponte-preta
associacao-chapecoense-de-futebol
esporte-clube-vitoria
figueirense-futebol-clube
esporte-clube-juventude
america-futebol-clube-mg-
clube-nautico-capibaribe
guarani-futebol-clube-sp-
botafogo-futebol-clube-sp-
cuiaba-esporte-clube-mt-
parana-clube
gremio-esportivo-brasil-rs-
oeste-futebol-clube-sp-
operario-ferroviario-esporte-clube-pr-
associacao-desportiva-confianca-se-
sampaio-correa-futebol-clube-ma-
atlanta-united

 \
 \
  \
  \
  \
  \
  \.
  

## tests

In [21]:
# teams_df = "data/BRAZIL_1_BRA_A_trmk.csv"

# scrape_team_data1(teams_df)

In [None]:
# bra_1= pd.read_csv("data/teams_trmk.csv")
# bra_1

In [None]:
# bra_1.dtypes

In [None]:
# bra_1["club"] = bra_1["club"].astype(str)
# # bra_1.dtypes

In [None]:
# files = glob('data/*.csv', recursive=True)
# scrape_team_data1(files)


In [None]:
# df_league = pd.read_csv(teams_df)
# df_league

In [10]:
# contratcs = ["15.06.2024", "30.12.2024", "30.06.2024"]

# for contratc in contratcs:
# #     try:
# # #         contratc_expires_1 = datetime.strptime(contratc, '%d.%m.%Y').date()
# #         dt_joined_1 = datetime.strptime(contratc, '%b %d, %Y').date()
# #     except:
# # #         contract_expires_1 = "01/01/2030"
# #         dt_joined_1 = "2019-01-01"
#     try:
#         contratc_expires_1 = datetime.strptime(contratc, '%d.%m.%Y').date()
# #         dt_joined_1 = datetime.strptime(contratc, '%b %d, %Y').date()
#     except:
#         contract_expires_1 = "2030-01-01"
# #         dt_joined_1 = "01/01/2019"

#     print(contratc_expires_1)
    

2024-06-15
2024-12-30
2024-06-30


In [11]:
# Reload the exported players file to inspect it.
# NOTE: despite the name `fla`, the frame holds the players of every scraped team
# (the output below runs from team_ID 1 to team_ID 101).
fla= pd.read_csv("data/players_trmk.csv")
fla

Unnamed: 0,players_ID,team_ID,name,position,field_position,Age,Nat,Height,foot,dt_joined,prev_team,contract_expires,market_value,player_page
0,1,1,Diego Alves,Goalkeeper,GLK,34,Brazil,1.87,left,2017-07-16,Valencia CF,2020-12-31,3.16,https://www.transfermarkt.com/diego-alves/prof...
1,2,1,Hugo Souza,Goalkeeper,GLK,21,Brazil,1.96,both,2019-08-31,Clube de Regatas do Flamengo U20,2023-09-30,0.51,https://www.transfermarkt.com/hugo-souza/profi...
2,3,1,Cesar,Goalkeeper,GLK,28,Brazil,1.94,right,2013-01-01,Clube de Regatas do Flamengo U20,2022-04-30,0.45,https://www.transfermarkt.com/cesar/profil/spi...
3,4,1,Gabriel Batista,Goalkeeper,GLK,22,Brazil,1.88,right,2017-01-01,Clube de Regatas do Flamengo U20,2022-12-31,0.06,https://www.transfermarkt.com/gabriel-batista/...
4,5,1,Rodrigo Caio,Centre-Back,DEF,26,Brazil,1.83,right,2019-01-01,Sao Paulo Futebol Clube,2023-12-31,5.42,https://www.transfermarkt.com/rodrigo-caio/pro...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2761,2762,101,Phillip Goodrum,Right Winger,ATT,22,United States,1.70,left,2020-01-14,Atlanta United FC,2020-11-30,0.00,https://www.transfermarkt.com/phillip-goodrum/...
2762,2763,101,Tyler Wolff,Right Winger,ATT,17,United States,1.74,-,2020-03-07,Atlanta United Academy,,0.00,https://www.transfermarkt.com/tyler-wolff/prof...
2763,2764,101,Coleman Gannon,Right Winger,ATT,17,United States,1.68,-,2020-03-07,Atlanta United Academy,,0.00,https://www.transfermarkt.com/coleman-gannon/p...
2764,2765,101,Jackson Conway,Centre-Forward,ATT,18,United States,1.88,-,2018-12-20,Atlanta United Academy,2020-11-30,0.17,https://www.transfermarkt.com/jackson-conway/p...


In [None]:
# fla.dtypes

In [23]:
# flamengp = "https://www.transfermarkt.com/clube-de-regatas-do-flamengo/kader/verein/614/saison_id/2019/plus/1"
# team_name = goias.split('/')[-8]
# print(f"scraping: {team_name}")
# team_id = 1
      

    
# html = requests.get(goias, headers=headers)
# soup = BeautifulSoup(html.content, 'html.parser')
# htmltable = soup.find('table', class_= "items")

# results = htmltable.findAll("tr", class_ =["odd","even"])

# player_id = 1
# name = []
# player_page = []
# position = []
# f_posi = []
# Age = []
# Nat = []
# Height = []
# foot = []
# dt_joined = []
# prev_team = []
# contract_expires = []
# market_value = []
# team_ID = []
# players_ID = []


# for result in results:
#     features = result.findAll("td")

#     name_1 = features[2].find("img", alt=True)["alt"]

#     player_page.append("https://www.transfermarkt.com" + features[3].find("a", href=True)["href"])

#     position_1 = features[4].text
    
#     if position_1 in ("Centre-Back", "Left-Back", "Right-Back", "Defender"):
#         field_posit = "DEF"
#     elif position_1 in ("Defensive Midfield", "Central Midfield" , "Right Midfield" , "Left Midfield" , \
#     "Attacking Midfield" , "Midfield"):
#         field_posit = "MID"
#     elif position_1 in ("Left Winger" , "Right Winger" , "Centre-Forward" , "Second Striker" , "Forward"):
#         field_posit = "ATT"
#     else:
#         field_posit = "GLK"
#     try:
#         age_1 = int((features[5].text.split("(",)[-1])[:-1])
#     except:
#         age_1 = "-"

#     nat = features[6].img["alt"]

#     try:
#         Height_1 = float((features[7].text.split(" ")[0]).replace(",", "."))
#     except:
#         Height_1 = "0"

#     foot_1 = features[8].text

#     dt_joined_1 = features[9].text
#     try:
#         dt_joined_1 = datetime.strptime(dt_joined_1, '%b %d, %Y').date()
#     except:
#         dt_joined_1 = "2019-01-01"

#     try:
#         prev_team_1 = features[10].img["alt"]
#     except:
#         prev_team_1 = "-"


#     contract_expires_1 = features[11].text
#     try:
#         contratc_expires_1 = datetime.strptime(contratc_expires_1, '%d.%m.%Y').date()
#     except:
#         contract_expires_1 = "2030-01-01"


#     try:
#         market_value_1 = get_value_us(features[12].text[:-2])
#     except:
#         market_value_1 = 0

#     name.append(strip_accents(name_1))
#     position.append(position_1)
#     f_posi.append(field_posit)
#     Age.append(age_1)
#     Nat.append(nat)
#     Height.append(Height_1)
#     foot.append(foot_1)
#     dt_joined.append(dt_joined_1)
#     prev_team.append(strip_accents(prev_team_1))
#     contract_expires.append(contract_expires_1)
#     market_value.append(market_value_1)
#     team_ID.append(team_id)
#     players_ID.append(player_id)
#     player_id = player_id+1

# #     time.sleep(randint(3,5))

# df = pd.DataFrame(list(zip(players_ID, team_ID, name, position,f_posi, Age,Nat, Height, foot,dt_joined,prev_team, contract_expires,\
#                            market_value,player_page)), 
#                   columns =["players_ID", "team_ID","name","position","field_position", "Age", "Nat","Height","foot",'dt_joined',"prev_team", \
#                             "contract_expires", "market_value","player_page" ])
# df['players_ID'] = df['players_ID'].astype(int)
# df['team_ID'] = df['team_ID'].astype(int)
# df['dt_joined'] = pd.to_datetime(df['dt_joined'])
# df['contract_expires'] = pd.to_datetime(df['contract_expires'])


# df


scraping: clube-de-regatas-do-flamengo


Unnamed: 0,players_ID,team_ID,name,position,field_position,Age,Nat,Height,foot,dt_joined,prev_team,contract_expires,market_value,player_page
0,1,1,Diego Alves,Goalkeeper,GLK,34,Brazil,1.87,left,2017-07-16,Valencia CF,2030-01-01,3.14,https://www.transfermarkt.com/diego-alves/prof...
1,2,1,Hugo Souza,Goalkeeper,GLK,21,Brazil,1.96,both,2019-08-31,Clube de Regatas do Flamengo U20,2030-01-01,0.5,https://www.transfermarkt.com/hugo-souza/profi...
2,3,1,Cesar,Goalkeeper,GLK,28,Brazil,1.94,right,2013-01-01,Clube de Regatas do Flamengo U20,2030-01-01,0.45,https://www.transfermarkt.com/cesar/profil/spi...
3,4,1,Gabriel Batista,Goalkeeper,GLK,21,Brazil,1.88,right,2017-01-01,Clube de Regatas do Flamengo U20,2030-01-01,0.06,https://www.transfermarkt.com/gabriel-batista/...
4,5,1,Rodrigo Caio,Centre-Back,DEF,26,Brazil,1.83,right,2019-01-01,Sao Paulo Futebol Clube,2030-01-01,5.38,https://www.transfermarkt.com/rodrigo-caio/pro...
5,6,1,Leo Pereira,Centre-Back,DEF,24,Brazil,1.89,left,2020-01-28,Club Athletico Paranaense,2030-01-01,4.48,https://www.transfermarkt.com/leo-pereira/prof...
6,7,1,Gustavo Henrique,Centre-Back,DEF,27,Brazil,1.96,right,2020-01-13,Santos FC,2030-01-01,2.69,https://www.transfermarkt.com/gustavo-henrique...
7,8,1,Thuler,Centre-Back,DEF,21,Brazil,1.85,right,2018-01-01,Clube de Regatas do Flamengo B,2030-01-01,0.5,https://www.transfermarkt.com/thuler/profil/sp...
8,9,1,Matheus Dantas,Centre-Back,DEF,21,Brazil,1.86,right,2019-01-01,-,2030-01-01,0.06,https://www.transfermarkt.com/matheus-dantas/p...
9,10,1,Rafael Santos,Centre-Back,DEF,22,Brazil,1.9,left,2019-01-01,-,2030-01-01,0.06,https://www.transfermarkt.com/rafael-santos/pr...


In [None]:
# df.dtypes

In [None]:
# from datetime import datetime

# df_copy = df.copy()

# for date in df_copy["contract_expires"]:
#     try:
#         date = datetime.strptime(date, '%d.%m.%Y').date()
#     except:
#         date = '-' 
#     print(date)


In [None]:
# fla_copy = fla.copy()

# for date in fla_copy["dt_joined"]:
#     try:
#         date = datetime.strptime(date, '%b %d, %Y').date()
#     except:
#         date = "-"
#     print(date)

In [None]:
# name_1 = features[2].find("img", alt=True)["alt"]
# position_1 = features[4].text
# DoB_1 = int((features[5].text.split("(",)[-1])[:-1])
# nat = features[6].img["alt"]
# Height_1 = float((features[7].text.split(" ")[0]).replace(",", "."))
# foot_1 = features[8].text
# dt_joined_1 = features[9].text
# prev_team_1 = features[10].img["alt"]
# contract_expires_1 = features[11].text
# market_value_1 = get_value_us(features[12].text[:-2])

# print(name_1, position_1,DoB_1, nat, Height_1,foot_1,dt_joined_1,prev_team_1,contract_expires_1,market_value_1   )


In [None]:
# pd.read_csv("data/teams/goias-esporte-clube_trmk.csv")

In [None]:
# # Retrieve page with the requests module
# url = "https://www.transfermarkt.com/major-league-soccer/startseite/wettbewerb/MLS1"
# html = requests.get(url, headers=headers)

In [None]:
# # Create BeautifulSoup object; parse with 'html.parser'
# soup = BeautifulSoup(html.content, 'html.parser')
# htmltable = soup.find('table', class_= "items")
# text1 = htmltable.text
# text1

In [None]:
# #Examine the results, then determine element that contains sought info
# print(soup.prettify())

In [None]:
# results = htmltable.findAll("tr", class_ =["odd","even"])
# print(results[0])
# print(len(results))

In [None]:
# # to Download imgs
# for img in logos:
#         with open(basename(img), "wb") as f:
#             f.write(requests.get(img).content)

In [None]:
# names= []
# for result in results:
#     names.append(result.find("img", alt=True)["alt"])
    
# names

In [None]:
## Extracting a table from HTML
# htmltable2 = str(htmltable)
# dfs = pd.read_html(htmltable2)
# df_clean = dfs[0][["Club.1","name.1", "ø age", "Total market value", "ø market value" ]]
# df_clean = df_clean.drop(df_clean.index[len(df_clean.index)-1])
# df = df_clean.set_axis(["Club","Squad", "Foreigners", "avg_market_value" , "total_market_value"], axis=1, inplace=False)

<td class="zentriert no-border-rechts">
    <a class="vereinprofil_tooltip" href="/los-angeles-football-club/startseite/verein/51828/saison_id/2019">
        <img alt="Los Angeles FC" class="tiny_wappen" src="https://tmssl.akamaized.net//images/wappen/tiny/51828.png?lm=1511112738" title=" "/>
    </a>
</td>
<td class="hauptlink no-border-links hide-for-small hide-for-pad">
    <a class="vereinprofil_tooltip" href="/los-angeles-football-club/startseite/verein/51828/saison_id/2019" id="51828">
        Los Angeles FC
    </a>
</td>
<td class="hauptlink no-border-links show-for-small show-for-pad">
    <a class="vereinprofil_tooltip" href="/los-angeles-football-club/startseite/verein/51828/saison_id/2019" id="51828">
        Los Angeles FC
    </a>
</td>
<td class="zentriert">
    <a href="/los-angeles-fc/kader/verein/51828/saison_id/2019" title="Los Angeles FC">
    24
    </a>
</td>
<td class="zentriert hide-for-small hide-for-pad">
    26,1
</td>
<td class="zentriert hide-for-pad hide-for-small">
    17
</td>
<td class="rechts hide-for-small hide-for-pad">
    <a href="/los-angeles-fc/kader/verein/51828/saison_id/2019" title="Los Angeles FC">
    €49.20m
    </a>
</td>
<td class="rechts hide-for-small hide-for-pad">
    €2.05m
</td>
<td class="rechts show-for-small show-for-pad nowrap">
    <a href="/los-angeles-fc/kader/verein/51828/saison_id/2019" title="Los Angeles FC">
        €49.20m
    </a>
</td>
<td class="rechts show-for-small show-for-pad nowrap">
    €2.05m
</td>


In [None]:
# def get_value(x):
#     value = []
#     for char in x:
#         value.append(char)
#     float_value = "".join(value[1:-1])
#     return float(float_value)

In [None]:
# get_value(fdf[6].text)

In [None]:
# # URLs of page to be scraped
# bra = {"BRAZIL": {"leagues":[{"tier_1":'https://www.transfermarkt.com/campeonato-brasileiro-serie-a/startseite/wettbewerb/BRA1'},
#                  {"tier_2": "https://www.transfermarkt.com/campeonato-brasileiro-serie-a/startseite/wettbewerb/BRA2"}]}},
# us = {"UNITED_STATES": {"leagues":[{"tier_1": "https://www.transfermarkt.com/major-league-soccer/startseite/wettbewerb/MLS1"},
#                   {"tier_2":"https://www.transfermarkt.com/usl-pro/startseite/wettbewerb/USL"}, 
#                   {"tier_3":"https://www.transfermarkt.com/usl-league-one/startseite/wettbewerb/USC3"}]}}
# urls= [bra, us]

In [None]:
# def scrape_league_data(leagues_df):
#     team_id = 1
#     for index, row in leagues_df.iterrows():
#         url = row["league_link"]
#         league_name = row["league_name"]
#         tier = row["tier"]
#         country = row["country"]
#         league_id = row["league_ID"]
#         print(f"scraping: {country}_{tier}_{league_name}")
#         html = requests.get(url, headers=headers)
#         soup = BeautifulSoup(html.content, 'html.parser')
#         htmltable = soup.find('table', class_= "items")


#         results = htmltable.findAll("tr", class_ =["odd","even"])


#         team_ID = []
#         links = []
#         names = []
#         logos = []
#         squads = []
#         foreigners = []
#         total_MVs = []
#         avg_MVs = []
#         league_ID = []



#         for result in results:
#             features = result.findAll("td")
#             links.append(("https://www.transfermarkt.com"+result.find("a", href=True)\
#                           ["href"]+"/plus/1").replace("startseite", "kader"))
#             logo = result.find("img", src=True)["src"]
#             logo = logo.split("?")[0]
#             logo = logo.replace("tiny", "header")
#             logos.append(logo)
#             name = features[1].text
#             names.append(name)
#             squad = features[3].text
#             squads.append(squad)
#             foreigner = features[5].text
#             foreigners.append(foreigner)
#             total_MV = get_value_us(features[6].text)
#             total_MVs.append(total_MV)
#             avg_MV = get_value_us(features[7].text)
#             avg_MVs.append(avg_MV)
#             team_ID.append(team_id)
#             league_ID.append(league_id)
#             team_id = team_id + 1



#         # Create a Dataframe and export to a .csv file
#         df = pd.DataFrame(list(zip(team_ID, league_ID, names, squads, foreigners,avg_MVs, total_MVs, logos,links)), \
#     columns =["team_ID", "league_ID","club","squad", "foreigners", "avg_market_value_m", "total_MV_m",'Logo_img', "link_page"]) 
#         df['league_ID'] = df['league_ID'].astype(int)
#         df['team_ID'] = df['team_ID'].astype(int)

#         df.to_csv(f'data/{country}_{tier}_{league_name}_trmk.csv',index=False)


#         time.sleep(1)

In [None]:
# def scrape_team_data(files):
#     player_id = 1
#     for filename in files:
#         print(filename)
#         time.sleep(5)
#         df_league = pd.read_csv(filename)
#         for index, row in df_league.iterrows():
#             team_id = row["team_ID"]
#             team = row["link_page"]
#             team_name = team.split('/')[-8]
#             print(team_name)
#             html = requests.get(team, headers=headers)
#             soup = BeautifulSoup(html.content, 'html.parser')
#             htmltable = soup.find('table', class_= "items")

#             results = htmltable.findAll("tr", class_ =["odd","even"])
#             name = []
#             player_page = []
#             position = []
#             Age = []
#             Nat = []
#             Height = []
#             foot = []
#             dt_joined = []
#             prev_team = []
#             contract_expires = []
#             market_value = []
#             team_ID = []
#             players_ID = []

#             for result in results:
#                 features = result.findAll("td")

#                 name_1 = features[2].find("img", alt=True)["alt"]

#                 player_page.append("https://www.transfermarkt.com" + features[3].find("a", href=True)["href"])

#                 position_1 = features[4].text

#                 try:
#                     age_1 = int((features[5].text.split("(",)[-1])[:-1])
#                 except:
#                     age_1 = "-"

#                 nat = features[6].img["alt"]

#                 try:
#                     Height_1 = float((features[7].text.split(" ")[0]).replace(",", "."))
#                 except:
#                     Height_1 = "m"

#                 foot_1 = features[8].text

#                 dt_joined_1 = features[9].text
#                 try:
#                     dt_joined_1 = datetime.strptime(dt_joined_1, '%b %d, %Y').date()
#                 except:
#                     dt_joined_1 = "-"

#                 try:
#                     prev_team_1 = features[10].img["alt"]
#                 except:
#                     prev_team_1 = "-"


#                 contract_expires_1 = features[11].text
#                 try:
#                     contratc_expires_1 = datetime.strptime(contratc_expires_1, '%d.%m.%Y').date()
#                 except:
#                     contract_expires_1 = features[11].text


#                 try:
#                     market_value_1 = get_value_us(features[12].text[:-2])
#                 except:
#                     market_value_1 = 0

#                 name.append(name_1)
#                 position.append(position_1)
#                 Age.append(age_1)
#                 Nat.append(nat)
#                 Height.append(Height_1)
#                 foot.append(foot_1)
#                 dt_joined.append(dt_joined_1)
#                 prev_team.append(prev_team_1)
#                 contract_expires.append(contract_expires_1)
#                 market_value.append(market_value_1)
#                 team_ID.append(team_id)
#                 players_ID.append(player_id)
#                 player_id = player_id+1

#             df = pd.DataFrame(list(zip(players_ID, team_ID, name, position, Age,Nat, Height, foot,dt_joined,prev_team, contract_expires,\
#                                        market_value,player_page)), 
#                               columns =["players_ID", "team_ID","name","position", "Age", "Nat","Height","foot",'dt_joined',"prev_team", \
#                                         "contract_expires", "market_value","player_page" ])
#             df['players_ID'] = df['players_ID'].astype(int)
#             df['team_ID'] = df['team_ID'].astype(int)
            

#             df.to_csv(f"data/teams/{team_name}_trmk.csv" ,index=False)
#             time.sleep(1)