import requests
from bs4 import BeautifulSoup
import pandas as pd

# In this assignment, I scrape data from transfermarkt.com. Specifically, I collect player information for
# every team in the Turkish Süper Lig.


# Request headers carrying a desktop-Chrome User-Agent string, so that
# outgoing requests simulate a browser request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def scrape_team_players(team_url, club_name):
    """Scrape the player table of one club page into a DataFrame.

    Downloads ``team_url`` with the module-level ``headers``, locates the
    first ``<table class="items">`` on the page and converts its rows into
    records, each prefixed with ``club_name``.

    Args:
        team_url: Absolute URL of the club page to download.
        club_name: Value stored in the ``Club`` column of every record.

    Returns:
        A DataFrame with the columns ``Club``, ``Column_1``, ``Column_2``, ...
        sized to the widest record. A DataFrame containing only an empty
        ``Club`` column when the table yields no usable rows. ``None`` when
        the page cannot be fetched or contains no matching table.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(team_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: Unable to fetch the page ({exc})")
        return None

    # Anything other than HTTP 200 is reported and treated as "no data".
    if response.status_code != 200:
        print(f"Error: Unable to fetch the page (Status Code: {response.status_code})")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    player_table = soup.find('table', class_='items')
    if player_table is None:
        # find() returns None when the page has no such table.
        print(f"Error: No player table found on the page ({team_url})")
        return None

    player_data = []

    # The first <tr> is skipped (presumably the header row). find_all()
    # searches all descendants, so rows of nested tables are visited too.
    for player_row in player_table.find_all('tr')[1:]:
        player_cells = player_row.find_all(['th', 'td'])
        player_info = [cell.text.strip() for cell in player_cells]

        # Ignore rows that have no cells or whose first cell is blank.
        if not player_info or not player_info[0]:
            continue

        # Drop blank cells so only cells with text remain in the record.
        player_info = [info for info in player_info if info]

        player_data.append([club_name] + player_info)

    if not player_data:
        # Nothing usable was found; hand back an empty frame rather than
        # failing on max() of an empty sequence.
        return pd.DataFrame(columns=['Club'])

    # Width of the widest record, including the leading club name.
    num_columns = max(len(row) for row in player_data)

    # Generic positional column names; meaningful names are assigned later
    # by the caller.
    columns = ['Club'] + [f'Column_{i + 1}' for i in range(num_columns - 1)]

    return pd.DataFrame(player_data, columns=columns)

# URL of the Süper Lig
super_lig_url = 'https://www.transfermarkt.com.tr/super-lig/startseite/wettbewerb/TR1'

# Bound before the request so the name always exists for the code that runs
# after this section, even when the league page cannot be fetched.
all_players_df = pd.DataFrame()

# A timeout keeps a stalled connection from hanging the script forever.
response = requests.get(super_lig_url, headers=headers, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    teams_table = soup.find('table', class_='items')

    if teams_table is None:
        # find() returns None when the page has no such table.
        print("Error: No teams table found on the league page")
    else:
        # Extract (relative URL, link text) pairs from the table
        team_data = [(a['href'], a.text.strip()) for a in teams_table.select('td.hauptlink a[href^="/"]')]

        # Collect the per-team frames and concatenate once at the end;
        # concatenating inside the loop re-copies all earlier rows each time.
        team_frames = []
        for team_url, club_name in team_data:
            team_url = 'https://www.transfermarkt.com.tr' + team_url
            team_players_df = scrape_team_players(team_url, club_name)

            # Keep only teams for which data was actually scraped
            if team_players_df is not None and not team_players_df.empty:
                team_frames.append(team_players_df)

        if team_frames:
            all_players_df = pd.concat(team_frames, ignore_index=True)

    # display() is only available under IPython/Jupyter; fall back to print()
    # when the file is run as a plain script.
    try:
        display(all_players_df)
    except NameError:
        print(all_players_df)

else:
    print(f"Error: Unable to fetch the page (Status Code: {response.status_code})")



#Further data cleaning
# Names assigned positionally to the scraped columns; 'Name1' is removed
# again immediately after the rename.
columns = ['Club', 'No', 'Name1', 'Name', 'Position', 'Age', 'Market_Value']

if all_players_df.empty:
    # Nothing was scraped, so there is nothing to rename or export; assigning
    # the column names to an empty frame would fail with a length mismatch.
    print("Warning: no player data was scraped; skipping cleaning and CSV export.")
elif all_players_df.shape[1] != len(columns):
    # The rename below is positional, so the column count must match exactly.
    raise ValueError(
        f"Expected {len(columns)} scraped columns, got {all_players_df.shape[1]}"
    )
else:
    all_players_df.columns = columns
    del all_players_df["Name1"]
    # Drop every row that has a missing value in any column, then renumber.
    all_players_df = all_players_df.dropna(how="any")
    all_players_df = all_players_df.reset_index(drop=True)

    all_players_df.to_csv('all_superleague_players.csv', index=False)

all_players_df