In [8]:
import polars as pl
import os

In [9]:
def import_tennis_matches(base_path="tennis_wta", data_type="singles"):
    dataframes = []
    null_values = ['Q', 'NA', 'N/A', '', '-', 'Unknown', 'null', 'W/O']
   
    # Define file prefixes for different match types
    match_types = {
        "singles": ["wta_matches"],
        "qualifiers_challengers": ["wta_matches_qual_itf"]
    }
   
    # Define year ranges for different match types
    year_ranges = {
        "singles": (1968, 2025),
        "qualifiers_challengers": (1968, 2025)
    }
   
    # Get the appropriate prefixes and year range
    prefixes = match_types.get(data_type, [])
    start_year, end_year = year_ranges.get(data_type, (1968, 2025))
   
    # Iterate through all specified prefixes
    for prefix in prefixes:
        for year in range(start_year, end_year):
            filename = f"{prefix}_{year}.csv"
            filepath = os.path.join(base_path, filename)
           
            if os.path.exists(filepath):
                try:
                    # Expanded schema overrides to preserve string types
                    schema_overrides = {
                        'winner_seed': pl.Utf8,
                        'loser_seed': pl.Utf8,
                        'tourney_date': pl.Utf8,
                        'winner_entry': pl.Utf8,
                        'loser_entry': pl.Utf8,
                        'winner_rank': pl.Utf8,
                        'loser_rank': pl.Utf8,
                        'winner1_id': pl.Utf8,
                        'winner2_id': pl.Utf8,
                        'loser1_id': pl.Utf8,
                        'loser2_id': pl.Utf8
                    }
                   
                    df = pl.read_csv(
                        filepath,
                        schema_overrides=schema_overrides,
                        null_values=null_values,
                        ignore_errors=True,
                        truncate_ragged_lines=True,
                        infer_schema_length=10000
                    )
                   
                    # Convert tourney_date to Date type
                    df = df.with_columns(
                        pl.col('tourney_date')
                        .str.to_date(format='%Y%m%d')
                        .alias('tourney_date')
                    )
                   
                    dataframes.append(df)
                except Exception as e:
                    print(f"Error importing {filename}: {e}")
   
    # Vertical concat with relaxed schema
    return pl.concat(dataframes, how="vertical_relaxed") if dataframes else None

# Usage
singles_matches = import_tennis_matches(data_type="singles")
qualifiers_challengers = import_tennis_matches(data_type="qualifiers_challengers")

# Print shapes and date range for verification
for match_type, matches in [
    ("Singles", singles_matches),
    ("Qualifiers/Challengers", qualifiers_challengers)
]:
    if matches is not None:
        print(f"{match_type} matches DataFrame shape: {matches.shape}")
        print(f"{match_type} matches date range: {matches['tourney_date'].min()} to {matches['tourney_date'].max()}")

Error importing wta_matches_qual_itf_2015.csv: invalid utf-8 sequence
Error importing wta_matches_qual_itf_2016.csv: invalid utf-8 sequence
Error importing wta_matches_qual_itf_2017.csv: invalid utf-8 sequence
Singles matches DataFrame shape: (156578, 49)
Singles matches date range: 1967-12-25 to 2024-05-20
Qualifiers/Challengers matches DataFrame shape: (512063, 49)
Qualifiers/Challengers matches date range: 1968-04-22 to 2024-05-20


In [10]:
singles_matches.write_parquet(file="aggregated_matches_wta/singles_matches.parquet",compression="zstd")
qualifiers_challengers.write_parquet(file="aggregated_matches_wta/quali_and_challengers_matches.parquet",compression="zstd")

In [11]:
players=pl.read_csv("tennis_wta/wta_players.csv")