In [32]:
import polars as pl
import os

def import_tennis_matches(base_path="matches_and_ranking", data_type="singles"):
    """Load every yearly match CSV of one category and stack them into one frame.

    Files are looked up as ``<base_path>/<prefix>_<year>.csv`` for every year
    in the category's range (both ends inclusive). Years without a file on
    disk are skipped silently; a file that fails to load is reported on
    stdout and skipped, so one bad file does not abort the whole import.

    Args:
        base_path: Directory containing the yearly CSV files.
        data_type: One of "singles", "doubles", "qualifiers_challengers"
            or "futures".

    Returns:
        A polars DataFrame with all loaded files concatenated using
        ``how="vertical_relaxed"`` and ``tourney_date`` converted to a Date
        column, or None when nothing was loaded (this includes an
        unrecognised ``data_type``).
    """
    dataframes = []
    # NOTE(review): a list of null markers is applied to every column, so a
    # literal 'Q' or 'W/O' in any text column is also read as null - confirm
    # that this is intended.
    null_values = ['Q', 'NA', 'N/A', '', '-', 'Unknown', 'null', 'W/O']

    # Define file prefixes for different match types
    match_types = {
        "singles": ["atp_matches"],
        "doubles": ["atp_matches_doubles"],
        "qualifiers_challengers": ["atp_matches_qual_chall"],
        "futures": ["atp_matches_futures"]
    }

    # Define year ranges (first year, last year) for different match types
    year_ranges = {
        "singles": (1968, 2025),
        "doubles": (1968, 2025),
        "qualifiers_challengers": (1978, 2025),
        "futures": (1991, 2025)
    }

    # Columns that must stay strings instead of being type-inferred.
    # The list does not change per file, so it is defined once up front.
    string_columns = (
        'winner_seed',
        'loser_seed',
        'tourney_date',
        'winner_entry',
        'loser_entry',
        'winner_rank',
        'loser_rank',
        'winner1_id',
        'winner2_id',
        'loser1_id',
        'loser2_id',
    )

    # Get the appropriate prefixes and year range; an unknown data_type
    # yields no prefixes, so the loops below do nothing and None is returned.
    prefixes = match_types.get(data_type, [])
    start_year, end_year = year_ranges.get(data_type, (1968, 2025))

    # Iterate through all specified prefixes
    for prefix in prefixes:
        # end_year + 1: the last configured year must be read as well
        # (range() excludes its stop value).
        for year in range(start_year, end_year + 1):
            filename = f"{prefix}_{year}.csv"
            filepath = os.path.join(base_path, filename)

            if not os.path.exists(filepath):
                continue

            try:
                # Schema overrides to preserve string types
                schema_overrides = dict.fromkeys(string_columns, pl.Utf8)

                df = pl.read_csv(
                    filepath,
                    schema_overrides=schema_overrides,
                    null_values=null_values,
                    ignore_errors=True,
                    truncate_ragged_lines=True,
                    infer_schema_length=10000
                )

                # Convert tourney_date (read as a YYYYMMDD string) to Date type
                df = df.with_columns(
                    pl.col('tourney_date')
                    .str.to_date(format='%Y%m%d')
                    .alias('tourney_date')
                )

                dataframes.append(df)
            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Error importing {filename}: {e}")

    # Vertical concat with relaxed schema
    return pl.concat(dataframes, how="vertical_relaxed") if dataframes else None

# Load one combined frame per match category
singles_matches = import_tennis_matches(data_type="singles")
doubles_matches = import_tennis_matches(data_type="doubles")
qualifiers_challengers = import_tennis_matches(data_type="qualifiers_challengers")
futures_matches = import_tennis_matches(data_type="futures")

# Sanity check: report the size and covered tournament dates of every
# category that actually loaded (a category with no files is None).
loaded_frames = {
    "Singles": singles_matches,
    "Doubles": doubles_matches,
    "Qualifiers/Challengers": qualifiers_challengers,
    "Futures": futures_matches,
}
for match_type, matches in loaded_frames.items():
    if matches is None:
        continue
    print(f"{match_type} matches DataFrame shape: {matches.shape}")
    tourney_dates = matches['tourney_date']
    first_date = tourney_dates.min()
    last_date = tourney_dates.max()
    print(f"{match_type} matches date range: {first_date} to {last_date}")

Singles matches DataFrame shape: (193337, 49)
Singles matches date range: 1967-12-28 to 2024-05-20
Doubles matches DataFrame shape: (26399, 65)
Doubles matches date range: 2000-01-03 to 2020-03-06
Qualifiers/Challengers matches DataFrame shape: (216430, 49)
Qualifiers/Challengers matches date range: 1978-01-08 to 2024-05-20
Futures matches DataFrame shape: (486553, 49)
Futures matches date range: 1990-12-29 to 2024-05-20


In [33]:
# Show the combined doubles frame; as the cell's last expression it is
# rendered by the notebook.
doubles_matches

tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner1_id,winner2_id,winner_seed,winner_entry,loser1_id,loser2_id,loser_seed,loser_entry,score,best_of,round,winner1_name,winner1_hand,winner1_ht,winner1_ioc,winner1_age,winner2_name,winner2_hand,winner2_ht,winner2_ioc,winner2_age,loser1_name,loser1_hand,loser1_ht,loser1_ioc,loser1_age,loser2_name,loser2_hand,loser2_ht,loser2_ioc,loser2_age,winner1_rank,winner1_rank_points,winner2_rank,winner2_rank_points,loser1_rank,loser1_rank_points,loser2_rank,loser2_rank_points,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
str,str,str,i64,str,date,i64,str,str,str,str,str,str,str,str,str,i64,str,str,str,i64,str,f64,str,str,i64,str,f64,str,str,i64,str,f64,str,str,i64,str,f64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""2000-301""","""Auckland""","""Hard""",32,"""A""",2000-01-10,300,"""101704""","""101097""","""1""",,"""101358""","""101543""","""2""",,"""7-5 6-4""",3,"""F""","""Ellis Ferreira""","""L""",188,"""RSA""",29.889117,"""Rick Leach""","""L""",188,"""USA""",35.033539,"""Olivier Delaitre""","""R""",170,"""FRA""",32.610541,"""Jeff Tarango""","""L""",180,"""USA""",31.137577,13,0,15,0,16,0,17,0,,,,,,,,,,,,,,,,,,,
"""2000-301""","""Auckland""","""Hard""",32,"""A""",2000-01-10,299,"""101704""","""101097""","""1""",,"""102562""","""101866""","""3""",,"""6-2 6-4""",3,"""SF""","""Ellis Ferreira""","""L""",188,"""RSA""",29.889117,"""Rick Leach""","""L""",188,"""USA""",35.033539,"""Jiri Novak""","""R""",190,"""CZE""",24.804928,"""David Rikl""","""L""",178,"""CZE""",28.867899,13,0,15,0,34,0,29,0,,,,,,,,,,,,,,,,,,,
"""2000-301""","""Auckland""","""Hard""",32,"""A""",2000-01-10,298,"""101358""","""101543""","""2""",,"""102664""","""101779""",,,"""6-3 6-4""",3,"""SF""","""Olivier Delaitre""","""R""",170,"""FRA""",32.610541,"""Jeff Tarango""","""L""",180,"""USA""",31.137577,"""Petr Pala""","""R""",193,"""CZE""",24.273785,"""Pavel Vizner""","""R""",183,"""CZE""",29.489391,16,0,17,0,97,0,74,0,,,,,,,,,,,,,,,,,,,
"""2000-301""","""Auckland""","""Hard""",32,"""A""",2000-01-10,297,"""101704""","""101097""","""1""",,"""101342""","""102162""",,"""WC""","""6-3 6-4""",3,"""QF""","""Ellis Ferreira""","""L""",188,"""RSA""",29.889117,"""Rick Leach""","""L""",188,"""USA""",35.033539,"""Patrick Galbraith""","""L""",183,"""USA""",32.736482,"""Chris Woodruff""","""R""",188,"""USA""",27.019849,13,0,15,0,65,0,248,0,,,,,,,,,,,,,,,,,,,
"""2000-301""","""Auckland""","""Hard""",32,"""A""",2000-01-10,296,"""101358""","""101543""","""2""",,"""103483""","""102262""",,"""WC""","""6-2 6-4""",3,"""QF""","""Olivier Delaitre""","""R""",170,"""FRA""",32.610541,"""Jeff Tarango""","""L""",180,"""USA""",31.137577,"""James Shortall""","""R""",,"""NZL""",20.043806,"""Cristiano Testa""","""L""",,"""BRA""",26.297057,16,0,17,0,779,0,114,0,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2020-M-DC-2020-WG2-PO-MAR-VIE-…","""Davis Cup WG2 PO: MAR vs VIE""","""Clay""",4,"""D""",2020-03-06,3,"""104917""","""104467""",,,"""109042""","""122533""",,,"""6-4 6-1""",3,"""RR""","""Anas Fattar""","""R""",,"""MAR""",32.807666,"""Lamine Ouahab""","""R""",185,"""ALG""",35.192334,"""Quoc Khanh Le""","""R""",,"""VIE""",37.497604,"""Nam Hoang Ly""","""R""",,"""VIE""",23.014374,683,48,942,26,,,1293,12,,,,,,,,,,,,,,,,,,,
"""2020-M-DC-2020-WG2-PO-PAR-SRI-…","""Davis Cup WG2 PO: PAR vs SRI""","""Clay""",4,"""D""",2020-03-06,3,"""134677""","""137018""",,,"""202462""","""134886""",,,"""6-1 6-4""",3,"""RR""","""Juan Borba""","""R""",,"""PAR""",23.887748,"""Ayed Zatar""","""R""",,"""PAR""",23.926078,"""Yasitha De Silva""",,,"""SRI""",24.665298,"""Sharmal Dissanayake""","""R""",,"""SRI""",23.800137,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2020-M-DC-2020-WG2-PO-PHI-GRE-…","""Davis Cup WG2 PO: PHI vs GRE""","""Clay""",4,"""D""",2020-03-06,3,"""108717""","""104604""",,,"""109737""","""202065""",,,"""7-6(5) 6-4""",3,"""RR""","""Francis Casey Alcantara""","""U""",,"""PHI""",28.073922,"""Ruben Gonzales""","""R""",,"""PHI""",34.472279,"""Markos Kalovelonis""","""R""",,"""GRE""",25.790554,"""Petros Tsitsipas""","""U""",,"""GRE""",19.597536,537,69,212,307,503,77,1072,20,,,,,,,,,,,,,,,,,,,
"""2020-M-DC-2020-WG2-PO-POL-HKG-…","""Davis Cup WG2 PO: POL vs HKG""","""Hard""",4,"""D""",2020-03-06,3,"""126166""","""126591""",,,"""205695""","""106388""",,,"""6-3 6-7(6) 6-3""",3,"""RR""","""Szymon Walkow""","""R""",,"""POL""",24.443532,"""Jan Zielinski""","""R""",,"""POL""",23.290897,"""Ching Lam""","""R""",,"""HKG""",20.495551,"""Pak Long Yeung""","""U""",,"""HKG""",25.045859,182,389,295,178,,,557,66,,,,,,,,,,,,,,,,,,,


In [34]:
# Count how often each winner_seed value occurs in the doubles data.
# value_counts() used as an expression yields a single struct column
# holding the value together with its count.
doubles_matches.select(pl.col("winner_seed").value_counts()) 

winner_seed
struct[2]
"{""13"",108}"
"{""5"",531}"
"{""10"",114}"
"{""17"",5}"
"{""1"",2661}"
…
"{""4"",1797}"
"{""15"",116}"
"{""7"",430}"
"{""2"",2317}"


In [35]:
# Persist each aggregated frame as a zstd-compressed parquet file.
# The output directory is created on demand so the cell also works on a
# fresh checkout, and categories for which nothing was imported
# (import_tennis_matches returned None) are skipped instead of raising.
output_dir = "aggregated_matches"
os.makedirs(output_dir, exist_ok=True)

for frame, parquet_name in [
    (singles_matches, "singles_matches.parquet"),
    (doubles_matches, "doubles_matches.parquet"),
    (futures_matches, "futures_matches.parquet"),
    (qualifiers_challengers, "quali_and_challengers_matches.parquet"),
]:
    if frame is None:
        print(f"Skipping {parquet_name}: no data was imported")
        continue
    frame.write_parquet(file=os.path.join(output_dir, parquet_name), compression="zstd")