In [16]:
import os
import glob
import pandas as pd

def find_urls_in_csv(file_path, base_url):
    """
    Read a CSV file and collect every text cell that starts with ``base_url``.

    Only text-like columns (object or pandas string dtype) are scanned, and
    within them only values that are real ``str`` instances are tested. An
    object column may hold non-string values (for example when pandas infers
    mixed types); those values are skipped instead of aborting the whole file.

    Args:
        file_path (str): The path to the CSV file.
        base_url (str): Prefix a cell must start with to count as a match.

    Returns:
        list: Matching URLs in column-then-row order, duplicates kept. An
        empty list is returned when nothing matches or when the file cannot
        be read or processed (the error is printed, not raised).
    """
    try:
        data = pd.read_csv(file_path)
        matching_urls = []

        for column in data.columns:
            series = data[column]
            # Accept both the classic object dtype and pandas' dedicated
            # string dtype; numeric/bool columns cannot contain URLs.
            is_text_column = (
                pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series)
            )
            if not is_text_column:
                continue

            # The isinstance guard matters: calling .startswith on a stray
            # int/float/bool would raise and discard every match in the file.
            matching_urls.extend(
                value
                for value in series.dropna().tolist()
                if isinstance(value, str) and value.startswith(base_url)
            )

        return matching_urls
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop a batch
        # run, so report the problem and contribute no URLs.
        print(f"Error processing file {file_path}: {e}")
        return []

def process_all_csv_files(directory, base_url):
    """
    Scan every ``*.csv`` file directly inside ``directory`` for matching URLs.

    Files are visited in sorted path order so repeated runs produce the same
    result order. Each file is delegated to ``find_urls_in_csv`` and the
    per-file lists are concatenated; duplicates are kept.

    Args:
        directory (str): The directory containing CSV files (not searched
            recursively).
        base_url (str): The URL prefix passed through to ``find_urls_in_csv``.

    Returns:
        pandas.Series: All matching URLs from all files. When nothing is
        found the series is empty with object dtype.
    """
    # Escape the directory part so characters such as '[' or '*' in a folder
    # name are matched literally instead of being read as glob wildcards.
    pattern = os.path.join(glob.escape(os.fspath(directory)), '*.csv')

    all_urls = []
    # glob.glob returns entries in arbitrary order; sort for reproducibility.
    for file_path in sorted(glob.glob(pattern)):
        print(f"Processing file: {file_path}")
        all_urls.extend(find_urls_in_csv(file_path, base_url))

    if not all_urls:
        # An empty list gives pandas nothing to infer a dtype from; pin it so
        # the empty result does not depend on the pandas version.
        return pd.Series(all_urls, dtype=object)
    return pd.Series(all_urls)

# Example usage: scan every CSV in one folder and collect the cells that start
# with the given URL prefix.
# NOTE(review): the absolute Windows path below is machine-specific; the cell
# only runs as-is on a machine with this exact folder.
directory_path = r'C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collector'  # Replace with your directory path
# Prefix a cell value must start with to be collected.
base_url = 'https://www.eversports.de/s/'
# `all_matching_urls` is read again by the following cells, so keep this name.
all_matching_urls = process_all_csv_files(directory_path, base_url)
print(all_matching_urls)


Processing file: C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collector\1_PoleStudio_Overview_Didi_2023-05-03_01-18.csv
Processing file: C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collector\1_PoleStudio_Overview_Didi_2023-05-03_19-32.csv
Processing file: C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collector\1_PoleStudio_Overview_Didi_2023-05-05_16-01.csv
Processing file: C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collector\1_PoleStudio_Overview_Didi_2023-05-06_13-24.csv
Processing file: C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collector\1_PoleStudio_Overview_Didi_2023-05-06_21-48.csv
Processing file: C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collector\1_PoleStudio_Overview_Didi_2023-05-07_13-43.csv
Processing file: C:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\Explo\CSV_Collecto

In [17]:
# Count distinct URLs; the collection keeps duplicates, so this can be far
# smaller than len(all_matching_urls).
len(set(all_matching_urls))

559

In [18]:
# Display the collected URLs (duplicates included) for a visual check.
all_matching_urls

0       https://www.eversports.de/s/pole-faction-pole-...
1       https://www.eversports.de/s/pole-dance-palazzo...
2            https://www.eversports.de/s/balance-neumarkt
3                  https://www.eversports.de/s/aerialflow
4       https://www.eversports.de/s/seemannsbraut-pole...
                              ...                        
7003           https://www.eversports.de/s/Star-Poledance
7004    https://www.eversports.de/s/Polemotions-Filder...
7005          https://www.eversports.de/s/The-Pole-Jungle
7006           https://www.eversports.de/s/Polestructions
7007       https://www.eversports.de/s/Pole-Dance-Krefeld
Length: 7008, dtype: object

In [None]:
# Write the collected URLs to a CSV file in the current working directory.
# NOTE(review): duplicates are not removed before saving, and to_csv's default
# index=True also writes the integer index as a column — confirm both are intended.
all_matching_urls.to_csv("all_found_urls_23.12.23.csv")