V1

In [None]:
import pandas as pd
from PyCaller import process_urls
from tqdm import tqdm

def process_and_print_results(url):
    """Run ``process_urls`` for a single URL and print a short summary.

    Args:
        url: The URL to process; it is passed to ``process_urls`` wrapped in
            a one-element list.

    Returns:
        The object returned by ``process_urls`` when it is truthy, otherwise
        ``None``. The loop below treats it as a mapping of result name ->
        DataFrame (or ``None``).
    """
    print(f"Processing URL: {url}")
    scraped = process_urls([url])

    # Guard clause: nothing came back for this URL.
    if not scraped:
        print(f"No data found for URL: {url}")
        return None

    print(f"\nResults for URL: {url}")
    for name, frame in scraped.items():
        if frame is None or frame.empty:
            continue
        # Only the prettified key is printed as a heading, followed by blank
        # lines; the frame contents themselves are not shown.
        heading = name.replace('_', ' ').title()
        print(f"{heading}:")
        print("\n")
    return scraped

def main():
    """Process the first 10 unique URLs and aggregate the results.

    Reads ``all_found_urls_23.12.23_cleaned.csv``, takes the first 10 unique
    values of its ``"0"`` column (in file order, so the selection is the same
    on every run), calls ``process_and_print_results`` for each one and
    collects the non-empty ``pole_studio_data``, ``workshops_data`` and
    ``workshop_details`` frames.

    Returns:
        A tuple ``(all_pole_studio_data, all_workshops_data,
        all_workshop_details_data)`` of DataFrames. A frame is empty when no
        URL produced data for that key.
    """
    # Load initial URLs. drop_duplicates keeps first occurrences in order,
    # unlike list(set(...)), whose ordering can change between kernel restarts.
    all_found_urls_s = pd.read_csv("all_found_urls_23.12.23_cleaned.csv")
    initial_urls = all_found_urls_s["0"].drop_duplicates().tolist()[:10]

    # Collect per-URL frames in lists and concatenate once at the end instead
    # of re-copying the growing DataFrame on every iteration.
    collected = {
        'pole_studio_data': [],
        'workshops_data': [],
        'workshop_details': [],
    }

    # Process each URL
    total_urls = len(initial_urls)
    for i, url in tqdm(enumerate(initial_urls, start=1), total=total_urls, desc="Processing URLs"):
        print(f"Processing URL {i}/{total_urls}: {url}")
        data = process_and_print_results(url)
        if data:
            for key, frames in collected.items():
                df = data.get(key)
                # A key may be missing or map to None; process_and_print_results
                # guards against None values too, so do the same here.
                if df is not None and not df.empty:
                    frames.append(df)

        # Display progress
        progress_percent = (i / total_urls) * 100
        print(f"Progress: {progress_percent:.2f}%\n")

    def _combine(frames):
        # pd.concat raises on an empty list, so fall back to an empty frame.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    all_pole_studio_data = _combine(collected['pole_studio_data'])
    all_workshops_data = _combine(collected['workshops_data'])
    all_workshop_details_data = _combine(collected['workshop_details'])

    # # Export DataFrames to CSV files
    # all_pole_studio_data.to_csv("Pole_Studio_Übersicht_S.csv", index=False)
    # all_workshops_data.to_csv("Workshop_Liste_SW.csv", index=False)
    # all_workshop_details_data.to_csv("Workshop_Übersicht_E.csv", index=False)

    # Return the aggregated frames so the work is not thrown away.
    return all_pole_studio_data, all_workshops_data, all_workshop_details_data

# Entry point: run the V1 pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


URLS Cleaner

In [None]:
import pandas as pd

def clean_and_unique_urls(file_path):
    """Lowercase and de-duplicate the URLs of a two-column CSV file.

    The file is read without a header row (every line, including a header
    line if one is present, is treated as data) into the columns ``index``
    and ``url``. URLs are lowercased, rows with a duplicate ``url`` are
    dropped (first occurrence wins) and the result is written, without header
    or pandas index, next to the input file.

    Args:
        file_path: Path of the CSV file to clean.

    Returns:
        The path of the written file: ``<name>_cleaned<ext>``, e.g.
        ``urls.csv`` -> ``urls_cleaned.csv``.
    """
    # Local import so this block does not depend on the cell's import lines.
    import os

    # Read the CSV file
    all_found_urls_s = pd.read_csv(file_path, header=None, names=['index', 'url'])

    # Ensure all URLs are in lowercase
    all_found_urls_s['url'] = all_found_urls_s['url'].str.lower()

    # Remove duplicate URLs
    all_found_urls_s = all_found_urls_s.drop_duplicates(subset=['url'])

    # Build the output name from the real extension only. A plain
    # str.replace('.csv', ...) would rewrite every ".csv" occurrence in the
    # path and, when the path has no lowercase ".csv" at all, would return the
    # input path unchanged and overwrite the source file.
    root, ext = os.path.splitext(file_path)
    cleaned_file_path = f"{root}_cleaned{ext}"

    # Save the cleaned and unique URLs to a new CSV file
    all_found_urls_s.to_csv(cleaned_file_path, index=False, header=False)

    return cleaned_file_path

# Entry point: clean the raw URL list and report where the result was written.
# Runs only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    # Example usage
    input_file_path = "all_found_urls_23.12.23.csv"
    cleaned_file_path = clean_and_unique_urls(input_file_path)
    print(f"Cleaned and unique URLs saved to: {cleaned_file_path}")


V2

In [None]:
import pandas as pd
from PyCaller import process_urls
from tqdm import tqdm

def process_and_print_results(url):
    """Run ``process_urls`` for one URL and print an entry count per result.

    Args:
        url: The URL to process; it is passed to ``process_urls`` wrapped in
            a one-element list.

    Returns:
        None. One line is printed for every non-empty, non-``None`` frame in
        the result; nothing is printed when the result is falsy.
    """
    scraped = process_urls([url])
    if not scraped:
        return

    for name, frame in scraped.items():
        if frame is None or frame.empty:
            continue
        label = name.replace('_', ' ').title()
        print(f"{label}: {len(frame)} entries")

def main():
    """Process the first 100 unique URLs, printing per-URL entry counts.

    Reads ``all_found_urls_23.12.23_cleaned.csv`` and takes the first 100
    unique values of its ``"0"`` column in file order, so the selection is
    the same on every run. Each URL is passed to
    ``process_and_print_results`` while a tqdm bar tracks progress. Nothing
    is aggregated or returned in this version.
    """
    # Load initial URLs. drop_duplicates keeps first occurrences in order,
    # unlike list(set(...)), whose ordering can change between kernel restarts.
    all_found_urls_s = pd.read_csv("all_found_urls_23.12.23_cleaned.csv")
    initial_urls = all_found_urls_s["0"].drop_duplicates().tolist()[:100]

    # Process each URL. The three accumulator DataFrames and the enumerate
    # index of the earlier draft were never read, so they are gone.
    with tqdm(total=len(initial_urls), desc="Processing URLs") as pbar:
        for url in initial_urls:
            process_and_print_results(url)
            pbar.update(1)

# Entry point: run the V2 pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


V3

In [9]:
import pandas as pd
from PyCaller import process_urls
from tqdm import tqdm

def process_and_print_results(url, all_pole_studio_data, all_workshops_data, all_workshop_details_data):
    """Process one URL and append its results to the running DataFrames.

    Args:
        url: The URL to process; it is passed to ``process_urls`` wrapped in
            a one-element list.
        all_pole_studio_data: Accumulated frame for the ``pole_studio_data`` key.
        all_workshops_data: Accumulated frame for the ``workshops_data`` key.
        all_workshop_details_data: Accumulated frame for the
            ``workshop_details`` key.

    Returns:
        The three accumulators in the same order as the parameters. An
        accumulator is replaced by a new concatenated frame only when the
        result holds a non-empty frame under its key; otherwise the object
        passed in is returned unchanged.
    """
    # Lookup table: result key -> current accumulator.
    accumulated = {
        'pole_studio_data': all_pole_studio_data,
        'workshops_data': all_workshops_data,
        'workshop_details': all_workshop_details_data,
    }

    scraped = process_urls([url])
    if scraped:
        for name, frame in scraped.items():
            if frame is None or frame.empty:
                continue
            print(f"{name.replace('_', ' ').title()}: {len(frame)} entries")

            # Keys without an accumulator are reported above but not stored.
            if name in accumulated:
                accumulated[name] = pd.concat([accumulated[name], frame], ignore_index=True)

    return (
        accumulated['pole_studio_data'],
        accumulated['workshops_data'],
        accumulated['workshop_details'],
    )

def main():
    """Process the first 10 unique URLs and return the aggregated results.

    Reads ``all_found_urls_23.12.23_cleaned.csv`` and takes the first 10
    unique values of its ``"0"`` column in file order, so the selection is
    the same on every run. Each URL is passed to
    ``process_and_print_results`` together with the three running frames.

    Returns:
        A tuple ``(all_pole_studio_data, all_workshops_data,
        all_workshop_details_data)`` of DataFrames.
    """
    # Load initial URLs. drop_duplicates keeps first occurrences in order,
    # unlike list(set(...)), whose ordering can change between kernel restarts.
    all_found_urls_s = pd.read_csv("all_found_urls_23.12.23_cleaned.csv")
    initial_urls = all_found_urls_s["0"].drop_duplicates().tolist()[:10]

    # Initialize DataFrames
    all_pole_studio_data = pd.DataFrame()
    all_workshops_data = pd.DataFrame()
    all_workshop_details_data = pd.DataFrame()

    # Process each URL with tqdm
    with tqdm(total=len(initial_urls), desc="Processing URLs", dynamic_ncols=True) as pbar:
        for url in initial_urls:
            # Set the label before the work starts so "Current URL" names the
            # URL being processed rather than the one that just finished.
            pbar.set_postfix_str(f"Current URL: {url}", refresh=True)
            all_pole_studio_data, all_workshops_data, all_workshop_details_data = process_and_print_results(
                url, all_pole_studio_data, all_workshops_data, all_workshop_details_data
            )
            pbar.update(1)

    # # Export DataFrames to CSV files
    # all_pole_studio_data.to_csv("Pole_Studio_Übersicht_S.csv", index=False)
    # all_workshops_data.to_csv("Workshop_Liste_SW.csv", index=False)
    # all_workshop_details_data.to_csv("Workshop_Übersicht_E.csv", index=False)

    # Return the final DataFrames
    return all_pole_studio_data, all_workshops_data, all_workshop_details_data

# Run the pipeline at cell level (no __main__ guard) and bind the three
# DataFrames returned by main() to notebook-level names.
result_pole_studio, result_workshops, result_workshop_details = main()

# The results can now be inspected outside the function, for example:
# print(result_pole_studio)
# print(result_workshops)
# print(result_workshop_details)


Processing URLs:   0%|          | 0/10 [00:00<?, ?it/s]

Starting URL reconstruction...


Processing URLs:  10%|█         | 1/10 [00:02<00:22,  2.45s/it, Current URL: https://www.eversports.de/s/dance-%26-fly-pole-studio]

Error during HTTP request: 404 Client Error: Not Found for url: https://www.eversports.de/s/dance-%26-fly-pole-studio
Converting to DataFrame...
Validating URLs...
Processing completed.
Starting URL reconstruction...


Processing URLs:  20%|██        | 2/10 [00:03<00:15,  1.90s/it, Current URL: https://www.eversports.de/s/polefriends]              

Error during HTTP request: 404 Client Error: Not Found for url: https://www.eversports.de/s/polefriends
Converting to DataFrame...
Validating URLs...
Processing completed.
Starting URL reconstruction...


2024-02-02 22:36:30,196 - INFO - Processing URL 1/3: https://www.eversports.de/s/loft1-basel-city


Converting to DataFrame...
Validating URLs...


2024-02-02 22:36:31,784 - INFO - URL https://www.eversports.de/s/loft1-basel-city is valid.
2024-02-02 22:36:31,785 - INFO - Processing URL 2/3: https://www.eversports.de/sp/loft1-basel-city
2024-02-02 22:36:33,087 - INFO - URL https://www.eversports.de/sp/loft1-basel-city is not valid.
2024-02-02 22:36:33,089 - INFO - Processing URL 3/3: https://www.eversports.de/s/loft1-basel-city/team
2024-02-02 22:36:34,384 - INFO - URL https://www.eversports.de/s/loft1-basel-city/team is valid.


Scraping Pole Studio Data from https://www.eversports.de/s/loft1-basel-city...


Processing URLs:  30%|███       | 3/10 [00:11<00:30,  4.32s/it, Current URL: https://www.eversports.de/s/loft1-basel-city]

Processing completed.
Pole Studio Data: 1 entries
Starting URL reconstruction...


Processing URLs:  40%|████      | 4/10 [00:12<00:18,  3.10s/it, Current URL: https://www.eversports.de/s/move-with-ana]   

Error during HTTP request: 404 Client Error: Not Found for url: https://www.eversports.de/s/move-with-ana
Converting to DataFrame...
Validating URLs...
Processing completed.
Starting URL reconstruction...


Processing URLs:  50%|█████     | 5/10 [00:13<00:12,  2.46s/it, Current URL: https://www.eversports.de/s/luftfabrik-dresden]

Error during HTTP request: 404 Client Error: Not Found for url: https://www.eversports.de/s/luftfabrik-dresden
Converting to DataFrame...
Validating URLs...
Processing completed.
Starting URL reconstruction...


2024-02-02 22:36:39,892 - INFO - Processing URL 1/3: https://www.eversports.de/s/tanzschule-poledance-reutlingen


Converting to DataFrame...
Validating URLs...


2024-02-02 22:36:41,392 - INFO - URL https://www.eversports.de/s/tanzschule-poledance-reutlingen is valid.
2024-02-02 22:36:41,394 - INFO - Processing URL 2/3: https://www.eversports.de/sp/tanzschule-poledance-reutlingen
2024-02-02 22:36:42,608 - INFO - URL https://www.eversports.de/sp/tanzschule-poledance-reutlingen is valid.
2024-02-02 22:36:42,609 - INFO - Processing URL 3/3: https://www.eversports.de/s/tanzschule-poledance-reutlingen/team
2024-02-02 22:36:44,480 - INFO - URL https://www.eversports.de/s/tanzschule-poledance-reutlingen/team is valid.


Scraping Pole Studio Data from https://www.eversports.de/s/tanzschule-poledance-reutlingen...


Processing URLs:  60%|██████    | 6/10 [00:20<00:15,  3.88s/it, Current URL: https://www.eversports.de/s/tanzschule-poledance-reutlingen]

Processing completed.
Pole Studio Data: 1 entries
Starting URL reconstruction...


Processing URLs:  70%|███████   | 7/10 [00:24<00:12,  4.02s/it, Current URL: https://www.eversports.de/s/polesports-studio-l%c3%bcneburg]

Error during HTTP request: Exceeded 30 redirects.
Converting to DataFrame...
Validating URLs...
Processing completed.
Starting URL reconstruction...


Processing URLs:  80%|████████  | 8/10 [00:28<00:07,  3.90s/it, Current URL: https://www.eversports.de/s/schönheitstanz-studio]          

Error during HTTP request: Exceeded 30 redirects.
Converting to DataFrame...
Validating URLs...
Processing completed.
Starting URL reconstruction...


Processing URLs:  90%|█████████ | 9/10 [00:32<00:03,  3.94s/it, Current URL: https://www.eversports.de/s/dance-moves-wolfenb%c3%bcttel]

Error during HTTP request: Exceeded 30 redirects.
Converting to DataFrame...
Validating URLs...
Processing completed.
Starting URL reconstruction...


Processing URLs: 100%|██████████| 10/10 [00:33<00:00,  3.35s/it, Current URL: https://www.eversports.de/s/health-and-shape]             

Error during HTTP request: 404 Client Error: Not Found for url: https://www.eversports.de/s/health-and-shape
Converting to DataFrame...
Validating URLs...
Processing completed.





In [10]:
result_pole_studio

Unnamed: 0,PoleStudio_Name,Adresse,PLZ,Stadt,Straße,Buttons,Pole Studio Beschreibung,E-Mail,Homepage,Telefon,URL_S,Art,Angebot,Created Date,Updated Date
0,LOFT1 BASEL CITY,"[Centralbahnplatz 10 , 4051 Basel]",4051,Basel,Centralbahnplatz 10,"[Übersicht, Klassen, Videos, Preise, Team]",Hell und freundlich und mit 13 Stangen ausgest...,info@loft1.ch,https://www.loft1.ch/studio-basel-2-2/,,https://www.eversports.de/s/loft1-basel-city,"[Fitness, Poledance, Poledance, Fitness, Fitne...",2 Angebote für Neukund:innen,2024-02-02 22:36:35,2024-02-02 22:36:35
1,Tanzschule Poledance Reutlingen,"[Uhlandstraße 60 , 72793 Pfullingen]",72793,Pfullingen,Uhlandstraße 60,"[Übersicht, Klassen, Videos, Preise, Team]",Die Tanzschule Poledance Reutlingen befindet s...,info@poledance-reutlingen.de,https://www.poledance-reutlingen.de,,https://www.eversports.de/s/tanzschule-poledan...,"[Rückenfit, Poledance, Poledance, Crossbody (H...",1 Angebot für Neukund:innen,2024-02-02 22:36:45,2024-02-02 22:36:45


In [11]:
result_workshop_details

In [12]:
from b_URLS_Validation import validated_urls

ImportError: cannot import name 'validated_urls' from 'b_URLS_Validation' (c:\Users\hamud\Documents\GitHub\1_Latest_version_Hop_Scrapper_V5\d_Test_Refactor\Test\Py\b_URLS_Validation.py)