V1

In [None]:
import pandas as pd
from PyCaller import process_urls
from tqdm import tqdm

def process_and_print_results(url):
    """Run ``process_urls`` on a single URL and announce which tables came back.

    For every entry of the returned mapping whose value is neither ``None`` nor
    ``.empty``, the key is printed as a title-cased heading followed by blank
    lines. Only the headings are printed, not the table contents.

    Args:
        url: The URL handed (as a one-element list) to ``process_urls``.

    Returns:
        Whatever ``process_urls`` returned when that value is truthy,
        otherwise ``None``.
    """
    print(f"Processing URL: {url}")
    scraped = process_urls([url])

    # Guard clause: nothing came back for this URL.
    if not scraped:
        print(f"No data found for URL: {url}")
        return None

    print(f"\nResults for URL: {url}")
    for name, frame in scraped.items():
        if frame is None or frame.empty:
            continue
        heading = name.replace('_', ' ').title()
        print(f"{heading}:")
        print("\n")
    return scraped

def main():
    """Process the first ten distinct URLs of the cleaned list and collect the result tables.

    Reads ``all_found_urls_23.12.23_cleaned.csv``, takes the first ten distinct
    non-null values of its ``"0"`` column (in file order), runs
    ``process_and_print_results`` on each and gathers the ``pole_studio_data``,
    ``workshops_data`` and ``workshop_details`` tables it returns.

    Returns:
        tuple: ``(all_pole_studio_data, all_workshops_data, all_workshop_details_data)``,
        each a DataFrame (empty when nothing was collected for that table).
    """
    # Load initial URLs. drop_duplicates() keeps first-seen order, so the sample
    # of ten is identical on every run; slicing a set picked arbitrary members.
    all_found_urls_s = pd.read_csv("all_found_urls_23.12.23_cleaned.csv")
    initial_urls = all_found_urls_s["0"].dropna().drop_duplicates().tolist()[:10]

    # Collect the per-URL frames in lists and concatenate once at the end;
    # growing a DataFrame with concat inside the loop copies it on every pass.
    collected = {
        'pole_studio_data': [],
        'workshops_data': [],
        'workshop_details': [],
    }

    # Process each URL
    total_urls = len(initial_urls)
    for i, url in tqdm(enumerate(initial_urls, start=1), total=total_urls, desc="Processing URLs"):
        print(f"Processing URL {i}/{total_urls}: {url}")
        data = process_and_print_results(url)
        if data:
            for key, frames in collected.items():
                df = data[key] if key in data else None
                # A key may be absent or map to None; both mean "no table".
                if df is not None and not df.empty:
                    frames.append(df)

        # Display progress
        progress_percent = (i / total_urls) * 100
        print(f"Progress: {progress_percent:.2f}%\n")

    def _combine(frames):
        # pd.concat raises on an empty list, so fall back to an empty frame.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    all_pole_studio_data = _combine(collected['pole_studio_data'])
    all_workshops_data = _combine(collected['workshops_data'])
    all_workshop_details_data = _combine(collected['workshop_details'])

    # # Export DataFrames to CSV files
    # all_pole_studio_data.to_csv("Pole_Studio_Übersicht_S.csv", index=False)
    # all_workshops_data.to_csv("Workshop_Liste_SW.csv", index=False)
    # all_workshop_details_data.to_csv("Workshop_Übersicht_E.csv", index=False)

    # Hand the collected tables back so they are not lost when main() ends.
    return all_pole_studio_data, all_workshops_data, all_workshop_details_data

# Entry point for the V1 cell: call main() only when this code runs as the
# top-level module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


URL Cleaner

In [None]:
import pandas as pd

def clean_and_unique_urls(file_path):
    """Lower-case and de-duplicate the URLs of a two-column CSV file.

    The file is read without a header row and its two columns are named
    ``index`` and ``url``. URLs are lower-cased, rows whose URL was already
    seen are dropped (the first occurrence is kept) and the result is written,
    without header row or pandas index, next to the input file as
    ``<name>_cleaned<ext>``.

    Args:
        file_path: Path of the CSV file to clean (``str`` or ``os.PathLike``).

    Returns:
        str: Path of the cleaned file that was written.
    """
    import os

    file_path = os.fspath(file_path)

    # Read the CSV file
    urls_raw = pd.read_csv(file_path, header=None, names=['index', 'url'])

    # Ensure all URLs are in lowercase, then keep the first row per URL
    urls_clean = (
        urls_raw
        .assign(url=urls_raw['url'].str.lower())
        .drop_duplicates(subset=['url'])
    )

    # Insert the suffix in front of the extension only. A plain
    # str.replace('.csv', ...) rewrote every ".csv" in the path (directory
    # names included) and left paths with another extension unchanged, so the
    # input file itself was overwritten.
    root, ext = os.path.splitext(file_path)
    cleaned_file_path = f"{root}_cleaned{ext}"

    # Save the cleaned and unique URLs to a new CSV file
    urls_clean.to_csv(cleaned_file_path, index=False, header=False)

    return cleaned_file_path

# Entry point for the cleaner cell: runs only when this code is the top-level
# module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    # Example usage: clean the raw URL list and report where the result was written.
    input_file_path = "all_found_urls_23.12.23.csv"
    cleaned_file_path = clean_and_unique_urls(input_file_path)
    print(f"Cleaned and unique URLs saved to: {cleaned_file_path}")


V2

In [None]:
import pandas as pd
from PyCaller import process_urls
from tqdm import tqdm

def process_and_print_results(url):
    """Run ``process_urls`` on a single URL and print a row count per table.

    For every entry of the returned mapping whose value is neither ``None`` nor
    ``.empty``, one line ``"<Title Cased Key>: <n> entries"`` is printed.

    Args:
        url: The URL handed (as a one-element list) to ``process_urls``.

    Returns:
        None. The scraped data itself is not handed back to the caller.
    """
    scraped = process_urls([url])

    # Guard clause: nothing came back, so there is nothing to report.
    if not scraped:
        return

    for name, frame in scraped.items():
        if frame is None or frame.empty:
            continue
        label = name.replace('_', ' ').title()
        print(f"{label}: {len(frame)} entries")

def main():
    """Run ``process_and_print_results`` on the first 100 distinct URLs of the cleaned list.

    Reads ``all_found_urls_23.12.23_cleaned.csv`` and takes the first 100
    distinct non-null values of its ``"0"`` column in file order. Progress is
    shown with a tqdm bar. Nothing is collected or returned, because this
    version of ``process_and_print_results`` only prints.
    """
    # Load initial URLs. drop_duplicates() keeps first-seen order, so the sample
    # of 100 is identical on every run; slicing a set picked arbitrary members.
    all_found_urls_s = pd.read_csv("all_found_urls_23.12.23_cleaned.csv")
    initial_urls = all_found_urls_s["0"].dropna().drop_duplicates().tolist()[:100]

    # Process each URL; tqdm takes the total from len(initial_urls) and
    # advances the bar once per iteration.
    for url in tqdm(initial_urls, desc="Processing URLs"):
        process_and_print_results(url)

# Entry point for the V2 cell: call main() only when this code runs as the
# top-level module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


V3

In [3]:
import pandas as pd
from PyCaller import process_urls
from tqdm import tqdm

def process_and_print_results(url, all_pole_studio_data, all_workshops_data, all_workshop_details_data):
    """Run ``process_urls`` on one URL and append its tables to the running totals.

    Every entry of the returned mapping whose value is neither ``None`` nor
    ``.empty`` is reported as ``"<Title Cased Key>: <n> entries"``. Entries
    keyed ``pole_studio_data``, ``workshops_data`` or ``workshop_details`` are
    additionally concatenated (with a fresh index) onto the matching
    accumulator; other keys are only reported.

    Args:
        url: The URL handed (as a one-element list) to ``process_urls``.
        all_pole_studio_data: Accumulated ``pole_studio_data`` rows so far.
        all_workshops_data: Accumulated ``workshops_data`` rows so far.
        all_workshop_details_data: Accumulated ``workshop_details`` rows so far.

    Returns:
        tuple: The three accumulators in argument order. An accumulator that
        received no new rows is returned as the same object that was passed in.
    """
    # Accumulators keyed by the result key that feeds them.
    totals = {
        'pole_studio_data': all_pole_studio_data,
        'workshops_data': all_workshops_data,
        'workshop_details': all_workshop_details_data,
    }

    scraped = process_urls([url])
    if scraped:
        for name, frame in scraped.items():
            if frame is None or frame.empty:
                continue
            print(f"{name.replace('_', ' ').title()}: {len(frame)} entries")

            # Update the appropriate DataFrame
            if name in totals:
                totals[name] = pd.concat([totals[name], frame], ignore_index=True)

    return totals['pole_studio_data'], totals['workshops_data'], totals['workshop_details']

def main():
    """Process every distinct URL of the cleaned list and return the collected tables.

    Reads ``all_found_urls_23.12.23_cleaned_cleaned.csv``, takes the distinct
    non-null values of its ``"0"`` column in file order and threads three
    accumulator DataFrames through ``process_and_print_results`` for each URL,
    showing progress with a tqdm bar.

    Returns:
        tuple: ``(all_pole_studio_data, all_workshops_data, all_workshop_details_data)``.

    Raises:
        KeyError: If the CSV has no column named ``"0"``; the message lists the
            columns that were found.
    """
    # Load initial URLs
    source_csv = "all_found_urls_23.12.23_cleaned_cleaned.csv"
    all_found_urls_s = pd.read_csv(source_csv)
    if "0" not in all_found_urls_s.columns:
        # Same exception type as the bare lookup, but with enough context to
        # see what the file's header row actually contained.
        raise KeyError(
            f"Column '0' not found in {source_csv}; "
            f"available columns: {list(all_found_urls_s.columns)}"
        )
    # drop_duplicates() keeps first-seen order, so URLs (and therefore result
    # rows) come out in the same order on every run, unlike list(set(...)).
    initial_urls = all_found_urls_s["0"].dropna().drop_duplicates().tolist()

    # Initialize DataFrames
    all_pole_studio_data = pd.DataFrame()
    all_workshops_data = pd.DataFrame()
    all_workshop_details_data = pd.DataFrame()

    # Process each URL with tqdm
    with tqdm(total=len(initial_urls), desc="Processing URLs", dynamic_ncols=True) as pbar:
        for url in initial_urls:
            # Label the bar before processing so "Current URL" names the URL
            # being worked on rather than the one that just finished.
            pbar.set_postfix_str(f"Current URL: {url}", refresh=True)
            all_pole_studio_data, all_workshops_data, all_workshop_details_data = process_and_print_results(
                url, all_pole_studio_data, all_workshops_data, all_workshop_details_data
            )
            pbar.update(1)

    # # Export DataFrames to CSV files
    # all_pole_studio_data.to_csv("Pole_Studio_Übersicht_S.csv", index=False)
    # all_workshops_data.to_csv("Workshop_Liste_SW.csv", index=False)
    # all_workshop_details_data.to_csv("Workshop_Übersicht_E.csv", index=False)

    # Return the final DataFrames
    return all_pole_studio_data, all_workshops_data, all_workshop_details_data

# Run main() and keep the three DataFrames it returns as notebook-level
# variables so that later cells can inspect them.
result_pole_studio, result_workshops, result_workshop_details = main()

# The results are now available outside the function, for example:
# print(result_pole_studio)
# print(result_workshops)
# print(result_workshop_details)


KeyError: '0'

In [10]:
result_pole_studio

Unnamed: 0,PoleStudio_Name,Adresse,PLZ,Stadt,Straße,Buttons,Pole Studio Beschreibung,E-Mail,Homepage,Telefon,URL_S,Art,Angebot,Created Date,Updated Date
0,LOFT1 BASEL CITY,"[Centralbahnplatz 10 , 4051 Basel]",4051,Basel,Centralbahnplatz 10,"[Übersicht, Klassen, Videos, Preise, Team]",Hell und freundlich und mit 13 Stangen ausgest...,info@loft1.ch,https://www.loft1.ch/studio-basel-2-2/,,https://www.eversports.de/s/loft1-basel-city,"[Fitness, Poledance, Poledance, Fitness, Fitne...",2 Angebote für Neukund:innen,2024-02-02 22:36:35,2024-02-02 22:36:35
1,Tanzschule Poledance Reutlingen,"[Uhlandstraße 60 , 72793 Pfullingen]",72793,Pfullingen,Uhlandstraße 60,"[Übersicht, Klassen, Videos, Preise, Team]",Die Tanzschule Poledance Reutlingen befindet s...,info@poledance-reutlingen.de,https://www.poledance-reutlingen.de,,https://www.eversports.de/s/tanzschule-poledan...,"[Rückenfit, Poledance, Poledance, Crossbody (H...",1 Angebot für Neukund:innen,2024-02-02 22:36:45,2024-02-02 22:36:45


In [11]:
result_workshop_details