In [None]:
import s3fs

# S3 filesystem handle pointed at a custom endpoint (the SSP Cloud MinIO host,
# going by the URL) instead of the default AWS endpoint.
fs = s3fs.S3FileSystem(
    client_kwargs=dict(endpoint_url="https://minio.lab.sspcloud.fr"),
)

In [None]:
# Keep only the entries listed under the prefix whose name ends in ".parquet".
files = [
    path
    for path in fs.ls("lgaliana/cyclisme/")
    if path.endswith(".parquet")
]

In [None]:
import s3fs
import pandas as pd 

# Filesystem handle on the custom S3 endpoint (re-created here so this cell
# can run on its own).
fs = s3fs.S3FileSystem(
    client_kwargs=dict(endpoint_url="https://minio.lab.sspcloud.fr"),
)

# Parquet objects listed under the bucket prefix.
files = [
    path
    for path in fs.ls("lgaliana/cyclisme/")
    if path.endswith(".parquet")
]

# Read every parquet object into its own frame, then stack them row-wise.
dataframes = []
for parquet_path in files:
    with fs.open(parquet_path, "rb") as handle:
        dataframes.append(pd.read_parquet(handle))

list_files = pd.concat(dataframes)

In [None]:
# One row per object already present under the split-GeoJSON prefix.
downloaded_paths = fs.ls("lgaliana/cyclisme/data/geojson/split/")
list_downloaded = pd.DataFrame(downloaded_paths, columns=["downloaded"])

In [None]:
# Rebuild, for each stored object, the GPX URL it is compared against later:
# take the object's file name, prefix it with the site's GPX folder, and swap
# "geojson" for "gpx".
downloaded_names = list_downloaded["downloaded"].str.rsplit("/").str[-1]
downloaded_urls = "https://www.cols-cyclisme.com/gpx/" + downloaded_names
list_downloaded["file"] = downloaded_urls.str.replace("geojson", "gpx")

In [None]:
# Left join: every row of list_files is kept; "downloaded" stays null when no
# stored object maps back to that row's "id".
list_all = pd.merge(
    list_files,
    list_downloaded,
    how="left",
    left_on="id",
    right_on="file",
)

In [None]:
# Rows with no match in the downloaded listing, i.e. still to be fetched.
missing_mask = list_all["downloaded"].isna()
non_downloaded = list_all[missing_mask]

In [None]:
import geopandas as gpd
from pathlib import Path
from scraping import extract_info_col, get_gpx_from_url
import time

# Local folder for GPX files.
# NOTE(review): presumably where get_gpx_from_url stores its downloads —
# confirm in scraping.py.
Path("./gpx").mkdir(parents=True, exist_ok=True)

# Collect the per-row results in plain lists and concatenate once after the
# loop; growing a DataFrame with pd.concat on every iteration copies all
# previously accumulated rows each time.
detail_parts = []
trace_parts = []

# Scrape every row that still lacks a downloaded trace.
for index, row in non_downloaded.iterrows():
    if pd.isnull(row["id"]):
        # No GPX address for this row: nothing to fetch.
        continue

    print(f"{index}, {row['id']}")

    # Page-level details, tagged with the GPX address they belong to.
    col_info_df = extract_info_col(row["href"], id=index)
    col_info_df["url"] = row["id"]

    trace = get_gpx_from_url(row["id"])

    detail_parts.append(col_info_df)
    trace_parts.append(trace)

    time.sleep(1)  # Sleep for 1 second between requests

# The empty seed frames keep the result well-defined (and of the same type as
# before) when nothing was scraped; pd.concat silently drops None entries.
details_df = pd.concat([pd.DataFrame(), *detail_parts], ignore_index=True)
traces = pd.concat([gpd.GeoDataFrame(), *trace_parts])

In [None]:
# Work on a copy so the column edits below do not touch non_downloaded.
df = non_downloaded.copy()

# Keep the original "id" value (the GPX address) under "url"; "id" becomes the
# last path segment with ".gpx" removed.
df["url"] = df["id"]
# regex=False: ".gpx" must be matched literally. The default of `regex` differs
# between pandas versions, and as a regex the "." would match any character.
df["id"] = df["url"].str.rsplit("/").str[-1].str.replace(".gpx", "", regex=False)

In [None]:
import os
import requests
from scraping import create_geojson_from_gpx, get_max_altitude_rows

# Local output folders: one for the downloaded profile images, one for derived
# data files.
os.makedirs("images", exist_ok=True)
os.makedirs("data/derived/", exist_ok=True)

# ----------------------------------
# Download each profile image once into ./images

for url in df["Profil Image URL"]:
    # Missing values (NaN/None) are not strings; treat them like the explicit
    # "Not available" marker instead of crashing on url.split below.
    if not isinstance(url, str) or url == "Not available":
        print("Profil Image URL is not available.")
        continue

    # The image is stored under the last path segment of its URL.
    filename = url.split("/")[-1]

    # Skip files fetched by an earlier run.
    if os.path.exists(f"images/{filename}"):
        print(f"Image already exists: {filename}")
        continue

    time.sleep(1)  # pause before each request

    try:
        # A timeout keeps one unresponsive request from hanging the notebook.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Best effort: report the failure and carry on with the next image.
        print(f"Failed to download image: {filename} ({exc})")
        continue

    if response.status_code == 200:
        # Save the image locally
        with open(f"images/{filename}", "wb") as file:
            file.write(response.content)
        print(f"Image downloaded: {filename}")
    else:
        print(f"Failed to download image: {filename}")


In [None]:
# ---------------------------------------
# create geojson for climbing ascent
from unidecode import unidecode

filename_summits = "missed.geojson"

# NOTE(review): both helpers come from the project's scraping module. The code
# below relies on the result having a "url" column and supporting .to_file
# (i.e. being a GeoDataFrame) — confirm against scraping.py.
geojsons = create_geojson_from_gpx()
df_max_alt = get_max_altitude_rows(geojsons)

# Merge key: last path segment of "url" with ".gpx" removed. regex=False makes
# the "." literal regardless of the pandas version's default.
df_max_alt["id"] = (
    df_max_alt["url"].str.rsplit("/").str[-1].str.replace(".gpx", "", regex=False)
)
# Drop "url" so the merge does not produce suffixed duplicates of df's "url".
df_max_alt = df_max_alt.drop("url", axis="columns")
df_max_alt = df_max_alt.merge(df, on="id")


# Normalise column names to lowercase ASCII snake_case.
sanitized_columns = (
    df_max_alt.columns.map(unidecode)  # Transliterate characters to ASCII
    .str.replace("%", "percent", regex=False)  # Replace '%' with 'percent'
    .str.lower()  # Convert to lowercase
    .str.replace(
        r"[^a-zA-Z0-9_\s]", "", regex=True
    )  # Remove special characters except underscores and spaces
    .str.replace(" ", "_", regex=False)  # Replace spaces with underscores
)
df_max_alt.columns = sanitized_columns

# Strip whitespace and unit markers from the measurement columns, then parse
# what is left as floats.
columns_to_sanitize = [
    "altitude",
    "longueur",
    "denivellation",
    "percent_moyen",
    "percent_maximal",
]
df_max_alt[columns_to_sanitize] = (
    df_max_alt.loc[:, columns_to_sanitize]
    .replace({r"\s*": "", "km": "", "m": "", "%": ""}, regex=True)
    .astype(float)
)

# Turn the warning text into a boolean flag.
if 'vtt' in df_max_alt.columns:
    df_max_alt['vtt'] = (
        df_max_alt['vtt'] == "ATTENTION : cette ascension nécéssite l'utilisation d'un VTT"
    )
if 'ouverture' in df_max_alt.columns:
    df_max_alt = df_max_alt.drop("ouverture", axis="columns")

# Bucket climbs by "denivellation" using left-closed bins; values below 80
# fall outside every bin and are dropped just below.
df_max_alt["category"] = pd.cut(
    df_max_alt["denivellation"],
    right=False,
    bins=[80, 160, 320, 640, 800, float("inf")],
    labels=["Cat 4", "Cat 3", "Cat 2", "Cat 1", "HC"]
)
df_max_alt = df_max_alt.dropna(subset=["category"])
df_max_alt["category"] = df_max_alt["category"].astype(str)

# Exclude rows whose "massif" mentions Canada or Réunion. The patterns are
# plain substrings (regex=False): written as '(Réunion, France)' the
# parentheses would be parsed as a regex group rather than matched literally.
# na=False treats a missing "massif" as "no match" so the negation below works.
massif = df_max_alt["massif"]
excluded = (
    massif.str.contains("Canada", regex=False, na=False)
    | massif.str.contains("Réunion, France", regex=False, na=False)
)
df_max_alt = df_max_alt.loc[~excluded]

# Explicit driver, matching how the per-route files are written.
df_max_alt.to_file(filename_summits, driver="GeoJSON")

In [None]:
from shapely.geometry import LineString

# Create routes ----------------------------------

# NOTE(review): the code below relies on `routes` having a "url" column and a
# "geometry" column of point geometries (one row per track point) — confirm
# against create_geojson_from_gpx in scraping.py.
routes = create_geojson_from_gpx(three_dim=True)

# A LineString needs at least two points; set aside URLs that have fewer so a
# single degenerate track does not abort the whole export.
point_counts = routes.groupby("url")["geometry"].size()
too_short = point_counts[point_counts < 2].index
if len(too_short) > 0:
    print(f"Skipping routes with fewer than 2 points: {list(too_short)}")
drawable_routes = routes.loc[~routes["url"].isin(too_short)]

# Collapse the points of each URL, in row order, into a single line geometry.
route_lines = drawable_routes.groupby("url")["geometry"].apply(
    lambda points: LineString(points.tolist())
)

# One-row-per-URL GeoDataFrame (indexed by "url"), regrouped so each URL can
# be written to its own file.
split_routes = gpd.GeoDataFrame(route_lines, geometry="geometry")
split_routes = split_routes.groupby("url")

# Write each split into a separate .geojson file
for value, group in split_routes:
    filename = value.replace(".gpx", "")
    file_path = f"data/derived/{filename}.geojson"
    group.to_file(file_path, driver="GeoJSON")
