Libraries


In [1]:
import logging
import requests
from pathlib import Path
import time
from typing import Callable, Any
import backoff
from functools import wraps
from kaggle.api.kaggle_api_extended import KaggleApi
from joblib import Parallel, delayed


Dataset Processing


In [2]:
class Config:
    """Static settings for the Kaggle dataset download workflow."""

    # --- Dataset -----------------------------------------------------------
    # Dataset identifier handed to the Kaggle download call.
    KAGGLE_DATASET: str = "shuyangli94/food-com-recipes-and-user-interactions"
    # Directory the dataset is downloaded and unzipped into.
    DATASET_DESTINATION_PATH: Path = Path("../data")

    # --- Retry policy ------------------------------------------------------
    # Default ``max_tries`` for ``retry_on_failure``.
    RETRY_ATTEMPTS: int = 2
    # Exception types that ``retry_on_failure`` retries on by default.
    RETRY_EXCEPTIONS: tuple = (requests.exceptions.RequestException, OSError)

    # --- Logging -----------------------------------------------------------
    LOG_LEVEL: int = logging.INFO
    LOG_FORMAT: str = '%(message)s'
    # NOTE: a date format only affects %(asctime)s, which LOG_FORMAT does not
    # currently include, so this value has no visible effect as configured.
    LOG_DATE_FORMAT: str = '%Y-%m-%d %H:%M:%S'

    # --- Parallelism -------------------------------------------------------
    N_JOBS: int = -1  # Use all available cores

# Set up the root logger once, using the values declared on Config.
logging.basicConfig(
    level=Config.LOG_LEVEL,
    format=Config.LOG_FORMAT,
    datefmt=Config.LOG_DATE_FORMAT,
)

def execution_time(func: Callable[..., Any]) -> Callable[..., Any]:
    """Decorate ``func`` so that every completed call logs its duration.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same metadata as ``func`` (via ``functools.wraps``)
        that forwards all arguments, logs ``"<name> executed in <s> seconds."``
        at INFO level, and returns ``func``'s result unchanged.

    Note:
        If ``func`` raises, the exception propagates and no duration is logged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs) -> Any:
        # perf_counter() is a monotonic, high-resolution clock, so the
        # measured interval cannot be skewed (or go negative) when the system
        # wall clock is adjusted mid-call, unlike time.time().
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        logging.info(f"{func.__name__} executed in {elapsed:.2f} seconds.")
        return result
    return wrapper

def retry_on_failure(max_tries=Config.RETRY_ATTEMPTS, exceptions=Config.RETRY_EXCEPTIONS):
    """Build a decorator that retries a function with exponential backoff.

    Args:
        max_tries: Maximum number of attempts, passed to
            ``backoff.on_exception`` as ``max_tries``.
        exceptions: Exception type (or tuple of types) that triggers a retry.

    Returns:
        A decorator. The decorated function is retried on ``exceptions`` using
        ``backoff.expo`` wait times; each retry is logged as a warning and the
        final give-up is logged as an error.
    """
    def decorator(func):
        def log_retry(details):
            # Called by backoff before it sleeps ahead of the next attempt.
            logging.warning(
                f"Retry {details['tries']}/{max_tries} for {func.__name__} due to error, waiting {details.get('wait', 0):0.1f} seconds.")

        def log_giveup(details):
            # Called by backoff once no further attempts will be made.
            logging.error(
                f"Giving up {func.__name__} after {details['tries']} tries")

        # Decorate ``func`` itself instead of an inner pass-through function:
        # backoff then sees the real target, so the target it reports in its
        # own log records and handler ``details`` is ``func`` rather than a
        # function named "wrapper", and coroutine functions are recognised.
        retrying = backoff.on_exception(
            backoff.expo,
            exceptions,
            max_tries=max_tries,
            on_backoff=log_retry,
            on_giveup=log_giveup,
        )(func)
        # Keep func's metadata on the result regardless of backoff internals.
        return wraps(func)(retrying)
    return decorator

@retry_on_failure()
@execution_time
def kaggle_api_authenticate() -> KaggleApi:
    """Create a ``KaggleApi`` client, authenticate it, and return it.

    The call is retried according to the defaults of ``retry_on_failure``.
    The timing decorator sits inside the retry decorator, so a duration is
    logged only for an attempt that completes without raising.

    Returns:
        The authenticated ``KaggleApi`` instance.
    """
    client = KaggleApi()
    client.authenticate()
    logging.info("Kaggle API authenticated successfully.")
    return client

@execution_time
def download_and_extract_kaggle_dataset(api: KaggleApi) -> None:
    """Download and unzip the configured dataset unless data is already there.

    The download is skipped when the destination directory exists and
    contains at least one entry; otherwise the dataset named by
    ``Config.KAGGLE_DATASET`` is fetched into
    ``Config.DATASET_DESTINATION_PATH`` with ``unzip=True``.

    Args:
        api: Kaggle API client used to perform the download.
    """
    destination = Config.DATASET_DESTINATION_PATH

    # Guard clause: any existing entry in the directory counts as "present".
    if destination.exists() and any(destination.iterdir()):
        logging.info("Dataset already present, skipping download.")
        return

    api.dataset_download_files(Config.KAGGLE_DATASET, path=destination, unzip=True)
    logging.info(f"Dataset extracted to {destination}.")

@execution_time
def find_csv_files() -> list:
    """Return the paths of all ``*.csv`` entries under the dataset directory.

    The search is recursive from ``Config.DATASET_DESTINATION_PATH``. Matches
    are sorted before being returned, because ``rglob`` yields entries in an
    arbitrary, filesystem-dependent order and callers should see the same
    ordering on every run.

    Returns:
        A list of path strings, in sorted path order.
    """
    matches = sorted(Config.DATASET_DESTINATION_PATH.rglob('*.csv'))
    csv_files = [str(path) for path in matches]
    logging.info(f"Found {len(csv_files)} CSV files in the dataset.")
    return csv_files

@execution_time
def main():
    """Run the workflow: authenticate, fetch the dataset, list its CSV files.

    Progress and the final list of CSV paths are reported through logging;
    nothing is returned.
    """
    logging.info("Initiating dataset processing...")
    # Authenticate first; the resulting client is used for the download step.
    download_and_extract_kaggle_dataset(kaggle_api_authenticate())
    discovered = find_csv_files()
    logging.info(f"Processing complete. CSV Files: {discovered}")

# Run the workflow only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Initiating dataset processing...
Kaggle API authenticated successfully.
kaggle_api_authenticate executed in 0.01 seconds.
Dataset extracted to ..\data.
download_and_extract_kaggle_dataset executed in 7.36 seconds.
Found 7 CSV files in the dataset.
find_csv_files executed in 0.00 seconds.
Processing complete. CSV Files: ['..\\data\\interactions_test.csv', '..\\data\\interactions_train.csv', '..\\data\\interactions_validation.csv', '..\\data\\PP_recipes.csv', '..\\data\\PP_users.csv', '..\\data\\RAW_interactions.csv', '..\\data\\RAW_recipes.csv']
main executed in 7.38 seconds.
