Libraries


EDA


In [1]:
import logging
import colorlog
import pandas as pd
from pathlib import Path
from joblib import Parallel, delayed
from time import time
from functools import wraps
import os
from dotenv import load_dotenv

# Load environment variables from a .env file if present
# load_dotenv()

# Configure logging 
# logging.basicConfig(level=Config.LOG_LEVEL, format=Config.LOG_FORMAT, datefmt=Config.LOG_DATE_FORMAT)

# Configuration class for global constants loaded from environment variables
class Config:
    """Global settings, read once from the environment when the class is defined.

    Every environment-backed attribute falls back to a built-in default when
    the corresponding variable is not set.
    """

    # --- values that can be overridden through environment variables ---
    LOG_LEVEL = os.environ.get('LOG_LEVEL', 'INFO').upper()
    DATA_DIRECTORY = Path(os.environ.get('DATA_DIRECTORY', '../data'))
    RETRY_LIMIT = int(os.environ.get('RETRY_LIMIT', '2'))
    N_JOBS = int(os.environ.get('N_JOBS', '-1'))  # passed to joblib as n_jobs
    LOG_FILE = os.environ.get('LOG_FILE', 'app.log')

    # --- fixed formatting constants ---
    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
    LOG_FORMAT_CONSOLE = '%(log_color)s%(message)s%(reset)s'
    LOG_FORMAT_FILE = '%(asctime)s - %(levelname)s - %(message)s'

    # Level name -> colour name for the console formatter.
    # No explicit colour is configured for INFO.
    LOG_COLORS = dict(
        DEBUG='cyan',
        WARNING='green',
        ERROR='purple',
        CRITICAL='red',
    )


# Setup logging with colorlog and file logging
def setup_logging():
    """Configure the root logger with a coloured console handler and a file handler.

    The root logger level is taken from ``Config.LOG_LEVEL``. Console records
    are formatted with ``Config.LOG_FORMAT_CONSOLE`` and ``Config.LOG_COLORS``;
    file records are appended to ``Config.LOG_FILE`` using
    ``Config.LOG_FORMAT_FILE``.

    The function is safe to call repeatedly (for example when a notebook cell
    is re-run): handlers are tagged with a name, and a handler whose name is
    already attached to the root logger is not added again. Without this
    guard each call would add another pair of handlers and every message
    would be emitted multiple times.

    Raises:
        AttributeError: if ``Config.LOG_LEVEL`` is not a level name defined
            by the ``logging`` module.
    """
    console_name = "setup_logging.console"
    file_name = "setup_logging.file"

    logger = colorlog.getLogger()
    logger.setLevel(getattr(logging, Config.LOG_LEVEL))

    # Names of handlers that a previous call already installed.
    installed = {handler.get_name() for handler in logger.handlers}

    # Console handler with colored output
    if console_name not in installed:
        console_handler = colorlog.StreamHandler()
        console_handler.set_name(console_name)
        console_handler.setFormatter(colorlog.ColoredFormatter(
            Config.LOG_FORMAT_CONSOLE,
            datefmt=Config.DATE_FORMAT,
            log_colors=Config.LOG_COLORS))
        logger.addHandler(console_handler)

    # File handler for logging to a file
    if file_name not in installed:
        file_handler = logging.FileHandler(Config.LOG_FILE)
        file_handler.set_name(file_name)
        file_handler.setFormatter(logging.Formatter(
            Config.LOG_FORMAT_FILE,
            datefmt=Config.DATE_FORMAT))
        logger.addHandler(file_handler)


# Decorator for retry mechanism
def retry(func):
    """Decorator that re-invokes *func* when it raises, up to ``Config.RETRY_LIMIT`` times.

    Each failed attempt is logged as a warning. The exception from the final
    attempt is re-raised to the caller. The limit is read on every call, and
    is clamped to at least one attempt: with a limit of 0 or a negative value
    the wrapped function would otherwise never be called and the wrapper
    would silently return ``None``.

    Args:
        func: the callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as *func* that returns
        whatever the first successful attempt returns.

    Raises:
        Exception: whatever *func* raised on the last attempt.
    """
    @wraps(func)
    def wrapper_retry(*args, **kwargs):
        attempts = max(1, Config.RETRY_LIMIT)
        for attempt in range(1, attempts + 1):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                logging.warning(
                    "Attempt %d failed for %s: %s", attempt, func.__name__, e)
                if attempt == attempts:
                    # Out of attempts: surface the last failure unchanged.
                    raise
    return wrapper_retry


# Decorator to measure execution time
def execution_time(func):
    """Decorator that logs how long each call to *func* takes.

    The elapsed time is measured with ``time()`` around the call and written
    at INFO level through the ``logging`` module once the call has returned.
    The wrapped function's return value is passed through unchanged; if the
    call raises, the exception propagates and no timing is logged.
    """
    @wraps(func)
    def wrapper_execution_time(*args, **kwargs):
        started_at = time()
        outcome = func(*args, **kwargs)
        elapsed = time() - started_at
        logging.info(f"{func.__name__} executed in {elapsed:.2f} seconds.")
        return outcome
    return wrapper_execution_time


def process_file(file_path: Path):
    """Load one CSV file and return a one-line quality summary.

    The file is read with pandas, exact duplicate rows are dropped, and the
    remaining rows are scanned for null values. The returned string reports
    the file name, the elapsed wall-clock time, the row count after
    de-duplication, the number of duplicates removed and the total number of
    null cells. The summary is prefixed with ``[Alert]`` when at least one
    null value is present.
    """
    started_at = time()

    raw = pd.read_csv(file_path)
    deduped = raw.drop_duplicates()
    removed = len(raw) - len(deduped)
    # Sum per column first, then across columns, for a single grand total.
    missing = deduped.isna().sum().sum()

    elapsed = time() - started_at

    summary = (
        f"Processed {file_path.name} in {elapsed:.2f} seconds. "
        f"Rows={len(deduped)}, Duplicates removed={removed}, "
        f"Total null values={missing}."
    )
    return f"[Alert] {summary}" if missing > 0 else summary


# Main function to execute the script logic
@execution_time
def main():
    """Summarise every CSV file in ``Config.DATA_DIRECTORY``.

    Sets up logging, runs ``process_file`` on each ``*.csv`` file directly
    inside the data directory using a joblib worker pool sized by
    ``Config.N_JOBS``, and logs one summary line per file. Start and end
    markers are logged at WARNING level.
    """
    setup_logging()
    logging.warning("Processing files...")

    csv_paths = [*Config.DATA_DIRECTORY.glob('*.csv')]
    worker_pool = Parallel(n_jobs=Config.N_JOBS)
    tasks = (delayed(process_file)(csv_path) for csv_path in csv_paths)

    # The pool returns the summaries in the same order as the submitted tasks.
    for summary in worker_pool(tasks):
        logging.info(summary)

    logging.warning("Processing completed.")


if __name__ == "__main__":
    main()


[32mProcessing files...[0m
[37mProcessed interactions_test.csv in 0.01 seconds. Rows=12455, Duplicates removed=0, Total null values=0.[0m
[37mProcessed interactions_train.csv in 0.41 seconds. Rows=698901, Duplicates removed=0, Total null values=0.[0m
[37mProcessed interactions_validation.csv in 0.01 seconds. Rows=7023, Duplicates removed=0, Total null values=0.[0m
[37mProcessed PP_recipes.csv in 1.93 seconds. Rows=178265, Duplicates removed=0, Total null values=0.[0m
[37mProcessed PP_users.csv in 0.15 seconds. Rows=25076, Duplicates removed=0, Total null values=0.[0m
[37m[Alert] Processed RAW_interactions.csv in 3.81 seconds. Rows=1132367, Duplicates removed=0, Total null values=169.[0m
[37m[Alert] Processed RAW_recipes.csv in 3.39 seconds. Rows=231637, Duplicates removed=0, Total null values=4980.[0m
[32mProcessing completed.[0m
[37mmain executed in 4.63 seconds.[0m
