In [1]:
import pandas as pd
import numpy as np
import timeit
import re
from typing import List
import utils

In [2]:
# Sample DataFrame with many columns
num_columns = 10000  # Adjust to simulate larger or smaller DataFrames
data = {f'Column {i}#@!': range(10) for i in range(num_columns)}
df = pd.DataFrame(data)

# Approach 1: Standardize by passing the entire DataFrame
def standardize_column_names_df(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = df.columns.str.lower().str.replace('[^a-z0-9]', '', regex=True)
    return df

# Approach 2: Standardize by passing only the column names
def standardize_column_names_list(columns: List[str]) -> List[str]:
    standardized_columns = pd.Series(columns).str.lower().str.replace('[^a-z0-9]', '', regex=True).tolist()
    return standardized_columns

# Measure time for the first approach
time_df = timeit.timeit(lambda: standardize_column_names_df(df.copy()), number=10)

# Measure time for the second approach, reassigning only the columns
time_list = timeit.timeit(lambda: df.copy().set_axis(standardize_column_names_list(df.columns.tolist()), axis=1), number=10)

# Output the results
print(f"Time for entire DataFrame approach: {time_df:.5f} seconds")
print(f"Time for column list approach: {time_list:.5f} seconds")

Time for entire DataFrame approach: 0.06737 seconds
Time for column list approach: 0.07065 seconds


In [3]:
df = pd.read_excel('/Users/iairlinker/Documents/repos/sandbox-for-experimentation/modelos/clasificacion/data/raw/facturacion_directa_essilor_train.xlsx')  # Replace 'file1.csv' with the actual filename
df.columns = utils.standardize_column_names(df.columns.values)
# Function for vectorized approach with np.vectorize
def clean_text(text: str) -> str:
    return re.sub(r'[^a-z0-9]', '', text.lower())

def vectorized_numpy_approach():
    clean_text_vectorized = np.vectorize(clean_text)
    df['categoriaproducto'] = clean_text_vectorized(df['categoriaproducto'].values)

# Function for pandas str method approach
def pandas_str_approach():
    df['categoriaproducto'] = df['categoriaproducto'].apply(
        lambda x: [re.sub(r'[^a-z0-9]', '', item.lower()) for item in x]
    )

# Time the two approaches
numpy_time = timeit.timeit(vectorized_numpy_approach, number=10)
pandas_time = timeit.timeit(pandas_str_approach, number=10)

print(f"NumPy vectorized approach time: {numpy_time:.5f} seconds")
print(f"Pandas str method approach time: {pandas_time:.5f} seconds")
print(f"Ratio Pandas vs Numpy  approach time: {pandas_time/numpy_time:.5f} seconds")

NumPy vectorized approach time: 0.06622 seconds
Pandas str method approach time: 0.35901 seconds
Ratio Pandas vs Numpy  approach time: 5.42171 seconds


In [4]:
def has_beveling(
    product_list: List[str],
    beveling_keywords: List[str] = ['biselado', 'bisel', 'bordes']
) -> dict[str, List[int]]:
    """
    Identifies the positions of beveling-related keywords in the product list.

    Parameters:
    - product_list: list of product descriptions
    - beveling_keywords: list of keywords indicating beveling
    
    Returns:
    - Dictionary with keywords as keys and lists of positions as values, or an empty dictionary if no keyword is found
    """
    return {
        keyword: [idx for idx, item in enumerate(product_list) if keyword in item.lower()]
        for keyword in beveling_keywords
        if any(keyword in item.lower() for item in product_list)
    }
def identify_lens_type(product_list, lens_keywords=['progresivo', 'monofocal', 'bifocal', 'ocupacional']):
    """
    Identifies the positions of lens type keywords in the product list.

    Parameters:
    - product_list: list of product descriptions
    - lens_keywords: list of keywords indicating lens types
    
    Returns:
    - Dictionary with keywords as keys and lists of positions as values, or an empty dictionary if no keyword is found
    """
    return {
        keyword: [idx for idx, item in enumerate(product_list) if keyword in item.lower()]
        for keyword in lens_keywords
        if any(keyword in item.lower() for item in product_list)
    }
def has_treatment(product_list, treatment_keywords=['tratamiento', 'revestimiento', 'resistente', 'uv', 'protección']):
    """
    Identifies the positions of treatment-related keywords in the product list.

    Parameters:
    - product_list: list of product descriptions
    - treatment_keywords: list of keywords indicating treatments
    
    Returns:
    - Dictionary with keywords as keys and lists of positions as values, or an empty dictionary if no keyword is found
    """
    return {
        keyword: [idx for idx, item in enumerate(product_list) if keyword in item.lower()]
        for keyword in treatment_keywords
        if any(keyword in item.lower() for item in product_list)
    }
# Sample product list for testing
product_list = [
    'progresivo', 'monofocal', 'biselado', 'tratamiento', 'resistente', 'ocupacional',
    'bifocal', 'uv', 'revestimiento', 'bordes', 'tratamiento', 'progresivo'
] * 100000  # Scale up for performance testing

# Define test functions with specific keyword sets
lens_keywords = ['progresivo', 'monofocal', 'bifocal', 'ocupacional']
treatment_keywords = ['tratamiento', 'revestimiento', 'resistente', 'uv', 'protección']
beveling_keywords = ['biselado', 'bisel', 'bordes']

# Measure time for `identify_lens_type`
time_lens_type = timeit.timeit(
    lambda: identify_lens_type(product_list, lens_keywords), number=10
)

# Measure time for `has_treatment`
time_has_treatment = timeit.timeit(
    lambda: has_treatment(product_list, treatment_keywords), number=10
)

# Measure time for `has_beveling`
time_has_beveling = timeit.timeit(
    lambda: has_beveling(product_list, beveling_keywords), number=10
)

# Measure time for combined `identify_keywords_and_positions`
time_combined = timeit.timeit(
    lambda: utils.identify_keywords_and_positions(product_list, lens_keywords + treatment_keywords + beveling_keywords),
    number=10
)

# Output the results
print(f"Time for identify_lens_type: {time_lens_type:.5f} seconds")
print(f"Time for has_treatment: {time_has_treatment:.5f} seconds")
print(f"Time for has_beveling: {time_has_beveling:.5f} seconds")
print(f"Time for identify_keywords_and_positions (combined): {time_combined:.5f} seconds")

Time for identify_lens_type: 2.32158 seconds
Time for has_treatment: 2.76243 seconds
Time for has_beveling: 1.70784 seconds
Time for identify_keywords_and_positions (combined): 6.72285 seconds


In [5]:
# Define the two arrays for comparison
arr1 = np.array(['CRIZAL SAPPHIRE HR', 'VARILUX LIBERTY 3.0 SHORT 1.5'])
arr2 = np.array(['VARILUX LIBERTY 3.0 SHORT 1.5', 'CRIZAL SAPPHIRE HR'])

# Method 1: Using np.array_equal with sorting
def compare_with_sort(arr1, arr2):
    return np.array_equal(np.sort(arr1), np.sort(arr2))

# Method 2: Using set comparison
def compare_with_set(arr1, arr2):
    return set(arr1) == set(arr2)

# Measure execution time for both methods using timeit
print("Comparing performance...")

# Define the number of iterations
iterations = 100000

# Time the sorting method
sort_time = timeit.timeit(lambda: compare_with_sort(arr1, arr2), number=iterations)
print(f"Sorting method time: {sort_time:.6f} seconds")

# Time the set comparison method
set_time = timeit.timeit(lambda: compare_with_set(arr1, arr2), number=iterations)
print(f"Set comparison method time: {set_time:.6f} seconds")

# Display the faster method
if sort_time < set_time:
    print("Sorting method is faster.")
else:
    print("Set comparison method is faster.")


Comparing performance...
Sorting method time: 0.241273 seconds
Set comparison method time: 0.148574 seconds
Set comparison method is faster.


In [None]:
df = pd.read_excel('/Users/iairlinker/Documents/repos/sandbox-for-experimentation/modelos/clasificacion/data/raw/facturacion_directa_essilor_train.xlsx')  # Replace 'file1.csv' with the actual filename
df.columns = utils.standardize_column_names(df.columns.values)
# Pandas approach
def pandas_groupby():
    grouped_df = df.groupby('nmerodealbarndecargoodealbarndeabono').agg({
        'categoriaproducto': list,
        'descripcindelproducto': list
    })
    return grouped_df

# NumPy approach
def numpy_grouping():
    # Step 0 : Create the variables
    descripcindelproducto = df.descripcindelproducto.values
    categoriaproducto = df.categoriaproducto.values
    nmerodealbarndecargoodealbarndeabono = df.nmerodealbarndecargoodealbarndeabono.values
    # Step 1: Get unique values and inverse indices for grouping
    unique_nmeros, inverse_indices = np.unique(nmerodealbarndecargoodealbarndeabono, return_inverse=True)
    # Step 2: Group `categoriaproducto` and `descripcindelproducto` by `inverse_indices`
    categoriaproducto_groups = [categoriaproducto[inverse_indices == i] for i in range(len(unique_nmeros))]
    descripcindelproducto_groups = [descripcindelproducto[inverse_indices == i] for i in range(len(unique_nmeros))]

    #return unique_nmeros, categoriaproducto_groups, descripcindelproducto_groups
    return  pd.DataFrame({
        'nmerodealbarndecargoodealbarndeabono': unique_nmeros,
        'categoriaproducto': categoriaproducto_groups,
        'descripcindelproducto': descripcindelproducto_groups
    })

# Measure performance
pandas_time = timeit.timeit(pandas_groupby, number=10)
numpy_time = timeit.timeit(numpy_grouping, number=10)

# Print results
print(f"Pandas approach time: {pandas_time:.5f} seconds")
print(f"NumPy approach time: {numpy_time:.5f} seconds")
print(f"Ratio Pandas vs Numpy  approach time: {pandas_time/numpy_time:.5f} seconds")