In [1]:
import numpy as np
import requests
import pandas as pd
from datetime import datetime
import os

In [None]:
# One dict per product, collected across every category, and turned into a
# DataFrame in a later cell.
ingested_data = []

# Probe category ids 1..29 one request at a time.
for category_id in range(1, 30):
    response = requests.get(f"https://tienda.mercadona.es/api/categories/{category_id}/?lang=ca")

    # Non-200 answers are reported and skipped; the loop moves on to the next id.
    if response.status_code != 200:
        print("Error:", response.status_code)
        continue

    data = response.json()
    category = data['name']

    for subcategory in data['categories']:
        subcategory_name = subcategory['name']

        # The product loop has to be nested inside the subcategory loop:
        # as a sibling loop it only ever saw the last subcategory of each
        # category and silently dropped the products of all the others.
        for product in subcategory['products']:
            price_info = product['price_instructions']

            ingested_data.append({
                'category': category,
                'subcategory': subcategory_name,
                'name': product['display_name'],
                'iva': price_info['iva'],
                'bulk_price': price_info['bulk_price'],
                'unit_price': price_info['unit_price'],
                'units': price_info['total_units'],
                'format': price_info['reference_format'],
                'image': product['thumbnail'],
            })

In [None]:
# Target dtype of every ingested column: prices and unit counts are numeric,
# everything else is kept as text.
dtype_dict = dict(
    category=str,
    subcategory=str,
    name=str,
    iva=str,
    bulk_price=float,
    unit_price=float,
    units=float,
    format=str,
    image=str,
)

# Build the frame from the collected records and cast it in a single step.
df = pd.DataFrame(ingested_data).astype(dtype_dict)

In [None]:
# Check the dtype each column ended up with after the cast to dtype_dict.
df.dtypes

In [None]:
# Display the ingested products (one row per product record).
df

#### Exploració bàsica de les dades generades

In [None]:
# Longest product name: some names are very long, so they will be shortened.
max(df['name'], key=len)

In [None]:
# The categories that have been ingested:
df.category.unique()

In [None]:
# Number of distinct categories that have been ingested.
df.category.unique().size

In [None]:
# The possible sale formats (they will be reduced to just kg and ud).

# kg -> a random weight will be generated to work out the price.
# Everything else is treated as a unit price, and random unit counts will be generated to build the receipt.

df.format.unique()

#### Tractament de dades

In [None]:
# Fem un nom curt per no ocupar gran part del tiquet:
def short_name(cell_value, n=20):
    """Shorten a product name to at most ``n`` characters, cutting on spaces.

    Names made of a single word, or already no longer than ``n`` characters,
    are returned untouched. Otherwise the first ``n`` characters are split on
    spaces, the last fragment is discarded, and words of two characters or
    fewer are removed from what is left.

    Args:
        cell_value: The full product name (a string).
        n: Maximum number of characters taken from the start of the name.

    Returns:
        The shortened name. When the word-based shortening would leave nothing
        (for example because the first word alone is longer than ``n``), the
        first ``n`` characters of the name are returned instead of an empty
        string.
    """
    if len(cell_value.split()) <= 1 or len(cell_value) <= n:
        return cell_value

    # The last fragment is dropped because the cut at n may have split a word.
    kept_words = cell_value[:n].split(' ')[:-1]
    shortened = ' '.join(word for word in kept_words if len(word) > 2)

    # Never hand back an empty name: fall back to a plain character cut.
    return shortened or cell_value[:n].strip()

# Replace every product name with its shortened form, then preview the result.
shortened_names = df['name'].apply(short_name)
df['name'] = shortened_names
df.head()

In [None]:
# Longest product name remaining after the names were shortened.
max(df['name'], key=len)

In [None]:
# Calcular preus totals:

def calculate_price(row):
    """Work out the price and the sale format of one product row.

    A product keeps the ``'kg'`` format, priced at its ``bulk_price``, only
    when its format is ``'kg'``, its category is ``'Fruita i verdura'`` or
    ``'Verdura'``, and its name has at most two words. Every other product is
    reported with the ``'ud'`` format: priced at ``unit_price`` multiplied by
    ``units`` when the original format is ``'ud'`` and ``units`` is not
    missing, and at plain ``unit_price`` otherwise.

    Args:
        row: A mapping or Series with the keys ``format``, ``category``,
            ``name``, ``bulk_price``, ``unit_price`` and ``units``.

    Returns:
        A ``pd.Series`` with the entries ``price`` and ``format``.
    """
    sold_by_weight = (
        row['format'] == 'kg'
        and row['category'] in ['Fruita i verdura', 'Verdura']
        and len(row['name'].split()) <= 2
    )
    if sold_by_weight:
        return pd.Series({'price': row['bulk_price'], 'format': 'kg'})

    price = row['unit_price']
    # Only rows originally sold by unit with a known unit count are multiplied.
    if row['format'] == 'ud' and not pd.isna(row['units']):
        price = price * row['units']

    return pd.Series({'price': price, 'format': 'ud'})

# Run calculate_price on every row and store its two outputs as the
# 'price' and 'format' columns.
priced = df.apply(calculate_price, axis=1)
df[['price', 'format']] = priced

df.head()

In [None]:
# Formats left after the price calculation (calculate_price only emits 'kg' or 'ud').
df.format.unique()

In [None]:
# Check how many records (rows) this day's ingestion produced.
# DataFrame.size would return rows x columns, so the row count is taken with len().
len(df)

In [None]:
# Any product priced above 100 gets a random price drawn from [80, 100) instead.
# A boolean mask plus one vectorised draw replaces the row-by-row apply; one
# random value is drawn per affected row, in row order.
too_expensive = df['price'] > 100
df.loc[too_expensive, 'price'] = np.random.uniform(80, 100, size=int(too_expensive.sum()))

# Make sure no price above 100 is left.
len(df.loc[df['price']>100])

#### Afegim la data del dia d'ingesta

In [None]:
# Stamp every row with today's date, formatted as dd/mm/YYYY.
ingestion_day = datetime.today()
df['date'] = ingestion_day.strftime('%d/%m/%Y')

#### Exportem el dataframe

In [None]:
# Preview the first 20 rows before exporting.
df.head(20)

In [None]:
# File where the data is stored; each run adds its rows to what is already there.
ruta_df = './productes_mercadona.csv'

if os.path.exists(ruta_df):
    # The file already exists: read what has been stored so far.
    df_actual = pd.read_csv(ruta_df)

    # Re-running the notebook on a date that is already stored used to append
    # the same snapshot again. Rows stored for the dates being written now are
    # dropped first, so each date is kept only once (the newest ingestion wins).
    if 'date' in df_actual.columns and 'date' in df.columns:
        already_stored = df_actual['date'].isin(df['date'].unique())
        df_actual = df_actual[~already_stored]

    # Append the new rows after the stored ones.
    df_nou = pd.concat([df_actual, df], ignore_index=True)

else:
    # No file yet: the current frame is written as is.
    df_nou = df

# Write the combined frame back to disk.
df_nou.to_csv(ruta_df, index=False)

In [None]:
# Number of product rows per category in this ingestion.
df['category'].value_counts()