In [38]:
import requests as r
import pandas as pd
import numpy as np

### Se construirá un dataset de muestra de los sellers de todas las categorías disponibles en Mercado Libre Argentina
Para ello se tomaron aplicará la siguiente metodología:
1. El listado de las categorías, con su correspondiente id. Se trabajará con 10 categorias elegidas aleatoreamente
2. Se buscarán los primeros 200 items de dichas categorias y se extraerán los datos de los sellers que se consideran pertinentes para el análisis, 
    esto nos permitirá obtener una muestra de sellers de distintos rubros y categorias, se buscó bajar la influencia de las publicaciones más relevantes
    y al quitar duplicados de sellers, se tendrá un listado de candidatos para **identificar a aquellos que tienen un buen perfil y son relevantes para el negocio**
3. Dado que en los datos de los sellers dentro de cada item, tiene datos referidos:
    - provincia y localidad
    - Cancelaciones, Reclamos y Demoras de los últimos 60 días (cantidad y porcentaje del total de operaciones)
    - El total de transacciones históricas: canceladas y completas
    - Rating de los usuarios: negativo, neutral y positivos
    Estas variables se consideran valiosas y necesarias para el análisis
4. Es relevante para el negovio contar con información acerca de la facturación de los sellers: Por lo tanto, de los sellers obtenidos en el paso anterior 
    se buscarán todos los items del seller, para obtener el listado de productos, sus precios y cantidades vendidas,
    con esto se espera obtener una metrica de facturacion y una metrica aproximada de la ganancia del negocio por cada seller. Luego se agruparán por seller los resultados 
    para continuar trabajando con dicha unidad de análisis. 
5. Se unirán los datos por USER_ID y está será la muestra de datos con el que se continuará el análisis

Estás decisiones se basan en la hipotesis de que los sellers con mejores metricas, ratings, transacciones y rentabilidad serán los relevantes y tengan un buen perfil para el negocio
Respecto de la construccion del dataset, se indagó en la documentación de la API y en sus resultados y se optó por esta combinacion de consultas para llegar al dataset final


## CREACION DEL DF CATEGORIAS
### Contiene los datos de todas las categorias disponibles en Mercado Libre Argentina

In [60]:
url = "https://api.mercadolibre.com/sites/MLA/categories"
request = r.get(url)
results = request.json()
categorias = pd.DataFrame(results)
categorias = categorias.loc[(categorias.id != 'MLA1743') & (categorias.id != 'MLA2547') & (categorias.id != 'MLA1459') & (categorias.id != 'MLA1540') & (categorias.id != 'MLA1953')]
categorias = categorias.sample(10).reset_index(drop=True)
categorias

Unnamed: 0,id,name
0,MLA9304,"Souvenirs, Cotillón y Fiestas"
1,MLA1648,Computación
2,MLA1182,Instrumentos Musicales
3,MLA3025,"Libros, Revistas y Comics"
4,MLA1071,Animales y Mascotas
5,MLA5725,Accesorios para Vehículos
6,MLA409431,Salud y Equipamiento Médico
7,MLA1144,Consolas y Videojuegos
8,MLA1246,Belleza y Cuidado Personal
9,MLA1367,Antigüedades y Colecciones


In [61]:
# se extraen las categorias para mantener la muestra en futuras extracciones y como checkpoint
categorias.to_csv('../data/muestra_categorias.csv')

In [62]:
categorias = pd.read_csv('../data/muestra_categorias.csv')

In [63]:
offset_list = np.arange(0, 1000, 50).tolist()

In [64]:
df_items = pd.DataFrame()
idx = 0

for idx in range(len(categorias)):

    categoria = categorias.id[idx]

    for offset in offset_list:     

        url = f'https://api.mercadolibre.com/sites/MLA/search?category={categoria}&offset={offset}'

        req = r.get(url)

        items = req.json()['results']

        for item in range(len(items)):

            aux = pd.DataFrame({
                "categoria_MLA" : categoria,
                "seller" : items[item]['seller']['id'],
                "fecha_registro" : items[item]['seller']['registration_date'],
                "power_seller_status" :items[item]['seller']['seller_reputation']['power_seller_status'],
                "level_id" :items[item]['seller']['seller_reputation']['level_id'],
                "metrics_cancellations_value" :items[item]['seller']['seller_reputation']['metrics']['cancellations']['value'],
                "metrics_cancellations_rate" :items[item]['seller']['seller_reputation']['metrics']['cancellations']['rate'],
                "metrics_claims_value" :items[item]['seller']['seller_reputation']['metrics']['claims']['value'],
                "metrics_claims_rate" :items[item]['seller']['seller_reputation']['metrics']['claims']['rate'],
                "metrics_delayed_handling_time_value" :items[item]['seller']['seller_reputation']['metrics']['delayed_handling_time']['value'],
                "metrics_delayed_handling_time_rate" :items[item]['seller']['seller_reputation']['metrics']['delayed_handling_time']['rate'],
                "metrics_sales_value" :items[item]['seller']['seller_reputation']['metrics']['sales']['completed'],
                "transactions_historic_total": items[item]['seller']['seller_reputation']['transactions']['total'],
                "transactions_historic_canceled": items[item]['seller']['seller_reputation']['transactions']['canceled'],
                "transactions_historic_completed": items[item]['seller']['seller_reputation']['transactions']['completed'],
                "rating_negative": items[item]['seller']['seller_reputation']['transactions']['ratings']['negative'],
                "rating_neutral": items[item]['seller']['seller_reputation']['transactions']['ratings']['neutral'],
                "rating_positive": items[item]['seller']['seller_reputation']['transactions']['ratings']['positive'],

            }, index = {idx})
            df_items = df_items.append(aux)
            idx += 1

In [65]:
df_items.value_counts('categoria_MLA')

categoria_MLA
MLA1246      1000
MLA1367      1000
MLA409431    1000
MLA9304      1000
MLA1071       999
MLA1182       999
MLA1144       998
MLA1648       998
MLA5725       966
MLA3025       949
dtype: int64

In [66]:
df_items.sample(10)

Unnamed: 0,categoria_MLA,seller,fecha_registro,power_seller_status,level_id,metrics_cancellations_value,metrics_cancellations_rate,metrics_claims_value,metrics_claims_rate,metrics_delayed_handling_time_value,metrics_delayed_handling_time_rate,metrics_sales_value,transactions_historic_total,transactions_historic_canceled,transactions_historic_completed,rating_negative,rating_neutral,rating_positive
105,MLA1648,224694745,2016-08-09T17:51:24.000-04:00,platinum,5_green,12,0.0044,19,0.0071,15,0.0064,2574,14560,505,14055,0.01,0.01,0.98
213,MLA1182,24209295,2003-06-26T00:00:00.000-04:00,platinum,5_green,1,0.0,4,0.0079,4,0.0083,485,2298,118,2180,0.01,0.03,0.96
470,MLA1246,466012382,2019-08-29T13:02:31.000-04:00,platinum,5_green,2,0.0,15,0.0025,27,0.0047,5641,43965,1627,42338,0.02,0.01,0.97
651,MLA1182,70127535,2011-09-22T16:14:33.000-04:00,platinum,5_green,1,0.0,4,0.0017,0,0.0,2162,12455,682,11773,0.02,0.01,0.97
897,MLA5725,57439812,2006-07-27T02:23:00.000-04:00,platinum,5_green,1,0.0,1,0.0,47,0.0424,1098,3730,146,3584,0.01,0.01,0.98
859,MLA1182,429777659,2019-04-23T21:58:39.000-04:00,silver,5_green,1,0.0,2,0.0,9,0.1011,100,612,32,580,0.01,0.02,0.97
695,MLA409431,665557743,2020-10-29T17:47:02.000-04:00,platinum,5_green,1,0.0,8,0.0087,34,0.0401,885,3531,82,3449,0.01,0.01,0.98
566,MLA5725,161296925,2014-06-23T14:09:09.000-04:00,platinum,5_green,0,0.0,2,0.0,1,0.002,489,2575,147,2428,0.0,0.01,0.99
966,MLA1246,85302508,2009-09-10T18:35:46.000-04:00,gold,5_green,0,0.0,1,0.0,9,0.0267,334,1830,81,1749,0.0,0.01,0.99
43,MLA3025,128577788,2012-11-09T12:04:14.000-04:00,platinum,5_green,4,0.0,38,0.0005,144,0.0021,66488,335129,9862,325267,0.01,0.0,0.99


In [67]:
df_items.to_csv('../data/muestraSellers_dataMLA_checkpoint1.csv')