# Obtener la información de la tienda

Autor: Luis Camilo Jimenez, CEO


## Librerías

In [65]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn import model_selection, naive_bayes, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

## Listado de productos

In [38]:
term = 'lacteos'
url_search = 'https://busqueda.tiendasjumbo.co/busca?q={0}'.format(term)
search = requests.get(url_search)
soup_search = BeautifulSoup(search.text, "html5lib")
results = soup_search.find_all('h2',class_="nm-product-name")
print(results[:2])

[<h2 class="nm-product-name" itemprop="name">
            <a alt="//www.tiendasjumbo.co/yogurt-griego-deja-mu-natural-x-1000g/p?idsku=22776" href="//www.tiendasjumbo.co/yogurt-griego-deja-mu-natural-x-1000g/p?idsku=22776" title="Yogurt Griego Deja Mu Natural X 1000g">Yogurt Griego Deja Mu Natural X 1000g</a>
            <div class="nm-product-brand">

                DEJA-MU
            </div>
        </h2>, <h2 class="nm-product-name" itemprop="name">
            <a alt="//www.tiendasjumbo.co/yogo-yogo-surt-alpina-1000cc-x3un/p?idsku=119918" href="//www.tiendasjumbo.co/yogo-yogo-surt-alpina-1000cc-x3un/p?idsku=119918" title="Yogo-Yogo surtido Alpina x3 unidades  x1000g c-u...">Yogo-Yogo surtido Alpina x3 unidades  x1000g c-u...</a>
            <div class="nm-product-brand">

                YOGO YOGO
            </div>
        </h2>]


## Información del producto

Extracción de información: Categorias, imagén, nombre, descripción, precio

In [39]:
products = []
for index, result in enumerate(results):
    product = {}
    product["url"] = 'https:'+ result.a.get('href')
    # Obtiene la web page
    search_product = requests.get(product["url"])
    # Soup
    soup_product = BeautifulSoup(search_product.text, "html5lib")
    # Categorías 1 y 2
    lis = soup_product.find('div',class_="bread-crumb").find_all('li')
    product["category_1"] = lis[3].span.string
    product["category_2"] = lis[2].span.string
    # Imagen
    product["url_image"]=soup_product.find('div',id="image").a.get('href')
    # Nombre
    product["name"]=soup_product.find('div',class_="fn").string
    # Descripción
    product["description"]=soup_product.find('div',class_="productDescription").string
    # Precio
    product["price"]=soup_product.find('div',class_="plugin-preco").find('strong',class_="skuBestPrice").string
    # Incluir
    products.append(product)
    
print(json.dumps(products[:2], indent=4, sort_keys=True))

[
    {
        "category_1": "Yogurt",
        "category_2": "L\u00e1cteos, huevos y refrigerados",
        "description": "Yogurt Griego natural, sin\u00a0az\u00facar y muy cremoso especialmente rico en prote\u00ednas que tu cuerpo necesita.",
        "name": "Yogurt Griego Deja Mu Natural X 1000g",
        "price": "$21.590,00",
        "url": "https://www.tiendasjumbo.co/yogurt-griego-deja-mu-natural-x-1000g/p?idsku=22776",
        "url_image": "https://jumbocolombiafood.vteximg.com.br/arquivos/ids/3320714-1000-1000/7707365170623.jpg?v=636669101250600000"
    },
    {
        "category_1": "Yogurt",
        "category_2": "L\u00e1cteos, huevos y refrigerados",
        "description": "Disfruta del delicioso Yogo-Yogo surtido Alpina x3 unidades  x1000g c-u no te lo puedes perder",
        "name": "Yogo-Yogo surtido Alpina x3 unidades  x1000g c-u - Yogo-Yogo surtido Alpina x3 unidades x1000g c-u",
        "price": "$12.990,00",
        "url": "https://www.tiendasjumbo.co/yogo-yogo-surt

# Analizar los productos

## Extraer campos a analizar

In [40]:
df = pd.DataFrame.from_dict(products)
col = ['category_1', 'name']
df = df[col]
print(df.head())

  category_1                                               name
0     Yogurt              Yogurt Griego Deja Mu Natural X 1000g
1     Yogurt  Yogo-Yogo surtido Alpina x3 unidades  x1000g c...
2     Yogurt      Yogurt griego Deja-Mu vegano arándanos x 160g
3     Yogurt                    Yogurt Griego Alpina Coco x150g
4     Yogurt               Yogurt Griego Alpina Arandanos x150g


## Ajustar nombre y verficar datos

In [41]:
df = df[pd.notnull(df['name'])]
df.columns = ['category', 'name']

## Codificar categoría como entero

In [43]:
df['category_id'] = df['category'].factorize()[0]
category_id_df = df[['category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)
print(category_id_df)

                   category  category_id
0                    Yogurt            0
6  Carnes frías y embutidos            1
7                     Kumis            2


## Se generan los tokens

In [44]:
df['name'] = [entry.lower() for entry in df['name']]
print(df.head())

  category                                               name  category_id
0   Yogurt              yogurt griego deja mu natural x 1000g            0
1   Yogurt  yogo-yogo surtido alpina x3 unidades  x1000g c...            0
2   Yogurt      yogurt griego deja-mu vegano arándanos x 160g            0
3   Yogurt                    yogurt griego alpina coco x150g            0
4   Yogurt               yogurt griego alpina arandanos x150g            0


In [48]:
df['name']= [word_tokenize(entry) for entry in df['name']]
print(df.head())

  category                                               name  category_id
0   Yogurt      [yogurt, griego, deja, mu, natural, x, 1000g]            0
1   Yogurt  [yogo-yogo, surtido, alpina, x3, unidades, x10...            0
2   Yogurt  [yogurt, griego, deja-mu, vegano, arándanos, x...            0
3   Yogurt              [yogurt, griego, alpina, coco, x150g]            0
4   Yogurt         [yogurt, griego, alpina, arandanos, x150g]            0


In [52]:
for index,entry in enumerate(df['name']):
    final_words = []
    stemmer = SnowballStemmer('spanish')
    for word in entry:
        if word not in stopwords.words('spanish') and word.isalpha():
            word_final = stemmer.stem(word)
            final_words.append(word_final)
            df.loc[index,'name'] = str(final_words)
print(df.head())

  category                                               name  category_id
0   Yogurt   ['yogurt', 'grieg', 'dej', 'mu', 'natural', 'x']            0
1   Yogurt  ['surt', 'alpin', 'unidad', 'surt', 'alpin', '...            0
2   Yogurt       ['yogurt', 'grieg', 'vegan', 'arandan', 'x']            0
3   Yogurt                ['yogurt', 'grieg', 'alpin', 'coc']            0
4   Yogurt            ['yogurt', 'grieg', 'alpin', 'arandan']            0


# Modelo

## Se dividen los valores en los de entrenamiento y testeo

In [55]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['name'],df['category'],test_size=0.3)

In [58]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
print(Train_Y)

[1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1]


In [62]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['name'])
print(Tfidf_vect.vocabulary_)

{'yogurt': 50, 'grieg': 19, 'dej': 14, 'mu': 28, 'natural': 29, 'surt': 39, 'alpin': 0, 'unidad': 44, 'vegan': 46, 'arandan': 1, 'coc': 11, 'bonyurt': 8, 'cereal': 9, 'chokogozz': 10, 'salchich': 34, 'montefri': 26, 'superperr': 38, 'mixt': 25, 'zucarit': 53, 'pack': 30, 'bloqu': 6, 'jamon': 21, 'taj': 40, 'tip': 42, 'sandwich': 36, 'finess': 16, 'probiot': 33, 'babygu': 3, 'vainill': 45, 'banan': 4, 'gr': 18, 'und': 43, 'bols': 7, 'fres': 17, 'yagurt': 47, 'melocoton': 24, 'zorb': 52, 'dulc': 15, 'slight': 37, 'colant': 13, 'yog': 48, 'premi': 32, 'mor': 27, 'san': 35, 'martin': 23, 'colagen': 12, 'hidroliz': 20, 'pasc': 31, 'beb': 5, 'lacte': 22, 'yox': 51, 'yogur': 49, 'azuc': 2, 'tapiok': 41}


In [63]:
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
print(Train_X_Tfidf)

  (0, 52)	0.5792342107056825
  (0, 50)	0.2482102214839373
  (0, 29)	0.46535858369659316
  (0, 19)	0.3514829566875038
  (0, 15)	0.512621239159256
  (1, 50)	0.25476675144496436
  (1, 37)	0.5945348153070504
  (1, 24)	0.4776511374777649
  (1, 13)	0.5945348153070504
  (2, 50)	0.33559001038270325
  (2, 43)	0.693084136275112
  (2, 19)	0.4752188220892263
  (2, 0)	0.4256533755384153
  (3, 44)	0.6936034183548345
  (3, 39)	0.6138377831119065
  (3, 0)	0.3769847133117582
  (4, 50)	0.2849013785540113
  (4, 43)	0.5883984021261945
  (4, 18)	0.6648582969270387
  (4, 0)	0.3613612733548554
  (5, 50)	0.25476675144496436
  (5, 33)	0.5945348153070504
  (5, 29)	0.4776511374777649
  (5, 16)	0.5945348153070504
  (6, 24)	0.8282654070499371
  :	:
  (9, 30)	0.5994129051629247
  (9, 8)	0.5304793476171201
  (10, 10)	0.5699294910470167
  (10, 9)	0.5699294910470167
  (10, 8)	0.5043865789245925
  (10, 0)	0.30976592690372273
  (11, 50)	0.2396481592765475
  (11, 29)	0.4493059444517952
  (11, 28)	0.5592534084846453
  (11

## Modelo SVM — Support Vector Machine

In [66]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  75.0
