# Teste Data Science Elo7 - Parte 2 - Sistema de Classificação de Produtos


## 2.1 Bibliotecas para importar

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt;
import matplotlib.gridspec as gridspec
%matplotlib inline

## 2.2 Preparação dos dados

Nesta seção vamos apenas repetir todo o procedimento de carregar, limpar e preparar os dados, que já foi explicado no notebook Jupyter referente à análise exploratória. Como tudo já está detalhado em [01_analise_exploratoria.ipynb](01_analise_exploratoria.ipynb), esta etapa será feita sem explicações e comentários entre os comandos.

In [2]:
from unidecode import unidecode

def clean_text(text):
    """Normalize a value into lowercase text with single-space separators.

    The value is converted with ``str``, passed through ``unidecode``
    (ASCII transliteration), lowercased, and every character of the fixed
    punctuation set below is replaced by a space. Whitespace runs are then
    collapsed to one space and leading/trailing whitespace is removed.

    Args:
        text: Any object; it is stringified before processing.

    Returns:
        The cleaned string.
    """
    punctuation = ',.:;-_/\\+=()[]<>^~?!#*%'
    # One translation table maps each punctuation character to a space.
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    normalized = unidecode(str(text)).lower().translate(to_space)
    return ' '.join(normalized.split())

def clean_and_separate_tags(text):
    """Clean a tag string and split it into a list of non-numeric tokens.

    The input is normalized with ``clean_text`` and split on whitespace;
    tokens made up only of digits are discarded.

    Args:
        text: Raw tag text (any object accepted by ``clean_text``).

    Returns:
        List of cleaned tokens, in their original order.
    """
    tokens = clean_text(text).split()
    return list(filter(lambda token: not token.isdigit(), tokens))

# Load the data; keep the raw frame untouched and work on a copy.
raw_data = pd.read_csv("elo7_recruitment_dataset.csv")
df = raw_data.copy()

# Handle missing values: a missing order count becomes 0 and missing tags
# become an empty string; rows with any other missing value are dropped.
# The filled columns are assigned back explicitly because calling
# fillna(inplace=True) on df[col] operates on an intermediate Series, which
# is deprecated and does not update df under pandas copy-on-write.
df['order_counts'] = df['order_counts'].fillna(0)
df['concatenated_tags'] = df['concatenated_tags'].fillna('')
df.dropna(inplace=True)

# Derived variables.
df['creation_date'] = pd.to_datetime(df['creation_date'])
# NOTE(review): the reference date is the moment this cell runs, so
# number_of_days (and everything computed from it) changes between runs.
date_today = pd.to_datetime('today')
df['number_of_days'] = (date_today - df['creation_date']).dt.days

# Character length of each text field after cleaning with clean_text.
for source_col, length_col in (('concatenated_tags', 'len_tags'),
                               ('title', 'len_title'),
                               ('query', 'len_query')):
    df[length_col] = df[source_col].map(clean_text).str.len()

# The +1 in the denominator presumably guards against division by zero,
# since weight can be 0 in this dataset.
df['price_per_weight'] = df['price'] / (df['weight'] + 1)

# Fraction of views that turned into orders.
df['order_probability'] = df['order_counts'] / df['view_counts']
# Keep only rows whose probability is <= 1. The comparison is False for NaN
# and +inf, so rows with zero views are removed as well. The explicit
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger SettingWithCopyWarning.
df = df.loc[df['order_probability'] <= 1].copy()

# Average views and orders per day since the product was created.
df['views_per_day'] = df['view_counts'] / df['number_of_days']
df['orders_per_day'] = df['order_counts'] / df['number_of_days']

# Summary statistics of the prepared data (identifier columns excluded);
# as the last expression it is the cell's displayed output.
df.drop(columns=['product_id', 'seller_id']).describe()

Unnamed: 0,search_page,position,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,number_of_days,len_tags,len_title,len_query,price_per_weight,order_probability,views_per_day,orders_per_day
count,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0,38422.0
mean,1.491802,16.892145,84.109725,361.075191,0.780647,14.609,546.285461,12.838998,1368.039691,45.392562,31.280126,19.922466,14.366134,0.03183,0.44768,0.010674
std,0.980284,11.589229,211.951291,1812.845573,0.413814,43.813837,1417.890272,43.2887,507.022883,47.113327,11.716516,9.339333,81.253071,0.062687,1.303772,0.036305
min,1.0,0.0,0.07,0.0,0.0,0.0,1.0,0.0,482.0,0.0,3.0,1.0,3.4e-05,0.0,0.000343,0.0
25%,1.0,6.0,12.76,6.0,1.0,3.0,119.0,0.0,1036.0,18.0,23.0,13.0,0.228834,0.0,0.089217,0.0
50%,1.0,16.0,28.52,9.0,1.0,7.0,243.0,0.0,1255.0,32.0,30.0,19.0,1.327889,0.0,0.187789,0.0
75%,2.0,27.0,90.0,107.0,1.0,16.0,521.0,14.0,1556.0,57.0,38.0,25.0,7.272601,0.042665,0.418725,0.010965
max,5.0,38.0,11509.38,65009.0,1.0,3000.0,45010.0,2460.0,4558.0,2669.0,60.0,89.0,5563.86,1.0,53.20331,2.318567


In [3]:
# Columns treated as qualitative (identifiers, dates, text, categories, flags).
qualitative_columns = [
    'seller_id',
    'product_id',
    'creation_date',
    'query',
    'title',
    'concatenated_tags',
    'category',
    'express_delivery',
]

# Quantitative columns already present in the raw dataset.
quantitative_columns_original = raw_data.drop(columns=qualitative_columns).columns.tolist()
# Quantitative columns added during preparation: whatever remains in df
# after removing the qualitative and the original quantitative columns.
quantitative_columns_new = df.drop(
    columns=qualitative_columns + quantitative_columns_original
).columns.tolist()
quantitative_columns = quantitative_columns_original + quantitative_columns_new

# Print each group as a header followed by an indented bullet list.
column_groups = (
    ('Variáveis qualitativas:', qualitative_columns),
    ('\nVariáveis quantitativas do dataset original:', quantitative_columns_original),
    ('\nVariáveis quantitativas criadas:', quantitative_columns_new),
)
for header, names in column_groups:
    print(header)
    for name in names:
        print(f'    - {name}')

Variáveis qualitativas:
    - seller_id
    - product_id
    - creation_date
    - query
    - title
    - concatenated_tags
    - category
    - express_delivery

Variáveis quantitativas do dataset original:
    - search_page
    - position
    - price
    - weight
    - minimum_quantity
    - view_counts
    - order_counts

Variáveis quantitativas criadas:
    - number_of_days
    - len_tags
    - len_title
    - len_query
    - price_per_weight
    - order_probability
    - views_per_day
    - orders_per_day
