# Development environment setup

## Libraries

In [122]:
from pathlib import Path
from transformers import pipeline

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from pprint import pprint

## Paths

In [96]:
# Project root: the parent of the kernel's current working directory
project_folder = Path.cwd().parent

## Plotly preferences

In [97]:
# Figures created after this cell use the "plotly_white" theme by default
default_plotly_template = "plotly_white"
pio.templates.default = default_plotly_template


## NLP Model

In [98]:
# Sentiment classifier for Brazilian Portuguese texts, loaded by model id
sentiment_model_name = "pysentimiento/bertweet-pt-sentiment"
sentiment_classifier = pipeline("sentiment-analysis", model=sentiment_model_name)

In [99]:
# Usage example: run the classifier once and keep its response.
# top_k=None asks the pipeline for the score of every label, not only the best one.
model_response = sentiment_classifier("Eu gosto de você", top_k=None)

# Reshape the response (an iterable of {"label", "score"} dicts) into a {label: score} dict
{entry["label"]: entry["score"] for entry in model_response}

{'POS': 0.8772935271263123,
 'NEU': 0.11531976610422134,
 'NEG': 0.007386720739305019}

# Datasets

## Import raw data

### Customers

In [100]:
# Customers table, read from the raw data folder
customers_csv = project_folder / "data" / "raw" / "olist_customers_dataset.csv"
customers = pd.read_csv(customers_csv)
customers.head(2)


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


### Geolocation

In [101]:
# Geolocation table, read from the raw data folder
geolocation_csv = project_folder / "data" / "raw" / "olist_geolocation_dataset.csv"
geolocation = pd.read_csv(geolocation_csv)
geolocation.head(2)


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP


### Orders

In [102]:
# Orders table, read from the raw data folder
orders_csv = project_folder / "data" / "raw" / "olist_orders_dataset.csv"
orders = pd.read_csv(orders_csv)

orders.head(2)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00


### Order Items

In [103]:
# Order items table, read from the raw data folder
order_items_csv = project_folder / "data" / "raw" / "olist_order_items_dataset.csv"
order_items = pd.read_csv(order_items_csv)
order_items.head(2)


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93


### Order Payments

In [104]:
# Order payments table, read from the raw data folder
order_payments_csv = project_folder / "data" / "raw" / "olist_order_payments_dataset.csv"
order_payments = pd.read_csv(order_payments_csv)

order_payments.head(2)


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39


### Order Reviews

In [105]:
# Order reviews table, read from the raw data folder
order_reviews_csv = project_folder / "data" / "raw" / "olist_order_reviews_dataset.csv"
order_reviews = pd.read_csv(order_reviews_csv)

order_reviews.head(2)


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13


### Products

In [106]:
# Products table, read from the raw data folder
products_csv = project_folder / "data" / "raw" / "olist_products_dataset.csv"
products = pd.read_csv(products_csv)

products.head(2)


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0


### Sellers

In [107]:
# Sellers table, read from the raw data folder
sellers_csv = project_folder / "data" / "raw" / "olist_sellers_dataset.csv"
sellers = pd.read_csv(sellers_csv)

sellers.head(2)


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP


### Product Category Name Translation

In [108]:
# Portuguese-to-English product category names, read from the raw data folder
category_translation_csv = project_folder / "data" / "raw" / "product_category_name_translation.csv"
product_category_name_translation = pd.read_csv(category_translation_csv)

product_category_name_translation.head(2)


Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories


# Exploratory Data Analysis (EDA)

## Number of orders and payment values by state and order status

In [109]:
# Creates an aggregated dataframe with the number of distinct orders and the
# total payment value for each (customer state, order status) pair,
# sorted from the largest payment total to the smallest

# Left joins keep every order, even those with no customer or payment match
orders_with_payments = orders.merge(customers, how="left", on="customer_id").merge(
    order_payments, how="left", on="order_id"
)

value_orders_by_state_status = (
    orders_with_payments[["customer_state", "order_id", "order_status", "payment_value"]]
    .groupby(by=["customer_state", "order_status"])
    .agg(
        n_orders=("order_id", "nunique"),
        payment_value=("payment_value", "sum"),
    )
    .reset_index()
    .sort_values(by="payment_value", ascending=False)
)

value_orders_by_state_status


Unnamed: 0,customer_state,order_status,n_orders,payment_value
139,SP,delivered,40501,5770266.19
101,RJ,delivered,12350,2055690.45
51,MG,delivered,11354,1819277.61
120,RS,delivered,5345,861802.40
94,PR,delivered,4923,781919.55
...,...,...,...,...
7,AL,unavailable,1,69.38
131,SE,canceled,1,61.79
49,MG,approved,1,61.62
133,SE,invoiced,1,43.05


In [110]:
# Wide table of payment value: one row per state, one column per order status
payment_value_wide = value_orders_by_state_status.pivot_table(
    values="payment_value",
    index="customer_state",
    columns="order_status",
)

# State/status pairs that never occur become 0; states end up in descending order
pivot_value_by_state_status = (
    payment_value_wide.fillna(0)
    .reset_index()
    .rename_axis(None, axis=1)
    .sort_values(by="customer_state", ascending=False)
)

pivot_value_by_state_status.head(2)


Unnamed: 0,customer_state,approved,canceled,created,delivered,invoiced,processing,shipped,unavailable
26,TO,0.0,250.57,0.0,60007.37,0.0,96.98,999.5,130.91
25,SP,179.46,55829.85,160.94,5770266.19,30663.24,31436.74,46229.42,63461.12


In [111]:
# Wide table of order counts: one row per state, one column per order status
order_count_wide = value_orders_by_state_status.pivot_table(
    values="n_orders",
    index="customer_state",
    columns="order_status",
)

# State/status pairs that never occur become 0; states end up in descending order
pivot_orders_by_state_status = (
    order_count_wide.fillna(0)
    .reset_index()
    .rename_axis(None, axis=1)
    .sort_values(by="customer_state", ascending=False)
)

pivot_orders_by_state_status.head(2)


Unnamed: 0,customer_state,approved,canceled,created,delivered,invoiced,processing,shipped,unavailable
26,TO,0.0,1.0,0.0,274.0,0.0,1.0,3.0,1.0
25,SP,1.0,327.0,1.0,40501.0,160.0,136.0,328.0,292.0


In [112]:
# Bar colour of each order status; several statuses share the same colour
purple = "#6e35ff"
red = "#ff4d65"
green = "#00c456"

map_status_color = dict(
    approved=purple,
    canceled=red,
    created=purple,
    delivered=green,
    invoiced=purple,
    processing=purple,
    shipped=purple,
    unavailable=red,
)

pprint(map_status_color)

{'approved': '#6e35ff',
 'canceled': '#ff4d65',
 'created': '#6e35ff',
 'delivered': '#00c456',
 'invoiced': '#6e35ff',
 'processing': '#6e35ff',
 'shipped': '#6e35ff',
 'unavailable': '#ff4d65'}


In [113]:
# Order in which the statuses are iterated when the chart traces are built
statuses_up_to_delivery = ["created", "processing", "approved", "invoiced", "shipped", "delivered"]
statuses_not_fulfilled = ["unavailable", "canceled"]
status_order = [*statuses_up_to_delivery, *statuses_not_fulfilled]

pprint(status_order)

['created',
 'processing',
 'approved',
 'invoiced',
 'shipped',
 'delivered',
 'unavailable',
 'canceled']


In [114]:
# Creates a plotly figure with two side-by-side stacked horizontal bar charts
# per state: total payment value (left) and number of orders (right),
# with one bar segment per order status
fig = make_subplots(
    rows=1,
    cols=2,
    shared_yaxes=True,
    subplot_titles=("Total Payment Value", "Number of Orders"),
    specs=[[{"type": "bar"}, {"type": "bar"}]],
)

# (subplot column, source frame, extra trace options); the right panel hides
# its legend entries so each status is listed only once
panels = (
    (1, pivot_value_by_state_status, {}),
    (2, pivot_orders_by_state_status, {"showlegend": False}),
)

# Traces are added status by status, following status_order
for status in status_order:
    for subplot_col, frame, extra_options in panels:
        fig.add_trace(
            go.Bar(
                y=frame["customer_state"],
                x=frame[status],
                orientation="h",
                name=status.title(),
                marker={"color": map_status_color[status]},
                **extra_options,
            ),
            row=1,
            col=subplot_col,
        )

fig.update_layout(
    title_text="Number of Total Payment Value and Orders by State and Order Status",
    height=600,
    width=1200,
    barmode="stack",
    xaxis={"title": "Value (BRL)"},
    xaxis2={"title": "# Orders"},
)

fig.show()


## Comments sentiment classification

In [124]:
# Builds review_text from the review title and message (missing parts count as empty)
review_title = order_reviews["review_comment_title"].fillna("")
review_message = order_reviews["review_comment_message"].fillna("")

order_reviews["review_text"] = (
    (review_title + " " + review_message)
    .str.lower()  # lower case
    .str.replace(r"[^\w\s]", " ", regex=True)  # special characters -> space
    .str.replace(r"\d+", "", regex=True)  # drop numbers
    .str.replace(r"\n", " ", regex=True)  # new lines -> space
    .str.replace(r"\s+", " ", regex=True)  # collapse repeated whitespace
    .str.strip()  # trim both ends
    .replace("", np.nan)  # texts left empty become NaN
)

order_reviews.head(5)


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,review_text
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,recebi bem antes do prazo estipulado
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,parabéns lojas lannister adorei comprar pela i...


In [126]:
# Counts the reviews whose review_text is NaN (True) versus present (False)
review_text_is_missing = order_reviews["review_text"].isna()
review_text_is_missing.value_counts()

review_text
True     56770
False    42454
Name: count, dtype: int64

More than half of the reviews (56,770 of 99,224, about 57%) have no comment text, so we'll have to check later whether the missing comments are concentrated in a few states or spread across the country.

In [130]:
def _label_scores(text):
    """Return a {label: score} dict with every label the classifier reports for ``text``."""
    return {
        prediction["label"]: prediction["score"]
        for prediction in sentiment_classifier(text, top_k=None)
    }


# Uses the sentiment classifier to score the review_text column; top_k=None
# requests the score of every label. Rows with NaN text are skipped and stay NaN
order_reviews["sentiment"] = order_reviews["review_text"].map(_label_scores, na_action="ignore")

order_reviews.head(5)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,review_text,sentiment
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59,,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13,,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24,,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,recebi bem antes do prazo estipulado,"{'NEU': 0.8218241930007935, 'POS': 0.122218288..."
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,parabéns lojas lannister adorei comprar pela i...,"{'POS': 0.9917405843734741, 'NEU': 0.006466466..."


In [140]:
# Separates the score dict into one numeric column per sentiment label;
# rows whose sentiment is not a dict (no review text) get NaN
score_column_labels = {
    "positive_score": "POS",
    "neutral_score": "NEU",
    "negative_score": "NEG",
}
for score_column, model_label in score_column_labels.items():
    order_reviews[score_column] = order_reviews["sentiment"].map(
        # model_label is bound as a default so each lambda keeps its own label
        lambda scores, model_label=model_label: scores[model_label] if isinstance(scores, dict) else np.nan
    )

# Readable names for the labels returned by the model
sentiment_labels = {
    "POS": "Positive",
    "NEU": "Neutral",
    "NEG": "Negative"
}


def _dominant_sentiment(scores):
    """Return the readable name of the highest-scoring label, or NaN when ``scores`` is not a dict."""
    if not isinstance(scores, dict):
        return np.nan
    return sentiment_labels.get(max(scores, key=scores.get))


# Creates a new column with the sentiment label
order_reviews['sentiment_label'] = order_reviews['sentiment'].map(_dominant_sentiment)

order_reviews.head(5)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,review_text,sentiment,positive_score,neutral_score,negative_score,sentiment_label
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59,,,,,,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13,,,,,,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24,,,,,,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,recebi bem antes do prazo estipulado,"{'NEU': 0.8218241930007935, 'POS': 0.122218288...",0.122218,0.821824,0.055957,Neutral
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,parabéns lojas lannister adorei comprar pela i...,"{'POS': 0.9917405843734741, 'NEU': 0.006466466...",0.991741,0.006466,0.001793,Positive


In [143]:
# Positive reviews examples: the five texts with the highest positive score
positive_examples = order_reviews[["positive_score", "review_text"]]
positive_examples.sort_values(by="positive_score", ascending=False).head(5)

Unnamed: 0,positive_score,review_text
5235,0.994251,o produto é uma graça além de pratico muito li...
8367,0.99411,otímo adorei a cauterização da trivitt quero p...
65108,0.994079,ótimo muito bom e lindo o produto
70627,0.994033,amei o produto agilidade na entrega chegou mui...
64788,0.994011,muito bom a lojas lannister produtos muito bonito


In [144]:
# Neutral reviews examples: the five texts with the highest neutral score
neutral_examples = order_reviews[["neutral_score", "review_text"]]
neutral_examples.sort_values(by="neutral_score", ascending=False).head(5)

Unnamed: 0,neutral_score,review_text
49780,0.975681,somente o toner amarelo chegou até o momento
57174,0.974293,dentro do anunciado
54419,0.969607,estampa diferente do anunciado
89361,0.969598,ainda vou avaliar se realmente é original da p...
94277,0.96711,luminária o produto é um plástico duro vamos v...


In [145]:
# Negative reviews examples: the five texts with the highest negative score
negative_examples = order_reviews[["negative_score", "review_text"]]
negative_examples.sort_values(by="negative_score", ascending=False).head(5)

Unnamed: 0,negative_score,review_text
17795,0.993699,relógio horrível péssimo sem qualidade materia...
10432,0.99367,não gostei tecido ruim péssimo
65079,0.993641,a manta é uma porcaria não vale nada é pior qu...
44775,0.993618,não gostei do acabamento produto de péssima qu...
55536,0.993502,produto de péssima qualidade a estampa do skat...
