In [15]:
import numpy as np
import pandas as pd

# Cleaned CSVs:
1. customerData
2. productData
3. orderData
4. sellerData
5. leadData

### We will work with the following variables found in orderData:
1. review_comment_title: Comment title from the review left by the customer, in Portuguese.
2. review_comment_message: Comment message from the review left by the customer, in Portuguese.

In [26]:
# Only orderData is used in the cells of this notebook; the loads for the other
# cleaned CSVs are kept commented out so they can be re-enabled if needed.
# customerData = pd.read_csv('customerData.csv')
# productData = pd.read_csv('productData.csv')
orderData = pd.read_csv('orderData.csv')
# sellerData = pd.read_csv('sellerData.csv')
# leadData = pd.read_csv('leadData.csv')

## Overview
### Data Curation
1. change any NaN in review_comment_title to '' (empty string)
2. concatenate review_comment_title with review_comment_message, new column 'review_pt'

### Sentiment Analysis
3. export review_pt as a CSV and translate it with Google Sheets' GOOGLETRANSLATE function (the Google Translate API is too buggy); the translation is merged back as new column "review_en"
4. perform sentiment analysis (textblob), new column "polarity"

In [27]:
# Preview the loaded order table (pandas shows only the first and last rows).
orderData

Unnamed: 0,order_id,order_item_id,seller_id,product_id,customer_id,review_id,order_status,price,payment_type,payment_installments,payment_time,lead_time,delivery_performance,review_score,review_comment_title,review_comment_message,review_time
0,e481f51cbdc54678b7cc49136f2d6af7,1.0,3504c0cb71d7fa48d967e0e4c94d59d9,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,a54f0611adc9ed256b57ede6b6eb5114,delivered,29.99,credit_card,1.0,0.0,8.0,7.0,4,,"Não testei o produto ainda, mas ele veio corre...",1
1,e481f51cbdc54678b7cc49136f2d6af7,1.0,3504c0cb71d7fa48d967e0e4c94d59d9,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,a54f0611adc9ed256b57ede6b6eb5114,delivered,29.99,voucher,1.0,0.0,8.0,7.0,4,,"Não testei o produto ainda, mas ele veio corre...",1
2,e481f51cbdc54678b7cc49136f2d6af7,1.0,3504c0cb71d7fa48d967e0e4c94d59d9,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,a54f0611adc9ed256b57ede6b6eb5114,delivered,29.99,voucher,1.0,0.0,8.0,7.0,4,,"Não testei o produto ainda, mas ele veio corre...",1
3,53cdb2fc8bc7dce0b6741e2150273451,1.0,289cdb325fb7e7f891c38608bf9e0962,595fac2a385ac33a80bd5114aec74eb8,b0830fb4747a6c6d20dea0b8c802d7ef,8d5266042046a06655c8db133d120ba5,delivered,118.70,boleto,1.0,1.0,13.0,5.0,4,Muito boa a loja,Muito bom o produto.,0
4,47770eb9100c2d0c44946d9cf07ec65d,1.0,4869f7a5dfa277a7dca6462dcf3b52b2,aa4383b373c6aca5d8797843e5594415,41ce2a54c0b03bf3443c3d931a367089,e73b67b67587f7644d5bd1a52deb1b01,delivered,159.90,credit_card,3.0,0.0,9.0,17.0,5,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119146,63943bddc261676b46f01ca7ac2f7bd8,1.0,1f9ab4708f3056ede07124aad39a2554,f1d4ce8c6dd66c47bbaa8c6781c2a923,1fca14ff2861355f6e5f14306ff977a7,29bb71b2760d0f876dfa178a76bc4734,delivered,174.90,credit_card,3.0,0.0,22.0,1.0,4,,So uma peça que veio rachado mas tudo bem rs,1
119147,83c1379a015df1e13d02aae0204711ab,1.0,d50d79cb34e38265a8649c383dcffd48,b80910977a37536adeddd63663f916ad,1aa71eb042121263aafbe80c1b562c9c,371579771219f6db2d830d50805977bb,delivered,205.99,credit_card,5.0,0.0,24.0,5.0,5,,Foi entregue antes do prazo.,0
119148,11c177c8e97725db2631073c19f07b62,1.0,a1043bafd471dff536d0c462352beb48,d1c427060a0f73f6b889a5c7c61f2ac4,b331b74b18dc79bcdf6532d51e1637c1,8ab6855b9fe9b812cd03a480a25058a1,delivered,179.99,credit_card,4.0,0.0,17.0,20.0,2,,Foi entregue somente 1. Quero saber do outro p...,1
119149,11c177c8e97725db2631073c19f07b62,2.0,a1043bafd471dff536d0c462352beb48,d1c427060a0f73f6b889a5c7c61f2ac4,b331b74b18dc79bcdf6532d51e1637c1,8ab6855b9fe9b812cd03a480a25058a1,delivered,179.99,credit_card,4.0,0.0,17.0,20.0,2,,Foi entregue somente 1. Quero saber do outro p...,1


In [28]:
# Column dtypes and non-null counts, to see which columns have missing values.
orderData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119151 entries, 0 to 119150
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                119151 non-null  object 
 1   order_item_id           118318 non-null  float64
 2   seller_id               118318 non-null  object 
 3   product_id              118318 non-null  object 
 4   customer_id             119151 non-null  object 
 5   review_id               119151 non-null  object 
 6   order_status            119151 non-null  object 
 7   price                   118318 non-null  float64
 8   payment_type            119148 non-null  object 
 9   payment_installments    119148 non-null  float64
 10  payment_time            118974 non-null  float64
 11  lead_time               115730 non-null  float64
 12  delivery_performance    115730 non-null  float64
 13  review_score            119151 non-null  int64  
 14  review_comment_title

In [29]:
# Change any NaN in review_comment_title to '' (empty string), so that a missing
# title alone does not turn the title + message concatenation into NaN.
# Assign the result back to the column instead of calling
# .replace(..., inplace=True) on a column selection: under pandas Copy-on-Write the
# in-place form only modifies a temporary object and leaves orderData unchanged.
orderData["review_comment_title"] = orderData["review_comment_title"].fillna('')
orderData.head()

Unnamed: 0,order_id,order_item_id,seller_id,product_id,customer_id,review_id,order_status,price,payment_type,payment_installments,payment_time,lead_time,delivery_performance,review_score,review_comment_title,review_comment_message,review_time
0,e481f51cbdc54678b7cc49136f2d6af7,1.0,3504c0cb71d7fa48d967e0e4c94d59d9,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,a54f0611adc9ed256b57ede6b6eb5114,delivered,29.99,credit_card,1.0,0.0,8.0,7.0,4,,"Não testei o produto ainda, mas ele veio corre...",1
1,e481f51cbdc54678b7cc49136f2d6af7,1.0,3504c0cb71d7fa48d967e0e4c94d59d9,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,a54f0611adc9ed256b57ede6b6eb5114,delivered,29.99,voucher,1.0,0.0,8.0,7.0,4,,"Não testei o produto ainda, mas ele veio corre...",1
2,e481f51cbdc54678b7cc49136f2d6af7,1.0,3504c0cb71d7fa48d967e0e4c94d59d9,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,a54f0611adc9ed256b57ede6b6eb5114,delivered,29.99,voucher,1.0,0.0,8.0,7.0,4,,"Não testei o produto ainda, mas ele veio corre...",1
3,53cdb2fc8bc7dce0b6741e2150273451,1.0,289cdb325fb7e7f891c38608bf9e0962,595fac2a385ac33a80bd5114aec74eb8,b0830fb4747a6c6d20dea0b8c802d7ef,8d5266042046a06655c8db133d120ba5,delivered,118.7,boleto,1.0,1.0,13.0,5.0,4,Muito boa a loja,Muito bom o produto.,0
4,47770eb9100c2d0c44946d9cf07ec65d,1.0,4869f7a5dfa277a7dca6462dcf3b52b2,aa4383b373c6aca5d8797843e5594415,41ce2a54c0b03bf3443c3d931a367089,e73b67b67587f7644d5bd1a52deb1b01,delivered,159.9,credit_card,3.0,0.0,9.0,17.0,5,,,4


In [30]:
# Build 'review_pt' as "<title> <message>".
# No na_rep is passed, so a row whose message is NaN gets NaN in review_pt, and a row
# whose title is '' ends up with a leading space in front of the message.
orderData["review_pt"] = orderData["review_comment_title"].str.cat(
    others=orderData["review_comment_message"],
    sep=' ',
)
orderData.loc[:, ["review_comment_title", "review_comment_message", "review_pt"]].head()

Unnamed: 0,review_comment_title,review_comment_message,review_pt
0,,"Não testei o produto ainda, mas ele veio corre...","Não testei o produto ainda, mas ele veio corr..."
1,,"Não testei o produto ainda, mas ele veio corre...","Não testei o produto ainda, mas ele veio corr..."
2,,"Não testei o produto ainda, mas ele veio corre...","Não testei o produto ainda, mas ele veio corr..."
3,Muito boa a loja,Muito bom o produto.,Muito boa a loja Muito bom o produto.
4,,,


### We understand that translating the reviews before doing sentiment analysis will be less accurate than analysing them directly in the native language. However, we still chose this method because:
1. resources for non-English NLP are very limited
2. these libraries lack proper documentation 
3. these libraries may also be less accurate due to limited userbase

In [None]:
# Write every fully populated (review_id, review_pt) pair to CSV so the text can be
# translated with Google Sheets' function (Google Translate API is too buggy).
(
    orderData
    .dropna(subset=["review_id", "review_pt"])[["review_id", "review_pt"]]
    .to_csv('reviewData.csv', index=False)
)

## In Google Sheets (not Excel):
> =GOOGLETRANSLATE(cell, "pt", "en")

This translates the string in `cell` from Portuguese to English. Note that Google Sheets requires double quotes around the language codes.

In [45]:
# Load the reviews after translation in Google Sheets. The file is expected to hold
# review_id, review_pt and the added review_en column.
reviewData = pd.read_csv("reviewDataTranslated.csv")
reviewData

Unnamed: 0,review_id,review_pt,review_en
0,a54f0611adc9ed256b57ede6b6eb5114,"Não testei o produto ainda, mas ele veio corr...","I have not tested the product yet, but he cam..."
1,a54f0611adc9ed256b57ede6b6eb5114,"Não testei o produto ainda, mas ele veio corr...","I have not tested the product yet, but he cam..."
2,a54f0611adc9ed256b57ede6b6eb5114,"Não testei o produto ainda, mas ele veio corr...","I have not tested the product yet, but he cam..."
3,8d5266042046a06655c8db133d120ba5,Muito boa a loja Muito bom o produto.,Very good the store very good product.
4,359d03e676b3c069f62cadba8dd3f6e8,O produto foi exatamente o que eu esperava e e...,The product was exactly what I expected and wa...
...,...,...,...
51247,371579771219f6db2d830d50805977bb,Foi entregue antes do prazo.,It was delivered before the deadline.
51248,8ab6855b9fe9b812cd03a480a25058a1,Foi entregue somente 1. Quero saber do outro p...,It was delivered only 1. I know of another pro...
51249,8ab6855b9fe9b812cd03a480a25058a1,Foi entregue somente 1. Quero saber do outro p...,It was delivered only 1. I know of another pro...
51250,,,#VALUE!


In [52]:
# Keep exactly one translation per review_id.
# - dropna discards rows with no review_id (e.g. the spreadsheet's trailing '#VALUE!' row),
#   which could never match an order anyway.
# - Deduplicating on the key, rather than on whole rows, guarantees the left merge below
#   cannot multiply order rows if two rows for one review differ in their text.
reviewData = reviewData.dropna(subset=["review_id"]).drop_duplicates(subset="review_id")

# Merge with orderData using review_id; orders without a translated review get NaN in
# review_en. Any review_en left over from a previous run of this cell is dropped first so
# re-running does not create review_en_x / review_en_y. validate raises MergeError if
# review_id is not unique in reviewData.
orderData = orderData.drop(columns=["review_en"], errors="ignore").merge(
    reviewData.drop(columns="review_pt"),
    on="review_id",
    how="left",
    validate="many_to_one",
)
orderData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119151 entries, 0 to 119150
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                119151 non-null  object 
 1   order_item_id           118318 non-null  float64
 2   seller_id               118318 non-null  object 
 3   product_id              118318 non-null  object 
 4   customer_id             119151 non-null  object 
 5   review_id               119151 non-null  object 
 6   order_status            119151 non-null  object 
 7   price                   118318 non-null  float64
 8   payment_type            119148 non-null  object 
 9   payment_installments    119148 non-null  float64
 10  payment_time            118974 non-null  float64
 11  lead_time               115730 non-null  float64
 12  delivery_performance    115730 non-null  float64
 13  review_score            119151 non-null  int64  
 14  review_comment_title

In [54]:
# The Portuguese text columns are no longer needed once review_en is attached:
# remove review_comment_title, review_comment_message and review_pt.
orderData = orderData.drop(
    ["review_comment_title", "review_comment_message", "review_pt"],
    axis="columns",
)
orderData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119151 entries, 0 to 119150
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              119151 non-null  object 
 1   order_item_id         118318 non-null  float64
 2   seller_id             118318 non-null  object 
 3   product_id            118318 non-null  object 
 4   customer_id           119151 non-null  object 
 5   review_id             119151 non-null  object 
 6   order_status          119151 non-null  object 
 7   price                 118318 non-null  float64
 8   payment_type          119148 non-null  object 
 9   payment_installments  119148 non-null  float64
 10  payment_time          118974 non-null  float64
 11  lead_time             115730 non-null  float64
 12  delivery_performance  115730 non-null  float64
 13  review_score          119151 non-null  int64  
 14  review_time           119151 non-null  int64  
 15  

In [62]:
# perform sentiment analysis (textblob), new column "polarity"
from textblob import TextBlob

' I have not tested the product yet, but he came right and in good condition. Just box it came well dented and damaged, which will be boring, because it is a gift.'

In [63]:
# Example: polarity of a single translated review.
# .sentiment is a named tuple (polarity = ..., subjectivity = ...); polarity is its first field.
TextBlob(orderData["review_en"].iloc[2]).sentiment.polarity

-0.004761904761904782

In [65]:
# Plain Python list of the translated reviews, in orderData row order.
review_list = orderData["review_en"].tolist()

In [66]:
# Score every translated review with TextBlob and collect the polarities in a list
# (one entry per element of review_list, same order) to attach to the DataFrame later.
polarity_list = []
for i, review in enumerate(review_list):
    if isinstance(review, str):
        polarity_list.append(TextBlob(review).sentiment.polarity)
    else:
        # Orders without a translated review carry NaN (a float) here; they keep the
        # fallback score of 0. Checking the type explicitly, instead of using a bare
        # `except:`, means genuine TextBlob errors and Ctrl-C are no longer silently
        # recorded as a neutral 0.
        polarity_list.append(0)
    # Lightweight progress indicator for the long-running loop.
    if i % 10000 == 0:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000


In [72]:
# Sanity check: there should be one polarity score per orderData row.
len(polarity_list)

119151

In [None]:
# Attach the scores as column "polarity".
# assign() places the list positionally (polarity_list is in orderData row order) and
# raises ValueError if the lengths differ, whereas joining an anonymous DataFrame aligns
# on index labels and would silently NaN-pad on any mismatch. Re-running this cell simply
# overwrites the column instead of creating a duplicate "polarity" column.
orderData = orderData.assign(polarity=polarity_list)

In [None]:
# Note: polarity == 0.0 is ambiguous: it is the value given to rows whose review_en is missing (NaN), and it is also the score of a genuinely neutral review

In [113]:
# Save the orders together with the translated review and its polarity score
# (the row index is not written).
orderData.to_csv('orderDataSA.csv', index = False)