### Imports

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Constants

In [5]:
# Locations of the Olist CSV files, relative to the notebook's working directory.

# Product, seller, customer, geolocation and category-translation files.
PATH_FILE_PROD_CATEGORY_TRANSLATE = './file/product_category_name_translation.csv'
PATH_FILE_GEOLOCATION = './file/olist_geolocation_dataset.csv'
PATH_FILE_CUSTOMER = './file/olist_customers_dataset.csv'
PATH_FILE_SELLER = './file/olist_sellers_dataset.csv'
PATH_FILE_PROD = './file/olist_products_dataset.csv'

# Order files: orders, order items, order reviews and order payments.
PATH_FILE_ORDER_PAYMENT = './file/olist_order_payments_dataset.csv'
PATH_FILE_ORDER_REVIEW = './file/olist_order_reviews_dataset.csv'
PATH_FILE_ORDER_ITEM = './file/olist_order_items_dataset.csv'
PATH_FILE_ORDER = './file/olist_orders_dataset.csv'

### Order reviews: empty-message statistics by score

In [132]:

# Original columns of the order-reviews CSV.
# Identifiers
COL_REV_ID = 'review_id'
COL_REV_ORDER = 'order_id'
# Review content
COL_REV_SCORE = 'review_score'
COL_REV_TITLE = 'review_comment_title'
COL_REV_MSG = 'review_comment_message'
# Dates / timestamps
COL_REV_CREATION = 'review_creation_date'
COL_REV_ANSWER = 'review_answer_timestamp'

# Custom (derived) columns added by this notebook.
COL_REV_CUS_MSG_LEN = 'review_custom_msg_length'


# Dataset size note: about 100k rows and 7 original columns
# (the run recorded with this cell printed "Reviews: 100000").

# Sort keys and display columns referenced by the disabled inspection
# snippets at the end of this cell.
orderBy = [COL_REV_SCORE, COL_REV_CUS_MSG_LEN, COL_REV_CREATION, COL_REV_MSG, COL_REV_TITLE]
columns = [COL_REV_SCORE, COL_REV_CUS_MSG_LEN, COL_REV_MSG, COL_REV_TITLE]


def _percent(part, whole):
    """Return ``part / whole`` as a percentage rounded to 2 decimals.

    Returns 0.0 when ``whole`` is 0 so an empty input does not raise
    ZeroDivisionError.
    """
    return round(part / whole * 100, 2) if whole else 0.0


reviewsDF = pd.read_csv(PATH_FILE_ORDER_REVIEW)

# Replace missing comment text with '' so that "no comment" has length 0.
reviewsDF = reviewsDF.fillna({COL_REV_MSG: '', COL_REV_TITLE: ''})

# Character length of each review message (vectorized string accessor).
reviewsDF[COL_REV_CUS_MSG_LEN] = reviewsDF[COL_REV_MSG].str.len()

# Overall statistics.
reviewsCount = len(reviewsDF)
noMsgReviewsCount = int((reviewsDF[COL_REV_CUS_MSG_LEN] == 0).sum())
noMsgReviewsRatio = _percent(noMsgReviewsCount, reviewsCount)

print(f'Reviews: {reviewsCount}')
print(f'Empty message reviews: {noMsgReviewsCount} / {reviewsCount} ({noMsgReviewsRatio}%)')

# Per-score breakdown. NaN scores are dropped so they cannot create an
# empty bucket (NaN never compares equal to itself).
scoreValues = sorted(reviewsDF[COL_REV_SCORE].dropna().unique())
for score in scoreValues:

    scoredDF = reviewsDF.loc[reviewsDF[COL_REV_SCORE] == score]

    scoredCount = len(scoredDF)
    scoredRatio = _percent(scoredCount, reviewsCount)
    # The mask is built from scoredDF itself; the previous version used a
    # mask computed on the full reviewsDF and relied on index alignment.
    noMsgScoredCount = int((scoredDF[COL_REV_CUS_MSG_LEN] == 0).sum())
    noMsgScoredRatio = _percent(noMsgScoredCount, scoredCount)

    print(f'{score} Score: {scoredCount} / {reviewsCount} ({scoredRatio}%)')
    print(f'\tNo message: {noMsgScoredCount} / {scoredCount} ({noMsgScoredRatio}%)')

# Disabled ad-hoc inspection snippets (kept for reference, not executed):
# reviewsDF = reviewsDF.sort_values(by=orderBy, ascending=False)
# reviewsDF[columns].head()
# print(reviewsDF.groupby(by=COL_REV_SCORE).mean())
# plt.figure(figsize=(14, 10), dpi=80)
# plt.plot(reviewsDF[COL_REV_SCORE].values, reviewsDF[COL_REV_CUS_MSG_LEN].values, '-.')

Reviews: 100000
Empty message reviews: 58247 / 100000 (58.25%)
1 Score: 11858 / 100000 (11.86%)
	No message: 2679 / 11858 (22.59%)
2 Score: 3235 / 100000 (3.23%)
	No message: 1006 / 3235 (31.1%)
3 Score: 8287 / 100000 (8.29%)
	No message: 4622 / 8287 (55.77%)
4 Score: 19200 / 100000 (19.2%)
	No message: 13166 / 19200 (68.57%)
5 Score: 57420 / 100000 (57.42%)
	No message: 36774 / 57420 (64.04%)
