### Imports

In [110]:
import math
import datetime

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

### Constants

In [None]:
# All CSV inputs are read from the same folder, relative to the notebook.
_DATA_DIR = './file'

# Order-centric tables
PATH_FILE_ORDER = f'{_DATA_DIR}/olist_orders_dataset.csv'
PATH_FILE_ORDER_ITEM = f'{_DATA_DIR}/olist_order_items_dataset.csv'
PATH_FILE_ORDER_REVIEW = f'{_DATA_DIR}/olist_order_reviews_dataset.csv'
PATH_FILE_ORDER_PAYMENT = f'{_DATA_DIR}/olist_order_payments_dataset.csv'

# Reference / dimension tables
PATH_FILE_PROD = f'{_DATA_DIR}/olist_products_dataset.csv'
PATH_FILE_SELLER = f'{_DATA_DIR}/olist_sellers_dataset.csv'
PATH_FILE_CUSTOMER = f'{_DATA_DIR}/olist_customers_dataset.csv'
PATH_FILE_GEOLOCATION = f'{_DATA_DIR}/olist_geolocation_dataset.csv'
PATH_FILE_PROD_CATEGORY_TRANSLATE = f'{_DATA_DIR}/product_category_name_translation.csv'

# Reviews Data

### Constants

In [None]:

# Column names as they appear in the reviews file: identifiers first ...
COL_REV_ID = 'review_id'
COL_REV_ORDER = 'order_id'

# ... then the review content ...
COL_REV_SCORE = 'review_score'
COL_REV_TITLE = 'review_comment_title'
COL_REV_MSG = 'review_comment_message'

# ... and finally the timestamps.
COL_REV_CREATION = 'review_creation_date'
COL_REV_ANSWER = 'review_answer_timestamp'

# Column added by this notebook (length of the comment message).
COL_REV_CUS_MSG_LEN = 'Message Length'

### Build Dataframe

In [None]:
# Load the raw reviews table.
reviewsDF = pd.read_csv(PATH_FILE_ORDER_REVIEW)

# Reviews that carry a comment message. The filter is evaluated BEFORE the
# missing messages are blanked out below. `.copy()` makes this an independent
# frame, so the column assignments further down do not write into a slice of
# `reviewsDF` (which is what raises pandas' SettingWithCopyWarning).
reviewsDFClean = reviewsDF[reviewsDF[COL_REV_MSG].notna()].copy()

# Replace missing text with empty strings so every message has a length.
reviewsDF[[COL_REV_MSG, COL_REV_TITLE]] = reviewsDF[[COL_REV_MSG, COL_REV_TITLE]].fillna('')
reviewsDFClean[COL_REV_TITLE] = reviewsDFClean[COL_REV_TITLE].fillna('')

# Length of each message, in characters (0 for reviews without a message).
reviewsDF[COL_REV_CUS_MSG_LEN] = reviewsDF[COL_REV_MSG].str.len()
reviewsDFClean[COL_REV_CUS_MSG_LEN] = reviewsDFClean[COL_REV_MSG].str.len()

# Order both frames by score, then message length, creation date, message
# and title, all descending.
sort_review = [COL_REV_SCORE, COL_REV_CUS_MSG_LEN, COL_REV_CREATION, COL_REV_MSG, COL_REV_TITLE]
reviewsDF = reviewsDF.sort_values(by=sort_review, ascending=False)
reviewsDFClean = reviewsDFClean.sort_values(by=sort_review, ascending=False)


### Statistics

In [None]:
# Overall figures: how many reviews exist and how many have no message.
reviewsCount = reviewsDF.shape[0]
noMsgReviewsCount = int((reviewsDF[COL_REV_CUS_MSG_LEN] == 0).sum())
# Guard the ratio so an empty reviews table prints 0.0% instead of raising.
noMsgReviewsRatio = round(noMsgReviewsCount / reviewsCount * 100, 2) if reviewsCount else 0.0

print(f'Reviews: {reviewsCount}')
print(f'Empty message reviews: {noMsgReviewsCount} / {reviewsCount} ({noMsgReviewsRatio}%)')

# Same breakdown for every score present in the data. Missing scores are
# dropped: a NaN never equals itself, so it would select zero rows and make
# the per-score ratio divide by zero.
scoreValues = sorted(reviewsDF[COL_REV_SCORE].dropna().unique())
for score in scoreValues:

    scoredDF = reviewsDF.loc[reviewsDF[COL_REV_SCORE] == score]

    scoredCount = scoredDF.shape[0]
    scoredRatio = round(scoredCount / reviewsCount * 100, 2)
    # Build the mask from `scoredDF` itself. The previous version used a mask
    # taken from `reviewsDF`, which only worked through index alignment.
    noMsgScoredCount = scoredDF.loc[scoredDF[COL_REV_CUS_MSG_LEN] == 0].shape[0]
    noMsgScoredRation = round(noMsgScoredCount / scoredCount * 100, 2)

    print(f'{score} Score: {scoredCount} / {reviewsCount} ({scoredRatio}%)')
    print(f'\tNo message: {noMsgScoredCount} / {scoredCount} ({noMsgScoredRation}%)')

### Plots

In [None]:

# Build grid: a 2x4 layout. The three left-hand columns hold one small panel
# each per row; the right-most column is a single panel spanning both rows.
figure = plt.figure(figsize=(26, 8))
gridShape = (2, 4)

a11 = plt.subplot2grid(gridShape, (0, 0), fig=figure)
a12 = plt.subplot2grid(gridShape, (0, 1), fig=figure)
a13 = plt.subplot2grid(gridShape, (0, 2), fig=figure)

a14 = plt.subplot2grid(gridShape, (0, 3), fig=figure, rowspan=2)

a21 = plt.subplot2grid(gridShape, (1, 0), fig=figure)
a22 = plt.subplot2grid(gridShape, (1, 1), fig=figure)
a23 = plt.subplot2grid(gridShape, (1, 2), fig=figure)

scoreLevels = [1, 2, 3, 4, 5]

# Bars graph: Total of review per each score.
# A single value_counts replaces five hand-written filters; reindex keeps a
# zero bar for any score level that does not occur in the data.
scoreCounts = reviewsDF[COL_REV_SCORE].value_counts().reindex(scoreLevels, fill_value=0)
bars = pd.DataFrame({'reviews': scoreCounts.tolist()}, index=scoreLevels)

bars.plot.bar(ax=a14, title='Review Scores', color='cyan')

# Bars graph: Show proportion of reviews with or without comments per each score level.
hasComment = reviewsDF[COL_REV_CUS_MSG_LEN] > 0
hasNoComment = reviewsDF[COL_REV_CUS_MSG_LEN] == 0

yes = []
no = []
scores = []

for i in scoreLevels:
    isScore = reviewsDF[COL_REV_SCORE] == i

    scores.append(f'Score: 0{i}')
    yes.append(int((isScore & hasComment).sum()))
    no.append(int((isScore & hasNoComment).sum()))

barh = pd.DataFrame({'Yes': yes, 'No': no}, index=scores)
barh.plot.barh(
    ax=a13,
    title='Comments proportion by each score',
    # Only the two plotted columns need a colour.
    color={'Yes': 'green', 'No': 'orange'},
)

# Histograms: Show length of commentaries per each review score level
figPositionMap = {
    1: a11, 2: a12,
    3: a21, 4: a22, 5: a23,
}

for i, a in figPositionMap.items():
    messageLengths = reviewsDFClean.loc[reviewsDFClean[COL_REV_SCORE] == i, COL_REV_CUS_MSG_LEN]
    a.hist(messageLengths.values, bins=15, facecolor='y', snap=False)
    a.set_title('Score ' + str(i))

figure.suptitle('Comment X Reviews')
# plt.show() renders on every backend; Figure.show() only works with a GUI
# backend and emits a warning otherwise.
plt.show()

# Orders

### Constants

In [3]:

# Column names as they appear in the orders file
COL_ORD_ID = 'order_id'
COL_ORD_CUSTOMER = 'customer_id'
COL_ORD_STATUS = 'order_status'

# Timestamp columns of the orders file
COL_ORD_DATE_BUY = 'order_purchase_timestamp'
COL_ORD_DATE_APPROV = 'order_approved_at'
COL_ORD_DATE_CARRIER = 'order_delivered_carrier_date'
COL_ORD_DATE_DELIVER_EST = 'order_estimated_delivery_date'
COL_ORD_DATE_DELIVER = 'order_delivered_customer_date'

# Columns added by this notebook
COL_ORD_CUS_TIME_APPROV = 'Time to Approve'
COL_ORD_CUS_TIME_SHIP = 'Time to Ship'
COL_ORD_CUS_TIME_DELIVER = 'Time to Deliver'
COL_ORD_CUS_TIME_DELAY = 'Delivery Delay'

# Values the status column is compared against
STATUS_ORD_CREATED = 'created'
STATUS_ORD_APPROV = 'approved'
STATUS_ORD_INVOICED = 'invoiced'
STATUS_ORD_PROCESSING = 'processing'
STATUS_ORD_SHIPPED = 'shipped'
STATUS_ORD_DELIVERED = 'delivered'
STATUS_ORD_CANCEL = 'canceled'
STATUS_ORD_UNAVAILABLE = 'unavailable'

# Sort keys applied to the orders frame
sortOrder = [
    COL_ORD_STATUS,
    COL_ORD_CUS_TIME_DELAY,
    COL_ORD_CUS_TIME_DELIVER,
]

# Columns parsed as dates when the orders file is loaded
dateColumns = [
    COL_ORD_DATE_BUY,
    COL_ORD_DATE_DELIVER,
    COL_ORD_DATE_DELIVER_EST,
]

# Column subset (and order) used when displaying the orders frame
columnsOrder = [
    COL_ORD_STATUS,
    COL_ORD_DATE_BUY,
    COL_ORD_DATE_DELIVER_EST,
    COL_ORD_DATE_DELIVER,
    COL_ORD_CUS_TIME_DELIVER,
    COL_ORD_CUS_TIME_DELAY,
]

# Status groups: "end" statuses versus "mid" statuses
endStatuses = [
    STATUS_ORD_CANCEL,
    STATUS_ORD_DELIVERED,
    STATUS_ORD_UNAVAILABLE,
]
midStatuses = [
    STATUS_ORD_CREATED,
    STATUS_ORD_APPROV,
    STATUS_ORD_INVOICED,
    STATUS_ORD_PROCESSING,
    STATUS_ORD_SHIPPED,
]

### Utils

In [103]:

def getDaysInterval(mainDate: datetime.date, relativeDate: datetime.date = None, isRelativeToToday = True):
    '''
        Return the number of whole days from `mainDate` up to a reference date.

        The reference date is the current local time when `isRelativeToToday`
        is true (the default; `relativeDate` is then ignored), otherwise it is
        `relativeDate`.

        The difference is floored, so partial days are dropped and the result
        is negative when the reference date precedes `mainDate`.

        Raises:
            ValueError: if either date is NaT (the difference is then NaN,
                which cannot be floored to an integer).
    '''

    upperDate = datetime.datetime.today() if isRelativeToToday else relativeDate
    return math.floor((upperDate - mainDate) / np.timedelta64(1, 'D'))



def getDaysDelay(maxDate: datetime.date, deliveryDate: np.datetime64 = None):
    '''
        Return how many whole days a delivery is past its deadline.

        Args:
            maxDate: the deadline.
            deliveryDate: when the delivery happened. If it is None or NaT the
                current local time is used instead.

        Returns:
            0 when the delivery (or the current time, if there is no delivery
            date) is not later than `maxDate`; otherwise the floored number of
            days elapsed since `maxDate`.
    '''

    # `deliveryDate or today` is not enough: NaT does not count as "missing"
    # in a boolean test, so test for it explicitly.
    isMissing = deliveryDate is None or pd.isna(deliveryDate)
    upperDate = datetime.datetime.today() if isMissing else deliveryDate

    if upperDate > maxDate:
        # Measure from the deadline. Measuring from `deliveryDate` compares the
        # delivery with itself and always yields 0.
        return getDaysInterval(maxDate, upperDate, False)
    return 0

def setDeliveryDays(df: pd.DataFrame) -> pd.Series:
    '''
        Compute, for every order, the whole days elapsed since its purchase.

        For orders whose status is "delivered" the interval ends at the
        customer delivery date; for every other status it ends at the current
        local time. The interval starts at the purchase timestamp.

        NOTE(review): the previous version started the interval at the
        estimated delivery date, which measures lateness rather than time to
        deliver. Confirm the purchase timestamp is the intended start.

        Args:
            df: orders frame holding the status, purchase and delivery date
                columns; the date columns must already be datetime typed.

        Returns:
            A Series aligned with `df.index`. Values are integers when every
            row can be computed, and NaN where a needed date is missing.
    '''

    isDelivered = df[COL_ORD_STATUS] == STATUS_ORD_DELIVERED
    # One shared "now" keeps all open orders measured against the same instant.
    endDate = df[COL_ORD_DATE_DELIVER].where(isDelivered, pd.Timestamp.today())
    # `.dt.days` floors the difference, matching math.floor on a day count.
    return (endDate - df[COL_ORD_DATE_BUY]).dt.days


def setDelayDays(df: pd.DataFrame) -> pd.Series:
    '''
        Compute, for every order, the whole days it is past its estimated
        delivery date.

        Orders with a customer delivery date are measured at that date; orders
        without one are measured at the current local time. Orders that are
        not late get 0.

        Computed directly on the columns: the row-by-row helper used before
        reported 0 for every order, returned a bare array rather than a
        Series, and could not handle an empty frame.

        Args:
            df: orders frame holding the estimated and actual delivery date
                columns, already datetime typed.

        Returns:
            A Series aligned with `df.index`, never negative. NaN where the
            estimated delivery date is missing.
    '''

    endDate = df[COL_ORD_DATE_DELIVER].fillna(pd.Timestamp.today())
    daysPastEstimate = (endDate - df[COL_ORD_DATE_DELIVER_EST]).dt.days
    # Early or on-time deliveries produce values <= 0; report them as no delay.
    return daysPastEstimate.clip(lower=0)

### Build Dataframe

In [109]:

# Import file
ordersDF = pd.read_csv(PATH_FILE_ORDER, parse_dates=dateColumns)

# Clean data: Step 01
# Drop "unavailable" orders, and "delivered" orders that have no delivery
# date (they would break the time interval calculations below).
isUnavailable = ordersDF[COL_ORD_STATUS] == STATUS_ORD_UNAVAILABLE
isDeliveredWithoutDate = (
    (ordersDF[COL_ORD_STATUS] == STATUS_ORD_DELIVERED)
    & ordersDF[COL_ORD_DATE_DELIVER].isnull()
)
# `.copy()` detaches the filtered rows from the frame that was read, so the
# column assignments below do not trigger SettingWithCopyWarning.
ordersDF = ordersDF[~isUnavailable & ~isDeliveredWithoutDate].copy()

# Add calculated interval fields
# The explicit conversion guarantees datetime dtype even if read_csv left a
# column unparsed.
ordersDF[dateColumns] = ordersDF[dateColumns].apply(pd.to_datetime)
ordersDF[COL_ORD_CUS_TIME_DELAY] = setDelayDays(ordersDF)
ordersDF[COL_ORD_CUS_TIME_DELIVER] = setDeliveryDays(ordersDF)

# Clean data: Step 02
# Keep a row when either interval is non-negative, or when its status is not
# one of the end statuses.
hasValidInterval = (ordersDF[COL_ORD_CUS_TIME_DELAY] >= 0) | (ordersDF[COL_ORD_CUS_TIME_DELIVER] >= 0)
isEndStatus = ordersDF[COL_ORD_STATUS].isin(endStatuses)
ordersDF = ordersDF[hasValidInterval | ~isEndStatus]

ordersDF = ordersDF.sort_values(by=sortOrder, ascending=True, na_position='first')
# ordersDF[columnsOrder].groupby(COL_ORD_STATUS).count()

Unnamed: 0_level_0,order_purchase_timestamp,order_estimated_delivery_date,order_delivered_customer_date,Time to Deliver,Delivery Delay
order_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
approved,2,2,0,2,2
canceled,625,625,6,625,625
created,5,5,0,5,5
delivered,96470,96470,96470,96470,96470
invoiced,314,314,0,314,314
processing,301,301,0,301,301
shipped,1107,1107,0,1107,1107


### Orders Preview

In [None]:
# Ad-hoc explorations kept for reference (disabled):
#   - rows with a positive delivery delay
#   - mean delivery delay per order status
# ordersDF[columnsOrder].loc[ ordersDF[COL_ORD_CUS_TIME_DELAY] > 0 ]
# ordersDF[[COL_ORD_STATUS, COL_ORD_CUS_TIME_DELAY]].groupby(COL_ORD_STATUS).mean()
# Display the first rows of the orders frame.
ordersDF.head()

