In [1]:
# import packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
# read in data file for order review analysis
reviews = pd.read_csv('/Users/joelam/docs/Data Science/Brazilian_Ecomm/Data Files/processed/orders_sales_reviews.csv')

In [3]:
# review first 5 lines of reviews
reviews.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,sales,cust_id,cust_zip,units,review_message,review_score,days_from_est_deliv,order_count,days_between_orders,on_time
0,e22acc9c116caa3f2b7121bbb380d08e,fadbb3709178fc513abc1b2670aa1ad2,delivered,2018-05-10 10:56:27,2018-05-10 11:11:18,2018-05-12 08:18:00,2018-05-16 20:48:37,2018-05-21 00:00:00,129.899994,0000366f3b9a7992bf8c76cfdf3221e2,7787,1,"Adorei a cortina, ficou linda na minha sala, e...",5.0,-5.0,1,,on time
1,3594e05a005ac4d06a72673270ef9ec9,4cb282e167ae9234755102258dd52ee8,delivered,2018-05-07 11:11:27,2018-05-07 18:25:44,2018-05-09 12:18:00,2018-05-10 18:02:42,2018-05-15 00:00:00,18.9,0000b849f77a49e4a4ce2b2a4ca5be3f,6053,1,,4.0,-5.0,1,,on time
2,b33ec3b699337181488304f362a6b734,9b3932a6253894a02c1df9d19004239f,delivered,2017-03-10 21:05:03,2017-03-10 21:05:03,2017-03-13 12:58:30,2017-04-05 14:38:47,2017-04-07 00:00:00,69.0,0000f46a3911fa3c0805444483337064,88115,1,,3.0,-2.0,1,,on time
3,41272756ecddd9a9ed0180413cc22fb6,914991f0c02ef0843c0e7010c819d642,delivered,2017-10-12 20:29:41,2017-10-12 20:49:17,2017-10-13 20:08:19,2017-11-01 21:23:05,2017-11-13 00:00:00,25.99,0000f6ccb0745a6a4b88665a16c9f078,66812,1,Bom vendedor,4.0,-12.0,1,,on time
4,d957021f1127559cd947b62533f484f7,47227568b10f5f58a524a75507e6992c,delivered,2017-11-14 19:45:42,2017-11-14 20:06:52,2017-11-16 19:52:10,2017-11-27 23:08:56,2017-12-05 00:00:00,180.0,0004aac84e0df4da2b147fca70cf8255,18040,1,,5.0,-8.0,1,,on time


In [4]:
# Categorize the review messages

# convert NaN to strings in review messages
data = reviews['review_message'].astype(str)

df = pd.DataFrame(data)

# Define exact phrases or words for categorization
exact_phrases_categories = {
    'bom': 'good_experience',
}

# Define partial phrases with regular expressions for categorization
partial_phrases_patterns = {
    r'não .* eficiente': 'poor_experience',
    r'nao .* eficiente': 'poor_experience',
    r'não .* satisfeit': 'poor_experience',
    r'nao .* satisfeit': 'poor_experience',
}

# Define partial phrases for categorization
partial_phrases_categories = {
    'qualidade ruim': 'poor_quality', 
    'péssima qualidade': 'poor_quality', 
    'pessima qualidade': 'poor_quality', 
    'baixa qualidade': 'poor_quality', 
    'não é bom': 'poor_quality', 
    'produto deixou': 'poor_quality', 
    'produto não funciona': 'poor_quality',
    'produto veio quebrado': 'poor_quality',  
    'falho': 'poor_quality', 
    'defeituoso': 'poor_quality', 
    'não funciona direito': 'poor_quality', 
    'não recomendado produto': 'poor_quality', 
    'não recomendo produto': 'poor_quality', 
    'antes do': 'arrived_early', 
    'antes mesmo': 'arrived_early', 
    'até antes': 'arrived_early', 
    'ate antes': 'arrived_early',
    'antes da': 'arrived_early', 
    'bem antes': 'arrived_early', 
    'chegou antes': 'arrived_early', 
    'chegou adiantado': 'arrived_early', 
    'entrega rápid': 'arrived_early', 
    'rapid entrega': 'arrived_early',
    'entrega super rápid': 'arrived_early', 
    'entrega super rapid': 'arrived_early', 
    'mercadoria antes': 'arrived_early',
    'muito rápido': 'arrived_early', 
    'muito rapido': 'arrived_early', 
    'muito rápida': 'arrived_early', 
    'muito rapida': 'arrived_early', 
    'adorei o produtos': 'good_quality', 
    'produto em perfeito estado': 'good_quality', 
    'recomendo o produto': 'good_quality', 
    'recomendo produto': 'good_quality', 
    'excelente qualidad': 'good_quality', 
    'qualidade excelente': 'good_quality', 
    'produto excelente': 'good_quality', 
    'produto de qualidade': 'good_quality', 
    'gostei do produto': 'good_quality', 
    'produto perfeito': 'good_quality', 
    'ótimo produto': 'good_quality', 
    'otimo produto': 'good_quality',
    'ótima qualidade': 'good_quality', 
    'otima qualidade': 'good_quality',
    'produto ótimo': 'good_quality',
    'produto otimo': 'good_quality',
    'bom produto': 'good_quality',
    'bom o produto': 'good_quality',
    'produto bom': 'good_quality',   
    'amei o produto': 'good_quality',
    'produto muito bom': 'good_quality',
    'produto é bom': 'good_quality',  
    'produto e bom': 'good_quality',
    'não recebi': 'did_not_receive', 
    'não entregue': 'did_not_receive',
    'nao recebi': 'did_not_receive', 
    'nao entregue': 'did_not_receive',
    'não foi entregue': 'did_not_receive', 
    'nao foi entregue': 'did_not_receive', 
    'não entrega': 'did_not_receive', 
    'nao entrega': 'did_not_receive', 
    'muito bom': 'good_experience', 
    'excelente': 'good_experience', 
    'exelente': 'good_experience', 
    'errado': 'wrong_product', 
    'perfeito': 'good_experience', 
    'ótimo': 'good_experience', 
    'otimo': 'good_experience', 
    'ótima': 'good_experience', 
    'otima': 'good_experience', 
    'muito bo': 'good_experience',
    'bom atendimento': 'good_experience', 
    'não recomend': 'poor_experience', 
    'nao recomend': 'poor_experience', 
    'recomend': 'good_experience', 
    'inferior': 'poor_quality', 
    'eficiente': 'good_experience',
    'satisfeit': 'good_experience',  
    'nan': 'did_not_leave_a_review'
}

# Function to categorize comments based on exact and partial matches
def categorize_comment(comment):
    # Convert comment to lowercase for case-insensitive matching
    comment_lower = comment.lower()

    # Check for exact matches
    for phrase, category in exact_phrases_categories.items():
        if comment_lower == phrase:
            return category
        
    # Check for partial matches with regular expressions
    for pattern, category in partial_phrases_patterns.items():
        if re.search(pattern, comment_lower):
            return category
    
    # Check for partial matches
    for phrase, category in partial_phrases_categories.items():
        if phrase in comment_lower:
            return category
    
    return 'Uncategorized'

# Apply the function to create a new column
reviews['review_cat'] = df['review_message'].apply(categorize_comment)

In [5]:
# summarize counts of newly created review categories
reviews['review_cat'].value_counts()

did_not_leave_a_review    58713
Uncategorized             19375
good_experience            8052
arrived_early              6660
did_not_receive            2792
good_quality               2784
poor_quality                397
wrong_product               340
poor_experience             328
Name: review_cat, dtype: int64

In [6]:
# create an uncategorized dataframe as a variable to assess further categorizations
uncategorized_reviews = reviews[reviews['review_cat'] == 'Uncategorized']

In [7]:
# export csv file
reviews.to_csv('/Users/joelam/docs/Data Science/Brazilian_Ecomm/Data Files/processed/order_sales_review_cats.csv', index=False)

In [8]:
# check sum total to ensure dups were removed
reviews['sales'].sum()

13591643.701720357