In [None]:
# !pip install -r ../requirements.txt

In [4]:
import pandas as pd

base_path = "../data/"

# Read the four Olist CSV exports from `base_path`, in the same order as the
# names on the left-hand side. Each file becomes its own DataFrame.
orders, reviews, items, customers = (
    pd.read_csv(base_path + csv_name)
    for csv_name in (
        "olist_orders_dataset.csv",
        "olist_order_reviews_dataset.csv",
        "olist_order_items_dataset.csv",
        "olist_customers_dataset.csv",
    )
)

# print("orders:", orders.shape)
# print("reviews:", reviews.shape)
# print("items:", items.shape)
# print("customers:", customers.shape)

# orders.head()

In [6]:
# import sys
# !{sys.executable} -m pip install ydata-profiling


In [8]:
from ydata_profiling import ProfileReport

# Short table name -> DataFrame; the name is reused for the console message,
# the report title and the output file name.
dfs = dict(orders=orders, reviews=reviews, items=items, customers=customers)

# Build one exploratory profiling report per table and write it as HTML
# into the current working directory.
for name, df in dfs.items():
    print(f"Generating report for: {name}")
    report_title = f"{name.capitalize()} EDA Report"
    output_file = f"{name}_eda_report.html"
    profile = ProfileReport(df, title=report_title, explorative=True)
    profile.to_file(output_file)


Generating report for: orders




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/8 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.74it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generating report for: reviews




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/7 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:05<00:00,  1.31it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generating report for: items




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/7 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.42it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generating report for: customers




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.15it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# print("=== ORDERS ===")
# print(orders.shape)
# Show the dtype pandas assigned to each column of `orders`.
column_dtypes = orders.dtypes
print(column_dtypes)
# print(orders.isnull().sum())
# print(orders.nunique())
# print(orders.describe())
# print(orders.describe(include='object'))


In [None]:
# Parse the three timestamp columns used later in the notebook so that
# date arithmetic and max() operate on datetimes.
for date_col in (
    "order_purchase_timestamp",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
):
    orders[date_col] = pd.to_datetime(orders[date_col])

# One row per customer_id: number of distinct orders and the most recent
# purchase timestamp.
# NOTE(review): confirm that `customer_id` identifies a person rather than a
# single order; if it is assigned per order, `total_orders` will always be 1.
customer_orders = (
    orders.groupby("customer_id")
    .agg(
        total_orders=("order_id", "nunique"),
        last_order_date=("order_purchase_timestamp", "max"),
    )
    .reset_index()
)

customer_orders.head()


In [None]:
# Inner join on order_id: keeps only orders that have a matching review row
# (an order matched by several review rows appears once per review).
orders_reviews = orders.merge(reviews, how="inner", on="order_id")


In [None]:
# Preview the first five review rows.
reviews.head(n=5)

In [None]:
# Mean review score per customer_id, returned as a two-column frame
# (customer_id, avg_review_score).
avg_review_scores = (
    orders_reviews.groupby("customer_id")["review_score"]
    .mean()
    .rename("avg_review_score")
    .reset_index()
)


In [None]:
# Start the per-customer summary from the order counts and attach the mean
# review score; the left join keeps customers that have no review.
customer_summary = customer_orders.merge(
    avg_review_scores,
    how="left",
    on="customer_id",
)
customer_summary.head()


In [None]:
# Delivery delay = actual delivery date - estimated delivery date, so a
# positive value means the order arrived after the estimate.
# `.dt.days` keeps only the whole-day component of the difference.
delivered_minus_estimated = (
    orders["order_delivered_customer_date"] - orders["order_estimated_delivery_date"]
)
orders["delivery_delay_days"] = delivered_minus_estimated.dt.days


In [None]:
# Mean delivery delay (in days) per customer_id, as a two-column frame
# (customer_id, avg_delivery_delay).
avg_delay = (
    orders.groupby("customer_id")["delivery_delay_days"]
    .mean()
    .rename("avg_delivery_delay")
    .reset_index()
)


In [None]:
# Attach each customer's mean delivery delay to the summary table.
#
# This cell rebinds `customer_summary` from itself, so it must be safe to run
# more than once: drop any `avg_delivery_delay` column left by a previous run
# before merging. Without the drop, a re-run would merge the column twice and
# pandas would emit `avg_delivery_delay_x` / `avg_delivery_delay_y`, and the
# plain `avg_delivery_delay` column would no longer exist.
# `errors="ignore"` makes the drop a no-op on the first run.
customer_summary = pd.merge(
    customer_summary.drop(columns=["avg_delivery_delay"], errors="ignore"),
    avg_delay,
    on="customer_id",
    how="left",
)
customer_summary.head()


In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-customer mean delivery delay (30 bins), drawn on an
# explicitly created Axes.
fig, ax = plt.subplots()
customer_summary['avg_delivery_delay'].hist(bins=30, ax=ax)
ax.set_xlabel('Average Delivery Delay')
ax.set_ylabel('Number of Customers')
ax.set_title('Distribution of Average Delivery Delay')
plt.show()


In [None]:
# import sys
# !{sys.executable} -m pip install textblob


In [None]:
# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

In [None]:
# pip install deep-translator


In [None]:
from textblob import TextBlob
from deep_translator import GoogleTranslator

def get_sentiment_v2(text):
    """Translate ``text`` to English and return its TextBlob polarity.

    Parameters
    ----------
    text : str or missing value
        Review comment; the translator auto-detects the source language.

    Returns
    -------
    float or None
        ``TextBlob(translated).sentiment.polarity`` for the English
        translation, or ``None`` when ``text`` is missing, is not a string,
        is blank, or when translation / scoring raises an error.
    """
    # Anything that is not a string (None, NaN, pd.NA, numbers) has no text to
    # score. Checking the type first also keeps `.strip()` from raising
    # AttributeError on such values.
    if not isinstance(text, str) or text.strip() == "":
        return None
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        return TextBlob(translated).sentiment.polarity
    except Exception:
        # Best-effort: a failed translation or scoring yields a missing score
        # instead of aborting the whole apply. Catch `Exception` rather than
        # using a bare `except` so KeyboardInterrupt / SystemExit still
        # propagate and a long run can be interrupted.
        return None


In [None]:
from tqdm.auto import tqdm
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Score a fixed, seeded random sample of 100 reviews rather than the whole
# table, since every non-empty comment goes through the translator.
sample_reviews = reviews.sample(n=100, random_state=42).copy()
comment_messages = sample_reviews["review_comment_message"]
sample_reviews["sentiment_score"] = comment_messages.progress_apply(get_sentiment_v2)


In [None]:
# Count the sampled reviews that have no comment text.
sample_reviews["review_comment_message"].isna().sum()

In [None]:
from deep_translator import GoogleTranslator

def try_translate(text):
    """Return the English translation of ``text``, or ``None`` if unavailable.

    Parameters
    ----------
    text : str or missing value
        Review comment; the translator auto-detects the source language.

    Returns
    -------
    str or None
        The result of ``GoogleTranslator(...).translate(text)``, or ``None``
        when ``text`` is missing, is not a string, is blank, or when the
        translation call raises an error.
    """
    # Anything that is not a string (None, NaN, pd.NA, numbers) has nothing to
    # translate. Checking the type first also keeps `.strip()` from raising
    # AttributeError on such values.
    if not isinstance(text, str) or text.strip() == "":
        return None
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # Best-effort: a failed translation becomes a missing value. Catch
        # `Exception` rather than using a bare `except` so KeyboardInterrupt /
        # SystemExit still propagate and a long run can be interrupted.
        return None

# Add a column holding the English translation next to the original comment.
original_comments = sample_reviews["review_comment_message"]
sample_reviews["translated_comment"] = original_comments.apply(try_translate)

# Show only the first 100 rows, original and translation side by side.
side_by_side_columns = ["review_comment_message", "translated_comment"]
sample_reviews[side_by_side_columns].head(100)


In [None]:
# Number of reviews for each star rating (indexed by review_score).
total_by_score = reviews.groupby(by='review_score').size()

In [None]:
# Number of reviews per star rating that have a non-null comment message.
text_by_score = (
    reviews.loc[reviews['review_comment_message'].notna()]
    .groupby('review_score')
    .size()
)

In [None]:
# Share of reviews with text per star rating, rounded to two decimals.
# A rating with no text reviews is absent from `text_by_score`, so the
# division yields NaN there; treat that as a ratio of 0.
text_ratio = text_by_score.div(total_by_score).fillna(0).round(2)

In [None]:
# Put the three per-rating series side by side, aligned on review_score.
comparison_df = pd.DataFrame(
    data={
        'Total Reviews': total_by_score,
        'Reviews with Text': text_by_score,
        'Text Review Ratio': text_ratio,
    }
)

print(comparison_df)

In [None]:
# Bar chart of the text-review ratio for each star rating, drawn on an
# explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 5))
text_ratio.sort_index().plot(kind='bar', color='skyblue', ax=ax)

ax.set_title('Text Review Ratio by Star Rating')
ax.set_xlabel('Review Score')
ax.set_ylabel('Ratio of Reviews with Text')
ax.set_ylim(0, 1)  # ratios are fractions, so fix the axis to the full 0-1 range
ax.tick_params(axis='x', labelrotation=0)  # keep the rating labels horizontal
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()