In [None]:
%pip install nltk wordcloud xgboost lightgbm catboost 

Collecting leia-br
  Downloading leia_br-0.0.1-py2.py3-none-any.whl.metadata (2.9 kB)
Downloading leia_br-0.0.1-py2.py3-none-any.whl (130 kB)
Installing collected packages: leia-br
Successfully installed leia-br-0.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:
%pip uninstall leia-br -y
%pip install leia-br

Found existing installation: leia-br 0.0.1
Uninstalling leia-br-0.0.1:
  Successfully uninstalled leia-br-0.0.1
Note: you may need to restart the kernel to use updated packages.
Collecting leia-br
  Using cached leia_br-0.0.1-py2.py3-none-any.whl.metadata (2.9 kB)
Using cached leia_br-0.0.1-py2.py3-none-any.whl (130 kB)
Installing collected packages: leia-br
Successfully installed leia-br-0.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import os
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

# Text Processing Libraries
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from collections import Counter
from nltk import ngrams

# Sentiment Analysis
from LeIA import SentimentIntensityAnalyzer

# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Machine Learning Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Evaluation Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report, 
    confusion_matrix, roc_curve, roc_auc_score, ConfusionMatrixDisplay, auc
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mumu1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mumu1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [3]:
from IPython.core.display import HTML
HTML('''<style>
div.output_area {
    max-height: none !important;
}
</style>''')

# 출력 제한 완전히 제거
get_ipython().run_line_magic('config', "InlineBackend.print_figure_kwargs = {'bbox_inches': None}") 

In [4]:
from RFM import df_order_reviews, df_product_category_name_translation, df_products, merge_coi

# Merging

In [15]:
# 1) 전체 분석 review_comment_message, product_category_name, order_id
# 2) 이탈고객 vs 잠재우수 고객

df_reviews = df_order_reviews[['order_id', 'review_comment_message']]

# merge_coi + df_products = NLP
NLP = merge_coi.merge(
    df_products[['product_id', 'product_category_name']],  
    on='product_id',
    how='inner'
)

# NLP + df_reviews = NLP
NLP = NLP.merge(
    df_reviews[['order_id', 'review_comment_message']], 
    on='order_id',
    how='inner'
)


# Data cleaning

In [45]:
# 1. 데이터 선택 및 기본 정제
clean_NLP = NLP[['order_id', 'review_comment_message', 'product_category_name']]

# 결측치 제거
clean_NLP = clean_NLP.dropna(subset=['product_category_name', 'review_comment_message'])

# 중복 제거
clean_NLP = clean_NLP.drop_duplicates(subset=['review_comment_message']).reset_index(drop=True)

clean_NLP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33495 entries, 0 to 33494
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   order_id                33495 non-null  object
 1   review_comment_message  33495 non-null  object
 2   product_category_name   33495 non-null  object
dtypes: object(3)
memory usage: 785.2+ KB


# text preprocessing

In [None]:
# 1. Portuguese stopwords 정의
STOP_WORDS = set(stopwords.words('portuguese'))

# 2. 텍스트 정제 및 토큰화 함수
def clean_and_tokenize(text):
    """
    텍스트를 정제하고 토큰화
    
    Returns:
        tuple: (정제된 텍스트, 토큰 리스트)
    """
    if not isinstance(text, str):
        return "", []
    
    # 소문자 변환 및 구두점 제거
    cleaned_text = text.lower().translate(str.maketrans('', '', string.punctuation))
    
    # 토큰화
    words = cleaned_text.split()
    
    # 불용어 제거
    filtered_words = [word for word in words if word not in STOP_WORDS]
    
    return " ".join(filtered_words), filtered_words

# 3. 전처리 적용
clean_NLP[['review_comment_message_clean', 'review_comment_message_tokens']] = \
    clean_NLP['review_comment_message'].apply(
        lambda text: pd.Series(clean_and_tokenize(text))
    )

clean_NLP[['review_comment_message', 'review_comment_message_clean']].head(3)
clean_NLP.head(10)

                           review_comment_message  \
0  O baratheon Ã¨ esxelente Amo adoro o baratheon   
1                               Loja responsÃ¡vel   
2                       chegou antes do prometido   

                 review_comment_message_clean  
0  baratheon ã¨ esxelente amo adoro baratheon  
1                           loja responsã¡vel  
2                      chegou antes prometido  


Unnamed: 0,order_id,review_comment_message,product_category_name,review_comment_message_clean,review_comment_message_tokens
0,6b7d50bd145f6fc7f33cebabd7e49d0f,O baratheon Ã¨ esxelente Amo adoro o baratheon,casa_conforto,baratheon ã¨ esxelente amo adoro baratheon,"[baratheon, ã¨, esxelente, amo, adoro, baratheon]"
1,5741ea1f91b5fbab2bd2dc653a5b5099,Loja responsÃ¡vel,esporte_lazer,loja responsã¡vel,"[loja, responsã¡vel]"
2,1ebeea841c590e86a14a0d7a48e7d062,chegou antes do prometido,brinquedos,chegou antes prometido,"[chegou, antes, prometido]"
3,7433cbcc783205509d66a5260da5b574,"Ã³timo, entregou antes da data prevista.",moveis_decoracao,ã³timo entregou antes data prevista,"[ã³timo, entregou, antes, data, prevista]"
4,8428e578bb1cf839ae26a6b7615502b9,Td certo.Produto e prazo de entrega.,automotivo,td certoproduto prazo entrega,"[td, certoproduto, prazo, entrega]"
5,f86c5ed7048ac10eb88ec21c00f71892,"Pena o produto nÃ£o ter sido entregue em casa,...",informatica_acessorios,pena produto nã£o ter sido entregue casa q ret...,"[pena, produto, nã£o, ter, sido, entregue, cas..."
6,852d2f4d37773bcbc21c8e09a05a4ea5,Produto chegou no prazo o problema que veio na...,telefonia,produto chegou prazo problema veio frente capa...,"[produto, chegou, prazo, problema, veio, frent..."
7,eac76692452422620996fe5e1a7f8bb0,Ameiiii !!! excelente produto...excelente qual...,ferramentas_jardim,ameiiii excelente produtoexcelente qualidade e...,"[ameiiii, excelente, produtoexcelente, qualida..."
8,72bab69c50432c6f94d8b50a5f84b69a,Produto chegou antes da data prevista.,automotivo,produto chegou antes data prevista,"[produto, chegou, antes, data, prevista]"
9,aaff8afa47c8426e414a6d908a97713c,Bom dia eu fiz uma compra de 03 peÃ§as sÃ³ mim...,ferramentas_jardim,bom dia fiz compra 03 peã§as sã³ mim entregaro...,"[bom, dia, fiz, compra, 03, peã§as, sã³, mim, ..."
