In [1]:
import numpy as np
import random # 시드 고정을 위해
import os # 시드 고정을 위해
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import torch
from torch import Tensor,tensor
from torch.utils.data import DataLoader,Dataset
from torch.nn import Module,Sequential

db = pd.read_excel(r'D:\Project\financial_marketing\Final\Zigzag\Zigzag_streamlit\data\DB.xlsx', index_col=False)
db.fillna(0,inplace=True)
db['리뷰']=db['리뷰'].str.replace('\n',' ')
main_features=['색감','핏','재질','퀄리티','제품상태','가격','두께']

In [2]:
thick_data_1 = pd.DataFrame(db[db['두께'] != 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
thick_data_1['target'] = 1

thick_data_2 = pd.DataFrame(db[db['두께'] == 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
thick_data_2['target'] = 0

thick_data = pd.concat([thick_data_1, thick_data_2], axis=0).reset_index(drop=True)
thick_data.isnull().sum()

리뷰        0
target    0
dtype: int64

In [3]:
from mecab import MeCab
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

filter=['NNG','MAG','EC','VA','VA+EF','VV+ETM','NNB+JKB','VCP+EC','VCP','MAG+JX','VCN']

def tokenizer(data):
    tokenizer = MeCab()
    
    list = []
    
    for text in tqdm(data["리뷰"]):
        prah = []  
        lst = tokenizer.pos(text)  
        for word, pos in lst:
            if pos in filter:
                prah.append(word)  
        list.append(' '.join(prah))  
    
    return pd.DataFrame({'tokens':list})

DB_list=tokenizer(thick_data)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=100)
vectorizer=vectorizer.fit(DB_list["tokens"])
joblib.dump(vectorizer,'thick_vectorizer.pkl')

legacy_tfidf=vectorizer.transform(DB_list["tokens"])
legacy_tfidf=legacy_tfidf.toarray()

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler = scaler.fit(legacy_tfidf)
joblib.dump(scaler,'thick_scaler.pkl')

legacy_tfidf=scaler.transform(legacy_tfidf)

from catboost import CatBoostClassifier


catboost_model = CatBoostClassifier(random_state=42, verbose=0)


catboost_model.fit(legacy_tfidf, thick_data['target'])


feature_importances = catboost_model.feature_importances_

feature_names = vectorizer.get_feature_names_out()


sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

  0%|          | 0/7193 [00:00<?, ?it/s]

100%|██████████| 7193/7193 [00:09<00:00, 739.78it/s] 


In [4]:
top_10_indices = sorted_indices[:10]

legacy_tfidf_filtered = legacy_tfidf[:, top_10_indices]

In [5]:
joblib.dump(top_10_indices,'thick_col.pkl')

['thick_col.pkl']

In [6]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the objective function for optimization
def objective(trial):
    # Define the search space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
    }
    
    # Initialize CatBoostClassifier with suggested parameters
    catboost_model = CatBoostClassifier(**params, random_state=42, verbose=0)
    
    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(legacy_tfidf_filtered, thick_data['target'], test_size=0.2, random_state=42)
    
    # Fit the model
    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=10, verbose=False)
    
    # Get validation predictions
    y_pred = catboost_model.predict(X_val)
    
    # Calculate precision, recall, and F1 score for class '1'
    precision = precision_score(y_val, y_pred, pos_label=1)
    recall = recall_score(y_val, y_pred, pos_label=1)
    f1 = f1_score(y_val, y_pred, pos_label=1)
    
    return f1

# Create an Optuna study object
study = optuna.create_study(direction='maximize')

# Optimize the objective function
study.optimize(objective, n_trials=50)

# Get the best parameters and best score
best_params = study.best_params
best_score = study.best_value

print("Best Parameters:", best_params)
print("Best F1 Score for class '1':", best_score)

# Train CatBoostClassifier with the best parameters
best_catboost_model = CatBoostClassifier(**best_params, random_state=42, verbose=0)


[I 2024-02-22 14:28:19,127] A new study created in memory with name: no-name-3570d46e-8e92-4463-8dea-4e3cc93aac79
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-22 14:28:20,676] Trial 0 finished with value: 0.32628398791540786 and parameters: {'iterations': 220, 'learning_rate': 0.007814000287408295, 'depth': 9, 'l2_leaf_reg': 3.235301417477438, 'border_count': 76, 'random_strength': 1.5409331292201351, 'bagging_temperature': 2.408341202759616}. Best is trial 0 with value: 0.32628398791540786.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': tr

Best Parameters: {'iterations': 487, 'learning_rate': 0.05479327504722714, 'depth': 5, 'l2_leaf_reg': 3.875693792068282, 'border_count': 123, 'random_strength': 0.7544946076020188, 'bagging_temperature': 9.868501805541914}
Best F1 Score for class '1': 0.367816091954023


In [7]:
best_catboost_model.fit(legacy_tfidf_filtered, thick_data['target'])
joblib.dump(best_catboost_model, 'thick_catboost_model.pkl')

['thick_catboost_model.pkl']