In [1]:
import numpy as np
import random # 시드 고정을 위해
import os # 시드 고정을 위해
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import torch
from torch import Tensor,tensor
from torch.utils.data import DataLoader,Dataset
from torch.nn import Module,Sequential

db = pd.read_excel(r'D:\Project\financial_marketing\Final\Zigzag\Zigzag_streamlit\data\DB.xlsx', index_col=False)
db.fillna(0,inplace=True)
db['리뷰']=db['리뷰'].str.replace('\n',' ')
main_features=['색감','핏','재질','퀄리티','제품상태','가격','두께']

In [2]:
rsc_data_1 = pd.DataFrame(db[db['재질'] != 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
rsc_data_1['target'] = 1

rsc_data_2 = pd.DataFrame(db[db['재질'] == 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
rsc_data_2['target'] = 0

rsc_data = pd.concat([rsc_data_1, rsc_data_2], axis=0).reset_index(drop=True)
rsc_data.isnull().sum()

리뷰        0
target    0
dtype: int64

In [3]:
from mecab import MeCab
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

filter=['NNG','MAG','EC','VA','VA+EF','VV+ETM','NNB+JKB','VCP+EC','VCP','MAG+JX','VCN']

def tokenizer(data):
    tokenizer = MeCab()
    
    list = []
    
    for text in tqdm(data["리뷰"]):
        prah = []  
        lst = tokenizer.pos(text)  
        for word, pos in lst:
            if pos in filter:
                prah.append(word)  
        list.append(' '.join(prah))  
    
    return pd.DataFrame({'tokens':list})

DB_list=tokenizer(rsc_data)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=100)
vectorizer=vectorizer.fit(DB_list["tokens"])
joblib.dump(vectorizer,'texture_vectorizer.pkl')

legacy_tfidf=vectorizer.transform(DB_list["tokens"])
legacy_tfidf=legacy_tfidf.toarray()

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler = scaler.fit(legacy_tfidf)
joblib.dump(scaler,'texture_scaler.pkl')

legacy_tfidf=scaler.transform(legacy_tfidf)

from catboost import CatBoostClassifier


catboost_model = CatBoostClassifier(random_state=42, verbose=0)


catboost_model.fit(legacy_tfidf, rsc_data['target'])


feature_importances = catboost_model.feature_importances_

feature_names = vectorizer.get_feature_names_out()


sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

  2%|▏         | 133/7193 [00:00<00:10, 704.59it/s]

100%|██████████| 7193/7193 [00:06<00:00, 1044.45it/s]


In [4]:
important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

In [5]:
joblib.dump(important_feature_indices,'texture_col.pkl')

['texture_col.pkl']

In [6]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the objective function for optimization
def objective(trial):
    # Define the search space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
    }
    
    # Initialize CatBoostClassifier with suggested parameters
    catboost_model = CatBoostClassifier(**params, random_state=42, verbose=0)
    
    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(legacy_tfidf_filtered, rsc_data['target'], test_size=0.2, random_state=42)
    
    # Fit the model
    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=10, verbose=False)
    
    # Get validation predictions
    y_pred = catboost_model.predict(X_val)
    
    # Calculate precision, recall, and F1 score for class '1'
    precision = precision_score(y_val, y_pred, pos_label=1)
    recall = recall_score(y_val, y_pred, pos_label=1)
    f1 = f1_score(y_val, y_pred, pos_label=1)
    
    return f1

# Create an Optuna study object
study = optuna.create_study(direction='maximize')

# Optimize the objective function
study.optimize(objective, n_trials=50)

# Get the best parameters and best score
best_params = study.best_params
best_score = study.best_value

print("Best Parameters:", best_params)
print("Best F1 Score for class '1':", best_score)

# Train CatBoostClassifier with the best parameters
best_catboost_model = CatBoostClassifier(**best_params, random_state=42, verbose=0)


[I 2024-02-22 14:23:39,996] A new study created in memory with name: no-name-740f222b-93a9-4864-b514-78a0bf8defb6
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-22 14:23:44,108] Trial 0 finished with value: 0.5902097902097903 and parameters: {'iterations': 419, 'learning_rate': 0.020198295661503215, 'depth': 8, 'l2_leaf_reg': 4.992284760393922, 'border_count': 166, 'random_strength': 5.920443973813208, 'bagging_temperature': 1.281639134639321}. Best is trial 0 with value: 0.5902097902097903.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': tria

Best Parameters: {'iterations': 179, 'learning_rate': 0.0014407844830673954, 'depth': 10, 'l2_leaf_reg': 3.4782762820825517, 'border_count': 113, 'random_strength': 0.13409164356274414, 'bagging_temperature': 4.882928148833553}
Best F1 Score for class '1': 0.6431372549019608


In [7]:
best_catboost_model.fit(legacy_tfidf_filtered, rsc_data['target'])
joblib.dump(best_catboost_model, 'texture_catboost_model.pkl')

['texture_catboost_model.pkl']