In [30]:
import numpy as np
import random # 시드 고정을 위해
import os # 시드 고정을 위해
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import torch
from torch import Tensor,tensor
from torch.utils.data import DataLoader,Dataset
from torch.nn import Module,Sequential

device= 'cuda' if torch.cuda.is_available() else 'cpu'

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

import pymysql
import pandas as pd

conn = pymysql.connect(host='127.0.0.1', user='wodus1530',
                       password='dkvms7255', db='Zigzag')


cursor = conn.cursor()


sql_query = 'SELECT * FROM db'
cursor.execute(sql_query)

result = cursor.fetchall()
column_names = [i[0] for i in cursor.description]


db = pd.DataFrame(result, columns=column_names)

cursor.close()
conn.close()



In [31]:
outfit_data_1 = pd.DataFrame(db[db['핏'] != 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
outfit_data_1['target'] = 1

outfit_data_2 = pd.DataFrame(db[db['핏'] == 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
outfit_data_2['target'] = 0

outfit_data = pd.concat([outfit_data_1, outfit_data_2], axis=0).reset_index(drop=True)
outfit_data.isnull().sum()

리뷰        0
target    0
dtype: int64

In [32]:
from mecab import MeCab
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

filter=['NNG','MAG','EC','VA','VA+EF','VV+ETM','NNB+JKB','VCP+EC','VCP','MAG+JX','VCN']

def tokenizer(data):
    tokenizer = MeCab()
    
    list = []
    
    for text in tqdm(data["리뷰"]):
        prah = []  
        lst = tokenizer.pos(text)  
        for word, pos in lst:
            if pos in filter:
                prah.append(word)  
        list.append(' '.join(prah))  
    
    return pd.DataFrame({'tokens':list})

DB_list=tokenizer(outfit_data)

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=100)
legacy_tfidf = vectorizer.fit_transform(DB_list["tokens"])

joblib.dump(vectorizer,'outfit_vectorizer.pkl')

legacy_tfidf=legacy_tfidf.toarray()

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

legacy_tfidf = scaler.fit_transform(legacy_tfidf)

joblib.dump(scaler,'outfit_scaler.pkl')

from catboost import CatBoostClassifier


catboost_model = CatBoostClassifier(random_state=42, verbose=0)


catboost_model.fit(legacy_tfidf, outfit_data['target'])


feature_importances = catboost_model.feature_importances_

feature_names = vectorizer.get_feature_names_out()


sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

100%|██████████| 5135/5135 [00:03<00:00, 1710.96it/s]


In [33]:
important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

In [34]:
joblib.dump(important_feature_indices,'outfit_col.pkl')

['outfit_col.pkl']

In [35]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the objective function for optimization
def objective(trial):
    # Define the search space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
    }
    
    # Initialize CatBoostClassifier with suggested parameters
    catboost_model = CatBoostClassifier(**params, random_state=42, verbose=0)
    
    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(legacy_tfidf_filtered, outfit_data['target'], test_size=0.2, random_state=42)
    
    # Fit the model
    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=10, verbose=False)
    
    # Get validation predictions
    y_pred = catboost_model.predict(X_val)
    
    # Calculate precision, recall, and F1 score for class '1'
    precision = precision_score(y_val, y_pred, pos_label=1)
    recall = recall_score(y_val, y_pred, pos_label=1)
    f1 = f1_score(y_val, y_pred, pos_label=1)
    
    return f1

# Create an Optuna study object
study = optuna.create_study(direction='maximize')

# Optimize the objective function
study.optimize(objective, n_trials=50)

# Get the best parameters and best score
best_params = study.best_params
best_score = study.best_value

print("Best Parameters:", best_params)
print("Best F1 Score for class '1':", best_score)

# Train CatBoostClassifier with the best parameters
best_catboost_model = CatBoostClassifier(**best_params, random_state=42, verbose=0)


[I 2024-02-19 19:19:56,000] A new study created in memory with name: no-name-2cf91f87-5f45-46f2-a6da-a9a2489f7f21
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-19 19:20:00,085] Trial 0 finished with value: 0.7310252996005326 and parameters: {'iterations': 453, 'learning_rate': 0.0019288508082283067, 'depth': 7, 'l2_leaf_reg': 4.427051776176991, 'border_count': 246, 'random_strength': 9.6034378105466, 'bagging_temperature': 0.43656460359017396}. Best is trial 0 with value: 0.7310252996005326.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': tri

Best Parameters: {'iterations': 347, 'learning_rate': 0.0023871257927802997, 'depth': 10, 'l2_leaf_reg': 1.3475837016217613, 'border_count': 86, 'random_strength': 5.5274115050156905, 'bagging_temperature': 0.6517711249724978}
Best F1 Score for class '1': 0.7384615384615385


In [9]:
best_catboost_model.fit(legacy_tfidf_filtered, outfit_data['target'])
joblib.dump(best_catboost_model, 'outfit_catboost_model.pkl')

0:	learn: 0.6880533	total: 23.7ms	remaining: 11.3s
1:	learn: 0.6831901	total: 58.2ms	remaining: 13.8s
2:	learn: 0.6796883	total: 77.6ms	remaining: 12.3s
3:	learn: 0.6751735	total: 101ms	remaining: 12s
4:	learn: 0.6713727	total: 124ms	remaining: 11.7s
5:	learn: 0.6669132	total: 140ms	remaining: 11s
6:	learn: 0.6631145	total: 160ms	remaining: 10.8s
7:	learn: 0.6599793	total: 181ms	remaining: 10.6s
8:	learn: 0.6563399	total: 200ms	remaining: 10.4s
9:	learn: 0.6530024	total: 220ms	remaining: 10.3s
10:	learn: 0.6501297	total: 237ms	remaining: 10.1s
11:	learn: 0.6471996	total: 257ms	remaining: 9.95s
12:	learn: 0.6445031	total: 279ms	remaining: 9.97s
13:	learn: 0.6424865	total: 299ms	remaining: 9.9s
14:	learn: 0.6401254	total: 317ms	remaining: 9.77s
15:	learn: 0.6376230	total: 338ms	remaining: 9.74s
16:	learn: 0.6353186	total: 358ms	remaining: 9.69s
17:	learn: 0.6339533	total: 379ms	remaining: 9.66s
18:	learn: 0.6324332	total: 398ms	remaining: 9.6s
19:	learn: 0.6304173	total: 418ms	remaining:

['outfit_catboost_model.pkl']