In [111]:
import numpy as np
import random # 시드 고정을 위해
import os # 시드 고정을 위해
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import torch
from torch import Tensor,tensor
from torch.utils.data import DataLoader,Dataset
from torch.nn import Module,Sequential

# Use the GPU when torch reports that CUDA is available; otherwise fall back to the CPU.
device= 'cuda' if torch.cuda.is_available() else 'cpu'

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic runs.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

import pymysql
import pandas as pd

# Load the whole `db` table into a DataFrame.
# Credentials are read from the environment so they are never stored in the
# notebook or its version history. ZIGZAG_DB_USER and ZIGZAG_DB_PASSWORD are
# required (a missing one raises KeyError); host and database name fall back
# to the values this notebook has always used.
conn = pymysql.connect(
    host=os.environ.get('ZIGZAG_DB_HOST', '127.0.0.1'),
    user=os.environ['ZIGZAG_DB_USER'],
    password=os.environ['ZIGZAG_DB_PASSWORD'],
    db=os.environ.get('ZIGZAG_DB_NAME', 'Zigzag'),
)

sql_query = 'SELECT * FROM db'

# try/finally guarantees the cursor and connection are released even when the
# query fails.
try:
    cursor = conn.cursor()
    try:
        cursor.execute(sql_query)
        result = cursor.fetchall()
        # cursor.description holds one tuple per column; item 0 is its name.
        column_names = [i[0] for i in cursor.description]
    finally:
        cursor.close()
finally:
    conn.close()

db = pd.DataFrame(result, columns=column_names)



In [112]:
# Build the labelled review set: reviews whose '색감' value is non-zero get
# target 1, reviews whose '색감' value is exactly 0 get target 0.
color_data_1 = (
    db.loc[db['색감'] != 0, ['리뷰']]
    .reset_index(drop=True)
    .assign(target=1)
)
color_data_2 = (
    db.loc[db['색감'] == 0, ['리뷰']]
    .reset_index(drop=True)
    .assign(target=0)
)

# Stack target-1 rows first, then target-0 rows, on a fresh 0..n-1 index.
color_data = pd.concat([color_data_1, color_data_2], axis=0, ignore_index=True)

# Per-column count of missing values (displayed as the cell output).
color_data.isnull().sum()

리뷰        0
target    0
dtype: int64

In [113]:
from mecab import MeCab
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# MeCab part-of-speech tags whose morphemes are kept as tokens.
# Deliberately not named `filter`, so the built-in `filter` stays usable.
POS_FILTER = frozenset([
    'NNG', 'MAG', 'EC', 'VA', 'VA+EF', 'VV+ETM',
    'NNB+JKB', 'VCP+EC', 'VCP', 'MAG+JX', 'VCN',
])


def tokenizer(data, pos_tags=POS_FILTER):
    """Turn each review into a space-joined string of selected morphemes.

    Args:
        data: DataFrame with a "리뷰" column of review texts.
        pos_tags: Iterable of part-of-speech tags to keep. Defaults to
            ``POS_FILTER``.

    Returns:
        A DataFrame with a single ``tokens`` column, one row per input review
        in the same order. A review with no matching morphemes yields ''.
    """
    mecab = MeCab()
    keep = frozenset(pos_tags)

    token_strings = [
        ' '.join(word for word, pos in mecab.pos(text) if pos in keep)
        for text in tqdm(data["리뷰"])
    ]

    return pd.DataFrame({'tokens': token_strings})

In [114]:
# Tokenize every review in color_data; DB_list is a one-column ('tokens')
# DataFrame of space-joined morphemes, row-aligned with color_data.
DB_list=tokenizer(color_data)

100%|██████████| 5135/5135 [00:02<00:00, 1742.58it/s]


In [115]:
# res=tokenizer(sample_data)

In [116]:
# res

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fresh, unfitted TF-IDF vectorizer capped at 100 features.
# NOTE(review): despite the `loaded_` prefix nothing is loaded here; a later
# cell rebinds this name with joblib.load('color_vectorizer.pkl').
# NOTE(review): scikit-learn's default token_pattern keeps only tokens of two
# or more characters, so single-character morphemes are dropped — confirm
# that this is intended.
loaded_vectorizer=TfidfVectorizer(max_features=100)

In [118]:
# Learn the vocabulary and IDF weights from the tokenized reviews.
loaded_vectorizer.fit(DB_list["tokens"])

In [119]:
# Inspect the vocabulary the vectorizer learned.
loaded_vectorizer.get_feature_names_out()

array(['가격', '가슴', '가을', '같이', '거나', '검정', '고민', '괜찮', '구김', '구매', '그냥',
       '기모', '기본', '기장', '길이', '나시', '냄새', '너무', '네요', '느낌', '는데', '니까',
       '니트', '다가', '다고', '단독', '단추', '대비', '두께', '디자인', '라서', '레이어드',
       '마감', '마음', '만족', '많이', '바지', '배송', '별로', '보풀', '부드럽', '부분', '불편',
       '블랙', '비침', '사이즈', '사진', '살짝', '색감', '색깔', '색상', '생각', '세탁', '셔츠',
       '소매', '소재', '속옷', '실밥', '아니', '아서', '아쉽', '아요', '아용', '아주', '약간',
       '어깨', '어도', '어서', '어야', '엄청', '엉덩이', '여름', '예뻐요', '예쁘', '오버', '완전',
       '으려고', '으면', '은데', '이너', '이뻐요', '이쁘', '이상', '인데', '일단', '자체', '재질',
       '정도', '정말', '조금', '주문', '지만', '진짜', '처음', '추천', '컬러', '편하', '화면',
       '후기', '흰색'], dtype=object)

In [120]:
# Rebind `loaded_vectorizer` to the vectorizer persisted in
# color_vectorizer.pkl; from here on it replaces the instance fitted in the
# earlier cell.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you
# trust.
loaded_vectorizer = joblib.load('color_vectorizer.pkl')

# Vectorize the tokenized reviews and convert the sparse result to a dense
# array in a single step.
legacy_tfidf = loaded_vectorizer.transform(DB_list["tokens"]).toarray()

In [121]:
# Preview the dense TF-IDF vector(s) for `res`.
# NOTE(review): in the visible notebook `res` is only assigned in a
# commented-out cell (`# res=tokenizer(sample_data)`), so this raises
# NameError on a fresh kernel — restore that cell or remove this one.
loaded_vectorizer.transform(res["tokens"]).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.32440749, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.50229578, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [122]:
from sklearn.preprocessing import MinMaxScaler


# Load the persisted scaler.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you
# trust.
loaded_scaler = joblib.load('color_scaler.pkl')

# Scale the TF-IDF matrix. The input is recomputed from the vectorizer instead
# of rescaling `legacy_tfidf` in place, so re-running this cell cannot apply
# the scaler twice to already-scaled data.
legacy_tfidf = loaded_scaler.transform(
    loaded_vectorizer.transform(DB_list["tokens"]).toarray()
)


In [123]:
# Run `res` through the same vectorize -> densify -> scale steps as the
# training matrix.
# NOTE(review): in the visible notebook `res` is only assigned in a
# commented-out cell, so this cell fails on a fresh kernel.
sample_tfidf = loaded_scaler.transform(
    loaded_vectorizer.transform(res["tokens"]).toarray()
)

In [124]:
from catboost import CatBoostClassifier


# Baseline CatBoost model trained on every TF-IDF feature; it is used here
# only to rank the features by importance.
catboost_model = CatBoostClassifier(random_state=42, verbose=0)
catboost_model.fit(legacy_tfidf, color_data['target'])

feature_importances = catboost_model.feature_importances_

# Labelled view of the training matrix. It is built here explicitly so the
# notebook does not depend on a `legacy_df` left over from an earlier kernel
# session; column order matches the vectorizer's feature order.
legacy_df = pd.DataFrame(
    legacy_tfidf,
    columns=loaded_vectorizer.get_feature_names_out(),
)
feature_names = legacy_df.columns

# Rank features from most to least important and align values and names.
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

In [125]:

# Keep the column indices of features whose CatBoost importance is at least 1;
# the indices stay in descending-importance order.
important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
# Reduce the training matrix to those columns (in that same order).
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

In [126]:
# Display only the selected feature columns of legacy_df (positional
# selection, ordered by importance rank).
legacy_df.iloc[:,important_feature_indices]

Unnamed: 0,색감,색상,색깔,화면,컬러,사진,이쁘,는데,생각,예쁘,부분,너무,가슴,지만,구매,어서,가격,느낌,흰색,예뻐요
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.337702,0.000000,0.0,0.000000,0.196963,0.000000,0.000000,0.247952,0.216733,0.0,0.000000,0.000000,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.612719,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.296312,0.352671,0.0,0.000000,0.000000,0.000000,0.000000,0.435123,0.000000,0.0,0.000000,0.000000,0.0
3,1.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
4,0.0,0.0,0.586646,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5130,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.181094,0.000000,0.0,0.292424,0.422487,0.332734,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
5131,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.266494,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.554723,0.0
5132,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.199233,0.000000,0.000000,0.000000,0.000000,0.0,0.247107,0.000000,0.0
5133,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.154301,0.183650,0.0,0.000000,0.179991,0.000000,0.414829,0.000000,0.000000,0.0,0.446482,0.000000,0.0


In [127]:
# Persist the selected column indices to color_col.pkl.
joblib.dump(important_feature_indices,'color_col.pkl')

['color_col.pkl']

In [132]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the objective function for optimization
def objective(trial):
    """Optuna objective: train CatBoost and return the F1 score for class 1.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        F1 score (positive label 1) on a fixed 20% hold-out of
        ``legacy_tfidf_filtered``.
    """
    # Search space. suggest_float(..., log=True) replaces the deprecated
    # suggest_loguniform; parameter names and ranges are unchanged.
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 10, log=True)
    }

    catboost_model = CatBoostClassifier(**params, random_state=42, verbose=0)

    # Fixed random_state: every trial is scored on the same split.
    X_train, X_val, y_train, y_val = train_test_split(
        legacy_tfidf_filtered, color_data['target'], test_size=0.2, random_state=42
    )

    # NOTE(review): the validation split also drives early stopping, so the
    # returned score is likely optimistic.
    catboost_model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=10,
        verbose=False,
    )

    y_pred = catboost_model.predict(X_val)

    return f1_score(y_val, y_pred, pos_label=1)

# Create an Optuna study object.
# The sampler is seeded so that re-running the search samples the same
# sequence of hyper-parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the objective function
study.optimize(objective, n_trials=50)

# Get the best parameters and best score
best_params = study.best_params
best_score = study.best_value

print("Best Parameters:", best_params)
print("Best F1 Score for class '1':", best_score)

# Build (but do not yet fit) a classifier with the best parameters; a later
# cell fits it on the full data.
best_catboost_model = CatBoostClassifier(**best_params, random_state=42, verbose=0)



[I 2024-02-19 20:57:29,736] A new study created in memory with name: no-name-ee51d76e-d281-4638-93bf-10ef1fb783f3
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-19 20:57:33,313] Trial 0 finished with value: 0.6567164179104478 and parameters: {'iterations': 248, 'learning_rate': 0.002058586308836775, 'depth': 9, 'l2_leaf_reg': 1.4820688911849427, 'border_count': 190, 'random_strength': 1.4196710112154598, 'bagging_temperature': 0.15539869870914857}. Best is trial 0 with value: 0.6567164179104478.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': 

Best Parameters: {'iterations': 404, 'learning_rate': 0.001035482231368648, 'depth': 6, 'l2_leaf_reg': 1.0025323029815192, 'border_count': 248, 'random_strength': 0.1157517818484876, 'bagging_temperature': 7.930975816811782}
Best F1 Score for class '1': 0.7115384615384616


In [129]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Accuracy-driven variant of the hyper-parameter search.
# Every name defined in this cell is accuracy-specific so that running the
# notebook top to bottom does not overwrite `objective`, `study`,
# `best_params` or `best_catboost_model` from the F1 search cell, which the
# final fit and dump cells use.
def objective_accuracy(trial):
    """Optuna objective: train CatBoost and return validation accuracy.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Accuracy on a fixed 20% hold-out of ``legacy_tfidf_filtered``.
    """
    # Search space. suggest_float(..., log=True) replaces the deprecated
    # suggest_loguniform; parameter names and ranges are unchanged.
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 10, log=True)
    }

    catboost_model = CatBoostClassifier(**params, random_state=42, verbose=0)

    # Fixed random_state: every trial is scored on the same split.
    X_train, X_val, y_train, y_val = train_test_split(
        legacy_tfidf_filtered, color_data['target'], test_size=0.2, random_state=42
    )

    # NOTE(review): the validation split also drives early stopping, so the
    # returned score is likely optimistic.
    catboost_model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=10,
        verbose=False,
    )

    return catboost_model.score(X_val, y_val)

# Seeded sampler so that the search is repeatable.
accuracy_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the objective function
accuracy_study.optimize(objective_accuracy, n_trials=50)

# Get the best parameters and best score
accuracy_best_params = accuracy_study.best_params
accuracy_best_score = accuracy_study.best_value

print("Best Parameters:", accuracy_best_params)
print("Best Accuracy:", accuracy_best_score)

# Build (but do not fit) a classifier with the accuracy-optimal parameters.
accuracy_catboost_model = CatBoostClassifier(**accuracy_best_params, random_state=42, verbose=0)


[I 2024-02-19 20:49:37,100] A new study created in memory with name: no-name-b20064b3-1882-41b0-b23a-57b548fe8b05
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-19 20:49:37,571] Trial 0 finished with value: 0.8442064264849075 and parameters: {'iterations': 606, 'learning_rate': 0.06273858126806568, 'depth': 5, 'l2_leaf_reg': 1.9467886157756802, 'border_count': 183, 'random_strength': 0.20253947668534567, 'bagging_temperature': 4.3554869451172396}. Best is trial 0 with value: 0.8442064264849075.
[I 2024-02-19 20:49:38,402] Trial 1 finished with value: 0.8412852969814996 and parameters: {'iterations': 662, 'learning_rate': 0.05965466741300639, 'depth': 5, 'l2_leaf_reg': 5.894480032444355, 'border_count': 110, 'random_strength': 6.96

Best Parameters: {'iterations': 907, 'learning_rate': 0.001210554295503293, 'depth': 6, 'l2_leaf_reg': 5.39543682380551, 'border_count': 228, 'random_strength': 0.22557339694936, 'bagging_temperature': 0.3639745316199349}
Best Accuracy: 0.8490749756572541


In [133]:
# Fit the tuned classifier on the full filtered feature matrix (no hold-out).
best_catboost_model.fit(legacy_tfidf_filtered, color_data['target'])

<catboost.core.CatBoostClassifier at 0x2a54a1e81d0>

In [134]:
# Persist the fitted classifier to color_catboost_model.pkl.
joblib.dump(best_catboost_model, 'color_catboost_model.pkl')

['color_catboost_model.pkl']