In [23]:
import numpy as np
import random # 시드 고정을 위해
import os # 시드 고정을 위해
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import torch
from torch import Tensor,tensor
from torch.utils.data import DataLoader,Dataset
from torch.nn import Module,Sequential

# device= 'cuda' if torch.cuda.is_available() else 'cpu'

# def reset_seeds(seed):
#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True

# import pymysql
# import pandas as pd

# conn = pymysql.connect(host='127.0.0.1', user='wodus1530',
#                        password='dkvms7255', db='Zigzag')


# cursor = conn.cursor()


# sql_query = 'SELECT * FROM db'
# cursor.execute(sql_query)

# result = cursor.fetchall()
# column_names = [i[0] for i in cursor.description]


# db = pd.DataFrame(result, columns=column_names)

# cursor.close()
# conn.close()


db = pd.read_excel(r'D:\Project\financial_marketing\Final\Zigzag\Zigzag_streamlit\data\DB.xlsx', index_col=False)
db.fillna(0,inplace=True)
db['리뷰']=db['리뷰'].str.replace('\n',' ')
main_features=['색감','핏','재질','퀄리티','제품상태','가격','두께']

In [24]:
color_data_1 = pd.DataFrame(db[db['색감'] != 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
color_data_1['target'] = 1

color_data_2 = pd.DataFrame(db[db['색감'] == 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
color_data_2['target'] = 0

color_data = pd.concat([color_data_1, color_data_2], axis=0).reset_index(drop=True)
color_data.isnull().sum()

리뷰        0
target    0
dtype: int64

In [25]:
from mecab import MeCab
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

filter=['NNG','MAG','EC','VA','VA+EF','VV+ETM','NNB+JKB','VCP+EC','VCP','MAG+JX','VCN']

def tokenizer(data):
    tokenizer = MeCab()
    
    list = []
    
    for text in tqdm(data["리뷰"]):
        prah = []  
        lst = tokenizer.pos(text)  
        for word, pos in lst:
            if pos in filter:
                prah.append(word)  
        list.append(' '.join(prah))  
    
    return pd.DataFrame({'tokens':list})
DB_list=tokenizer(color_data)

100%|██████████| 7193/7193 [00:06<00:00, 1064.80it/s]


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=100)
vectorizer=vectorizer.fit(DB_list["tokens"])
joblib.dump(vectorizer,'color_vectorizer.pkl')

legacy_tfidf=vectorizer.transform(DB_list["tokens"])
legacy_tfidf=legacy_tfidf.toarray()

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler = scaler.fit(legacy_tfidf)
joblib.dump(scaler,'color_scaler.pkl')

legacy_tfidf=scaler.transform(legacy_tfidf)

In [41]:
from catboost import CatBoostClassifier


catboost_model = CatBoostClassifier(random_state=42, verbose=0)


catboost_model.fit(legacy_tfidf, color_data['target'])


feature_importances = catboost_model.feature_importances_

feature_names = vectorizer.get_feature_names_out()


sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

In [42]:

important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

In [45]:
joblib.dump(important_feature_indices,'color_col.pkl')

['color_col.pkl']

In [48]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the objective function for optimization
def objective(trial):
    # Define the search space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
    }
    
    # Initialize CatBoostClassifier with suggested parameters
    catboost_model = CatBoostClassifier(**params, random_state=42, verbose=0)
    
    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(legacy_tfidf_filtered, color_data['target'], test_size=0.2, random_state=42)
    
    # Fit the model
    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=10, verbose=False)
    
    # Get validation predictions
    y_pred = catboost_model.predict(X_val)
    
    # Calculate precision, recall, and F1 score for class '1'
    precision = precision_score(y_val, y_pred, pos_label=1)
    recall = recall_score(y_val, y_pred, pos_label=1)
    f1 = f1_score(y_val, y_pred, pos_label=1)
    
    return f1

# Create an Optuna study object
study = optuna.create_study(direction='maximize')

# Optimize the objective function
study.optimize(objective, n_trials=50)

# Get the best parameters and best score
best_params = study.best_params
best_score = study.best_value

print("Best Parameters:", best_params)
print("Best F1 Score for class '1':", best_score)

# Train CatBoostClassifier with the best parameters
best_catboost_model = CatBoostClassifier(**best_params, random_state=42, verbose=0)



[I 2024-02-22 14:02:03,517] A new study created in memory with name: no-name-fa165efc-91a0-484b-823a-3348fd60adf1


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-22 14:02:05,996] Trial 0 finished with value: 0.6639676113360324 and parameters: {'iterations': 195, 'learning_rate': 0.0369645147313006, 'depth': 8, 'l2_leaf_reg': 6.2749593987081065, 'border_count': 214, 'random_strength': 1.3611353386836942, 'bagging_temperature': 0.9332342682706218}. Best is trial 0 with value: 0.6639676113360324.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-22 14:02:13,180] Trial 1 finished with value: 0.6

Best Parameters: {'iterations': 179, 'learning_rate': 0.0013007800585719874, 'depth': 7, 'l2_leaf_reg': 5.84875883482354, 'border_count': 157, 'random_strength': 0.18268172507454628, 'bagging_temperature': 1.199676699443126}
Best F1 Score for class '1': 0.6948717948717948


In [49]:
best_catboost_model.fit(legacy_tfidf_filtered, color_data['target'])

<catboost.core.CatBoostClassifier at 0x1e466471c10>

In [51]:
joblib.dump(best_catboost_model, 'color_catboost_model.pkl')

['color_catboost_model.pkl']