In [40]:
import numpy as np
import random # 시드 고정을 위해
import os # 시드 고정을 위해
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import torch
from torch import Tensor,tensor
from torch.utils.data import DataLoader,Dataset
from torch.nn import Module,Sequential

device= 'cuda' if torch.cuda.is_available() else 'cpu'

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

import pymysql
import pandas as pd

conn = pymysql.connect(host='127.0.0.1', user='wodus1530',
                       password='dkvms7255', db='Zigzag')


cursor = conn.cursor()


sql_query = 'SELECT * FROM db'
cursor.execute(sql_query)

result = cursor.fetchall()
column_names = [i[0] for i in cursor.description]


db = pd.DataFrame(result, columns=column_names)

cursor.close()
conn.close()



In [41]:
quality_data_1 = pd.DataFrame(db[db['퀄리티'] != 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
quality_data_1['target'] = 1

quality_data_2 = pd.DataFrame(db[db['퀄리티'] == 0]['리뷰'], columns=['리뷰']).reset_index(drop=True)
quality_data_2['target'] = 0

quality_data = pd.concat([quality_data_1, quality_data_2], axis=0).reset_index(drop=True)
quality_data
quality_data.isnull().sum()

리뷰        0
target    0
dtype: int64

In [42]:
from mecab import MeCab
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

filter=['NNG','MAG','EC','VA','VA+EF','VV+ETM','NNB+JKB','VCP+EC','VCP','MAG+JX','VCN']

def tokenizer(data):
    tokenizer = MeCab()
    
    list = []
    
    for text in tqdm(data["리뷰"]):
        prah = []  
        lst = tokenizer.pos(text)  
        for word, pos in lst:
            if pos in filter:
                prah.append(word)  
        list.append(' '.join(prah))  
    
    return pd.DataFrame({'tokens':list})

In [43]:
DB_list=tokenizer(quality_data)


100%|██████████| 5135/5135 [00:03<00:00, 1566.32it/s]


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=100)
legacy_tfidf = vectorizer.fit_transform(DB_list["tokens"])

joblib.dump(vectorizer,'quality_vectorizer.pkl')

legacy_tfidf=legacy_tfidf.toarray()


In [45]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

legacy_tfidf = scaler.fit_transform(legacy_tfidf)

joblib.dump(scaler,'quality_scaler.pkl')


['quality_scaler.pkl']

In [46]:
from catboost import CatBoostClassifier


catboost_model = CatBoostClassifier(random_state=42, verbose=0)


catboost_model.fit(legacy_tfidf, quality_data['target'])


feature_importances = catboost_model.feature_importances_

feature_names = vectorizer.get_feature_names_out()


sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

In [47]:

important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

In [48]:
joblib.dump(important_feature_indices,'quality_col.pkl')

['quality_col.pkl']

In [49]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the objective function for optimization
def objective(trial):
    # Define the search space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
    }
    
    # Initialize CatBoostClassifier with suggested parameters
    catboost_model = CatBoostClassifier(**params, random_state=42, verbose=0)
    
    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(legacy_tfidf_filtered, quality_data['target'], test_size=0.2, random_state=42)
    
    # Fit the model
    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=10, verbose=False)
    
    # Get validation predictions
    y_pred = catboost_model.predict(X_val)
    
    # Calculate precision, recall, and F1 score for class '1'
    precision = precision_score(y_val, y_pred, pos_label=1)
    recall = recall_score(y_val, y_pred, pos_label=1)
    f1 = f1_score(y_val, y_pred, pos_label=1)
    
    return f1

# Create an Optuna study object
study = optuna.create_study(direction='maximize')

# Optimize the objective function
study.optimize(objective, n_trials=50)

# Get the best parameters and best score
best_params = study.best_params
best_score = study.best_value

print("Best Parameters:", best_params)
print("Best F1 Score for class '1':", best_score)

# Train CatBoostClassifier with the best parameters
best_catboost_model = CatBoostClassifier(**best_params, random_state=42, verbose=0)


[I 2024-02-19 19:37:41,146] A new study created in memory with name: no-name-b2b7b795-f7b1-494f-b6f3-0b413a56bee4


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-19 19:37:41,828] Trial 0 finished with value: 0.40562248995983935 and parameters: {'iterations': 994, 'learning_rate': 0.06196157137794761, 'depth': 6, 'l2_leaf_reg': 1.5749067704957624, 'border_count': 227, 'random_strength': 0.11705779512886988, 'bagging_temperature': 0.3464477161814357}. Best is trial 0 with value: 0.40562248995983935.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'random_strength': trial.suggest_loguniform('random_strength', 0.1, 10),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10)
[I 2024-02-19 19:37:49,945] Trial 1 finished with value:

Best Parameters: {'iterations': 245, 'learning_rate': 0.09242823715325771, 'depth': 9, 'l2_leaf_reg': 5.989911341953647, 'border_count': 92, 'random_strength': 0.21660721751229697, 'bagging_temperature': 4.785641301597593}
Best F1 Score for class '1': 0.4378698224852071


In [50]:
best_catboost_model.fit(legacy_tfidf_filtered, quality_data['target'])
joblib.dump(best_catboost_model, 'quality_catboost_model.pkl')

['quality_catboost_model.pkl']