In [None]:
import numpy as np
import random # 시드 고정을 위해
import os # 시드 고정을 위해
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import torch
from torch import Tensor,tensor
from torch.utils.data import DataLoader,Dataset
from torch.nn import Module,Sequential

# Use the GPU when PyTorch reports a usable CUDA device, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator and every visible CUDA device), and configures cuDNN
    for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only processes started after this point pick the value up; the running
    # interpreter fixed its own string-hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than just the current one; this call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may select a different convolution algorithm on each run,
    # which would undo the determinism requested above.
    torch.backends.cudnn.benchmark = False

import pymysql
import pandas as pd

# Load the whole `db` table into a DataFrame.
# The credentials come from the environment so that no secret is stored in
# the notebook: export ZIGZAG_DB_USER and ZIGZAG_DB_PASSWORD before running
# (ZIGZAG_DB_HOST / ZIGZAG_DB_NAME are optional overrides).
conn = pymysql.connect(
    host=os.environ.get('ZIGZAG_DB_HOST', '127.0.0.1'),
    user=os.environ['ZIGZAG_DB_USER'],
    password=os.environ['ZIGZAG_DB_PASSWORD'],
    db=os.environ.get('ZIGZAG_DB_NAME', 'Zigzag'),
)

try:
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        sql_query = 'SELECT * FROM db'
        cursor.execute(sql_query)

        result = cursor.fetchall()
        # Each entry of cursor.description describes one result column;
        # its first item is the column name.
        column_names = [i[0] for i in cursor.description]
finally:
    # Always release the connection, whether or not the query succeeded.
    conn.close()

db = pd.DataFrame(result, columns=column_names)



In [None]:
# Positive class: reviews whose '색감' value is non-zero.
color_data_1 = (
    db.loc[db['색감'] != 0, ['리뷰']]
    .reset_index(drop=True)
    .assign(target=1)
)

# Negative class: reviews whose '색감' value equals zero.
color_data_2 = (
    db.loc[db['색감'] == 0, ['리뷰']]
    .reset_index(drop=True)
    .assign(target=0)
)

# Stack the positives above the negatives under a fresh 0..n-1 index.
color_data = pd.concat([color_data_1, color_data_2], axis=0, ignore_index=True)
# Per-column count of missing values, displayed as the cell output.
color_data.isnull().sum()

In [None]:
# from sklearn.model_selection import train_test_split

# SEED = 42
# color_targets=color_data['target']

# legacy, new, legacy_target, new_target = train_test_split(color_data, color_targets, train_size=0.8, test_size=0.2, random_state=SEED, shuffle=True)

# len(legacy),len(new),len(legacy_target),len(new_target)

In [None]:
from mecab import MeCab
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Part-of-speech tags to keep: tokenizer() and text_tokenizer() discard every
# morpheme whose tag is not in this list.
# NOTE: the name shadows the built-in filter() for the rest of the notebook.
filter = [
    'NNG',
    'MAG',
    'EC',
    'VA',
    'VA+EF',
    'VV+ETM',
    'NNB+JKB',
    'VCP+EC',
    'VCP',
    'MAG+JX',
    'VCN',
]

def tokenizer(data):
    """Reduce every review in ``data["리뷰"]`` to its whitelisted morphemes.

    Each review is POS-tagged with MeCab. Only the surface forms whose tag
    appears in the module-level ``filter`` list are kept, and they are joined
    with single spaces.

    Args:
        data: Object whose ``"리뷰"`` entry iterates over the review texts
            (the notebook passes a DataFrame).

    Returns:
        A DataFrame with a single ``tokens`` column holding one space-joined
        string per review, in input order.
    """
    tagger = MeCab()

    joined_reviews = [
        ' '.join(surface for surface, tag in tagger.pos(review) if tag in filter)
        for review in tqdm(data["리뷰"])
    ]

    return pd.DataFrame({'tokens': joined_reviews})

In [None]:
def text_tokenizer(data):
    """Tokenize raw review strings and keep only the whitelisted morphemes.

    Each text is POS-tagged with MeCab. Only the surface forms whose tag
    appears in the module-level ``filter`` list are kept, and they are joined
    with single spaces.

    Args:
        data: Iterable of review texts. A NumPy array of any shape is
            flattened first, so column-shaped ``(n, 1)`` input produces one
            row per review instead of handing whole sub-arrays to the tagger.

    Returns:
        A DataFrame with a single ``tokens`` column holding one space-joined
        string per input text, in input order.
    """
    tagger = MeCab()

    # Iterating a multi-dimensional array yields arrays, not strings.
    if isinstance(data, np.ndarray):
        data = data.ravel()

    rows = []
    for text in tqdm(data):
        morphemes = tagger.pos(text)
        rows.append(' '.join(word for word, tag in morphemes if tag in filter))

    return pd.DataFrame({'tokens': rows})

In [None]:
# One review per element. The array is 1-D so that iterating over it yields
# the review strings themselves rather than one-element sub-arrays.
sample_data = np.array([
    '이옷 너무 좋아요 특히 색감이 화면에 나온거랑 완전 똑같아요!',
    '보들보들 짱짱 여름에도 좋을듯!!',
])


In [None]:

# Try the tokenizer on the hand-written sample reviews; the returned
# DataFrame is the cell's displayed output.
text_tokenizer(sample_data)

In [None]:
# train_list=tokenizer(legacy)
# test_list=tokenizer(new)

In [None]:
# Tokenize every review in color_data. DB_list has a single 'tokens' column
# of space-joined morphemes, one row per review in the same order.
DB_list=tokenizer(color_data)

In [None]:
# legacy = pd.DataFrame({'tokens': train_list})
# new = pd.DataFrame({'tokens': test_list})

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer(max_features=100)
# legacy_tfidf = vectorizer.fit_transform(legacy["tokens"])

# new_tfidf=vectorizer.transform(new["tokens"])

# legacy_tfidf=legacy_tfidf.toarray()
# new_tfidf=new_tfidf.toarray()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary capped at 100 terms on the tokenized reviews and
# turn the resulting sparse matrix into a dense NumPy array.
vectorizer = TfidfVectorizer(max_features=100)
legacy_tfidf = vectorizer.fit_transform(DB_list["tokens"]).toarray()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum, then rescale the matrix with them.
scaler = MinMaxScaler().fit(legacy_tfidf)
legacy_tfidf = scaler.transform(legacy_tfidf)


In [None]:
# The scaled feature matrix as a DataFrame, labelled with the vocabulary
# terms learned by the vectorizer.
legacy_df = pd.DataFrame(
    data=legacy_tfidf,
    columns=vectorizer.get_feature_names_out(),
)

In [None]:

important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

In [None]:
legacy_df.iloc[:,important_feature_indices]

In [None]:
from catboost import CatBoostClassifier
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] ='Malgun Gothic'
plt.rcParams['axes.unicode_minus'] =False


catboost_model = CatBoostClassifier(random_state=42, verbose=0)


catboost_model.fit(legacy_tfidf, color_data['target'])


feature_importances = catboost_model.feature_importances_

feature_names = legacy_df.columns


sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# Plot feature importances
plt.figure(figsize=(15, 15))
plt.barh(range(len(sorted_feature_importances)), sorted_feature_importances, align='center', alpha=0.7)
plt.yticks(range(len(sorted_feature_importances)), sorted_feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances (CatBoost)')
plt.show()


In [None]:

# Keep only the columns whose importance score is at least 1.
important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

# `SEED` and `legacy_target` were only ever assigned in commented-out code,
# so the seed is defined here and the labels are taken from color_data, whose
# rows are in the same order as the rows of legacy_tfidf.
SEED = 42
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    legacy_tfidf_filtered, color_data['target'], test_size=0.2, random_state=SEED)


catboost_model_filtered = CatBoostClassifier(random_state=42, verbose=0)
catboost_model_filtered.fit(X_train_filtered, y_train_filtered)


# Mean accuracy on the 20% of rows held out from training.
accuracy_filtered = catboost_model_filtered.score(X_test_filtered, y_test_filtered)
print("Accuracy on the filtered test set:", accuracy_filtered)


In [None]:

# Restrict the feature matrix to the ten highest-ranked features.
top_10_indices = sorted_indices[:10]

legacy_tfidf_filtered = legacy_tfidf[:, top_10_indices]

# `SEED` and `legacy_target` were only ever assigned in commented-out code,
# so the seed is defined here and the labels are taken from color_data, whose
# rows are in the same order as the rows of legacy_tfidf.
SEED = 42
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    legacy_tfidf_filtered, color_data['target'], test_size=0.2, random_state=SEED)

catboost_model_filtered = CatBoostClassifier(random_state=42, verbose=0)
catboost_model_filtered.fit(X_train_filtered, y_train_filtered)


# Mean accuracy on the 20% of rows held out from training.
accuracy_filtered = catboost_model_filtered.score(X_test_filtered, y_test_filtered)
print("Accuracy on the filtered test set using top 10 features:", accuracy_filtered)


In [None]:
pred=catboost_model.predict(new_tfidf)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(pred,new_target)

In [None]:
# DB_list already holds tokenizer(color_data); copy it instead of running the
# MeCab pass over the whole corpus a second time.
train_list = DB_list.copy()

In [None]:
from catboost import CatBoostClassifier
import numpy as np
import matplotlib.pyplot as plt

# Font used for the Korean tick labels, plus the minus-sign rendering option.
plt.rcParams.update({'font.family': 'Malgun Gothic', 'axes.unicode_minus': False})

# Fit the full-feature model on every review.
catboost_model = CatBoostClassifier(random_state=42, verbose=0)
catboost_model.fit(legacy_tfidf, color_data['target'])

feature_importances = catboost_model.feature_importances_
feature_names = legacy_df.columns

# argsort is ascending; the reversed view ranks features from most to least
# important.
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# Horizontal bar chart with one bar per feature, in ranked order.
bar_positions = range(len(sorted_feature_importances))
plt.figure(figsize=(15, 15))
plt.barh(bar_positions, sorted_feature_importances, align='center', alpha=0.7)
plt.yticks(bar_positions, sorted_feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances (CatBoost)')
plt.show()


In [None]:

# Keep only the columns whose importance score is at least 1.
important_feature_indices = sorted_indices[sorted_feature_importances >= 1]
legacy_tfidf_filtered = legacy_tfidf[:, important_feature_indices]

# `SEED` was only ever assigned in commented-out code; define it here so the
# split is reproducible on a fresh kernel.
SEED = 42
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    legacy_tfidf_filtered, color_data['target'], test_size=0.2, random_state=SEED)


catboost_model_filtered = CatBoostClassifier(random_state=42, verbose=0)
catboost_model_filtered.fit(X_train_filtered, y_train_filtered)


# Mean accuracy on the 20% of rows held out from training.
accuracy_filtered = catboost_model_filtered.score(X_test_filtered, y_test_filtered)
print("Accuracy on the filtered test set:", accuracy_filtered)


In [None]:
# Classify one hand-written review with the filtered model by sending it
# through the same steps as the training rows:
# tokenize -> TF-IDF -> min-max scaling -> feature selection.
sample=pd.DataFrame(['색감이너무 좋아요 화면이랑 똑같아요!'],columns=['리뷰'])

sample_data=tokenizer(sample)

# tokenizer() already returns space-joined token strings; joining them again
# would insert a space between every character and destroy the tokens.
sample_tfidf = vectorizer.transform(sample_data['tokens']).toarray()
sample_tfidf = scaler.transform(sample_tfidf)

# The model was fitted on the important_feature_indices columns only.
catboost_model_filtered.predict(sample_tfidf[:, important_feature_indices])