In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

In [2]:
# Load the listing data. Passing lineterminator='\n' explicitly is what
# resolved the read error seen without it (per the original author's note).
# Every identifier column is then cast to str.
df = pd.read_csv('./newly_reduced_bungae_df.csv', lineterminator='\n').astype(
    {
        'id': str,
        'cat_id': str,
        'maincat_id': str,
        'midcat_id': str,
    }
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109248 entries, 0 to 109247
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          109248 non-null  object
 1   name        109248 non-null  object
 2   info        108882 non-null  object
 3   cat_id      109248 non-null  object
 4   tag         99639 non-null   object
 5   maincat_id  109248 non-null  object
 6   midcat_id   109248 non-null  object
dtypes: object(7)
memory usage: 5.8+ MB


In [3]:
# splitting data

def split_train_test_data(df, label_col='cat_id', test_size=.2, random_state=42):
    """Split ``df`` into train/test inputs, labels and the matching row subsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'name' column (used as the model input) and ``label_col``.
    label_col : str, default 'cat_id'
        Column used as the target. The default 'cat_id' is the sub-category;
        pass 'midcat_id' to train on the mid-level category instead.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    train_x, test_x : pandas.Series
        'name' values of each split; they keep the index labels of ``df``.
    train_y, test_y : numpy.ndarray
        Values of ``label_col`` for each split.
    train_df, test_df : pandas.DataFrame
        Full rows of ``df`` for each split, in the same row order as
        ``train_x`` / ``test_x``, with the index reset to 0..n-1.
    """
    # -- train, test split --
    train_x, test_x, train_y, test_y = train_test_split(
        df['name'],
        df[label_col].values,
        random_state=random_state,
        test_size=test_size,
    )

    # -- train_df, test_df split --
    # Selecting by the split's index labels keeps the rows in the same order as
    # train_x / test_x, so positional predictions line up with these frames
    # after the index is reset.
    train_df = df.loc[train_x.index].reset_index(drop=True)
    test_df = df.loc[test_x.index].reset_index(drop=True)

    return train_x, test_x, train_y, test_y, train_df, test_df

train_x, test_x, train_y, test_y, train_df, test_df = split_train_test_data(df)

In [4]:
# TF-IDF over unigrams and bigrams of the item names.
# min_df / max_df are integers here, i.e. absolute document counts.
# NOTE(review): max_df=300 drops every term that occurs in more than 300
# training names — confirm this cut-off is intended for a corpus of this size.
tfidf_vect = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=300,
)

# Learn the vocabulary / IDF weights on the training names only ...
X_train_cnt_vect = tfidf_vect.fit_transform(train_x)

# ... and reuse that fitted vocabulary for the test names.
X_test_cnt_vect = tfidf_vect.transform(test_x)

In [50]:
# -- logistic regression --
# n_jobs is intentionally not passed: scikit-learn ignores it for the
# 'liblinear' solver and only emits a warning when it is set.
# NOTE(review): recent scikit-learn releases reportedly deprecate 'liblinear'
# for multiclass targets — confirm against the installed version.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_cnt_vect, train_y)
proba_preds = lr_clf.predict_proba(X_test_cnt_vect)

# Get top 3 labels for each prediction.
# Row-wise argsort is ascending, so the last three columns are the three most
# probable classes; reversing them puts the most probable class first.
top3_idx = proba_preds.argsort(axis=1)[:, -3:][:, ::-1]

# One array of (up to) three class labels per test row, in test_x row order.
top3_labels = list(lr_clf.classes_[top3_idx])





In [52]:
# Store the top-3 predicted labels of each test row as one space-separated
# string. top3_labels is in test-row order, which is also the row order of
# test_df. Whole-column assignment replaces per-row .loc writes, which are very
# slow on a frame of this size.
test_df['predicted_labels'] = [' '.join(labels) for labels in top3_labels]

# Top-3 hit indicator: 1.0 when the true sub-category ('cat_id') is among the
# three predicted labels, otherwise 0.0 (kept as float).
# To score the mid-level category instead, read 'midcat_id' here.
test_df['accuracy'] = [
    float(label in predicted.split(' '))
    for label, predicted in zip(test_df['cat_id'], test_df['predicted_labels'])
]

test_df.head()

Unnamed: 0,id,name,info,cat_id,tag,maincat_id,midcat_id,predicted_labels,accuracy
0,214623042,로저 비비에 15CM 크리스탈 버클 사틴 벨트 140만원 -65만원,현재 매장 판매가는 [140만원 ]에 판매되는 제품으로 꾸준하게 사랑받는 제품 입니...,400110200,"에르메스여성벨트,프라다여성벨트,루이비통여성벨트,샤넬벨트,구찌여성벨트",400,400110,400110100 400110200 422200500,1.0
1,126501574,디스이즈네버댓지갑,디스이즈네버댓 지갑 지금은 현재 구할 수 없는 제품이고요 사용감 있어서 싸게 ...,430999,"디스이즈네버댓,지갑,반지갑,중지갑,카드지갑",430,430999,310220998 320180998 430100500,0.0
2,216027104,몽벨 트레킹화 여성235,몽벨 여성 트레킹화 트레킹 하실때 착용 하시면 편한 제품입니다 제품 상태는 자연스러...,405400100,"아웃도어,등산,트레킹화",405,405400,405400100 400070500 405200999,1.0
3,200354005,샤넬 롱코트 급처분합니다~!!,샤넬 울 트위드 롱코트 사이즈 38 매장가 1800만원대 제품입니다 가격대...,310300300,"샤넬,샤넬롱코트,샤넬트위드,명품",310,310300,310300300 320300100 310300100,1.0
4,226782760,아페쎄 APC 쁘띠 뉴 스탠다드 진 28사이즈,☺️사이즈 정보 사이즈: 28 허리단면: 38 총장: 104 ☺️실착용 한 적 ...,320120600,"아페쎄,APC,쁘띠뉴스탠다드,청바지,진",320,320120,430200300 400130100 430100200,0.0


In [53]:
# Share of test rows whose true label is among the top-3 predictions
# (the 'accuracy' column holds 1 for a hit and 0 for a miss).
lr_top3_accuracy = test_df['accuracy'].sum() / len(test_df)
print('TF-IDF Vectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(lr_top3_accuracy))

TF-IDF Vectorized Logistic Regression의 예측 정확도는 0.617


In [None]:
# -- lightGBM --
# Train on the TF-IDF features (fit returns the estimator itself) and get the
# per-class probabilities for the test rows.
lgbm_clf = LGBMClassifier(n_jobs=-1).fit(X_train_cnt_vect, train_y)
proba_preds = lgbm_clf.predict_proba(X_test_cnt_vect)

# Get top 3 labels for each prediction: argsort is ascending, so take the last
# three positions and reverse them to list the most probable class first.
top3_labels = [
    lgbm_clf.classes_[row_probs.argsort()[-3:][::-1]]
    for row_probs in proba_preds
]

In [None]:
# Store the top-3 predicted labels of each test row as one space-separated
# string. top3_labels is in test-row order, which is also the row order of
# test_df. Whole-column assignment replaces per-row .loc writes, which are very
# slow on a frame of this size.
# NOTE: this overwrites any 'predicted_labels' / 'accuracy' columns already
# present on test_df from a previously evaluated model.
test_df['predicted_labels'] = [' '.join(labels) for labels in top3_labels]

# LightGBM top-3 hit indicator: 1.0 when the true sub-category ('cat_id') is
# among the three predicted labels, otherwise 0.0 (kept as float).
# To score the mid-level category instead, read 'midcat_id' here.
test_df['accuracy'] = [
    float(label in predicted.split(' '))
    for label, predicted in zip(test_df['cat_id'], test_df['predicted_labels'])
]

test_df.head()

In [None]:
# Share of test rows whose true label is among the LightGBM top-3 predictions.
# The message previously named "Logistic Regression" (copy-paste slip); it now
# names the model that was actually evaluated.
lgbm_top3_accuracy = test_df['accuracy'].sum() / len(test_df)
print('TF-IDF Vectorized LightGBM의 예측 정확도는 {0:.3f}'.format(lgbm_top3_accuracy))