In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from matplotlib import pyplot

  from pandas import MultiIndex, Int64Index


In [2]:
# Import datas
# 2144 세탁기
# 2204 노트북
df2144 = pd.read_csv('../data/220117.2144.csv', encoding = 'UTF-8')
df2204 = pd.read_csv('../data/220117.2204.csv', encoding = 'UTF-8')

df2144.describe(), df2144.tail()

(            label
 count  630.000000
 mean     0.855556
 std      0.351819
 min      0.000000
 25%      1.000000
 50%      1.000000
 75%      1.000000
 max      1.000000,
        main   mid        sub                    name      price    ship  \
 2287  가전/TV  생활가전  세탁기+건조기세트  LG전자 오브제컬렉션 워시타워 W16EG  3,175,840  (무료배송)   
 2288  가전/TV  생활가전  세탁기+건조기세트  LG전자 오브제컬렉션 워시타워 W16EG  3,175,850  (무료배송)   
 2289  가전/TV  생활가전  세탁기+건조기세트  LG전자 오브제컬렉션 워시타워 W16EG  3,177,510  (무료배송)   
 2290  가전/TV  생활가전  세탁기+건조기세트  LG전자 오브제컬렉션 워시타워 W16EG  3,188,270  (무료배송)   
 2291  가전/TV  생활가전  세탁기+건조기세트  LG전자 오브제컬렉션 워시타워 W16EG  3,203,900  (무료배송)   
 
      platform                                               link  label  
 2287  신세계TV쇼핑  http://prod.danawa.com/bridge/loadingBridge.ht...    1.0  
 2288      한샘몰  http://prod.danawa.com/bridge/loadingBridge.ht...    1.0  
 2289       G9  http://prod.danawa.com/bridge/loadingBridge.ht...    1.0  
 2290       G9  http://prod.danawa.com/bridge/loadingBridge.ht...    1

In [3]:
# NaN의 갯수는 세지 않는구나.
df2144['label'].unique(), df2144['label'].value_counts(), df2144['label'].count()

(array([ 0.,  1., nan]),
 1.0    539
 0.0     91
 Name: label, dtype: int64,
 630)

In [4]:
# axis: NaN 데이터에 대해서 행 or 열을 Drop할지 정하는 변수
# how: any or all 옵션 존재, any의 경우 하나라도 없을 경우 해당 행 또는 열을 드랍
concat_list = []
fst_df = df2144.dropna(axis='index', how='any')
sec_df = df2204.dropna(axis='index', how='any')
fst_df['label'].value_counts()

concat_list.append(fst_df)
concat_list.append(sec_df)

In [5]:
main_df = pd.concat(concat_list, axis=0, ignore_index=True)
main_df['label'].value_counts()

1.0    1166
0.0     104
Name: label, dtype: int64

In [6]:
main_df.describe()

Unnamed: 0,label
count,1270.0
mean,0.91811
std,0.274305
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [7]:
main_df.head()

Unnamed: 0,main,mid,sub,name,price,ship,platform,link,label
0,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,(무료배송),옥션,http://prod.danawa.com/bridge/loadingBridge.ht...,0.0
1,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,(무료배송),옥션,http://prod.danawa.com/bridge/loadingBridge.ht...,0.0
2,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2277440,(무료배송),11번가,http://prod.danawa.com/bridge/loadingBridge.ht...,1.0
3,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2291960,(무료배송),11번가,http://prod.danawa.com/bridge/loadingBridge.ht...,0.0
4,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2311700,(무료배송),11번가,http://prod.danawa.com/bridge/loadingBridge.ht...,0.0


In [8]:
main_df = main_df.drop('link', axis=1)
main_df.head()

Unnamed: 0,main,mid,sub,name,price,ship,platform,label
0,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,(무료배송),옥션,0.0
1,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,(무료배송),옥션,0.0
2,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2277440,(무료배송),11번가,1.0
3,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2291960,(무료배송),11번가,0.0
4,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2311700,(무료배송),11번가,0.0


In [9]:
main_df['ship'].value_counts()

(무료배송)           1249
(배송비 3,000원)       13
(배송비 5,000원)        3
(배송비 30,000원)       2
(배송비 2,500원)        2
(유/무료배송)            1
Name: ship, dtype: int64

In [10]:
# ship price numeric data type으로 변경
test = main_df
#test['ship'].value_counts()
# regex로 replace가 동작했기 때문에 defalut value가 동작하지 않음
# null 값을 변환하는 추가 과정이 필요
test = test['ship'].replace( value='0', regex='[^0-9]' )
test = test.replace('', '0')
# test.value_counts()

change_df = main_df
change_df['ship'] = test
main_df = change_df
main_df['ship'].value_counts()

000000           1249
000003000000       13
000005000000        3
0000030000000       2
000002050000        2
00000000            1
Name: ship, dtype: int64

In [11]:
price_series = main_df['price']
main_df.head()

Unnamed: 0,main,mid,sub,name,price,ship,platform,label
0,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,0,옥션,0.0
1,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,0,옥션,0.0
2,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2277440,0,11번가,1.0
3,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2291960,0,11번가,0.0
4,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2311700,0,11번가,0.0


In [12]:
price_series = price_series.replace('[\$,]', '', regex=True)
# price_series.head()
main_df['price'] = price_series
main_df.dtypes

main         object
mid          object
sub          object
name         object
price        object
ship         object
platform     object
label       float64
dtype: object

In [13]:
test = main_df
#tf.keras.utils.to_categorical(test['platform'], int(len(test['platform'].unique())))

# 원했던 one-hot encoding
plt_onehot = pd.get_dummies(test['platform'])

In [14]:
# axis가 없는 경우 그냥 하나로
test = pd.concat([test, plt_onehot], axis=1)
test = test.drop('platform', axis=1)

In [15]:
test.head()

Unnamed: 0,main,mid,sub,name,price,ship,label,11번가,AK몰,BestPC,...,피씨로드,피온,하이마트 쇼핑몰,하프클럽,한샘몰,한성컴퓨터,해밀컴,현대Hmall,홈&쇼핑,홈플러스
0,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2272610,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2277440,0,1.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2291960,0,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,가전/TV,생활가전,세탁기+건조기세트,LG전자 트롬 워시타워 W17WTA,2311700,0,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
test = test.drop('main', axis=1)
test = test.drop('mid', axis=1)

In [17]:
sub_onehot = pd.get_dummies(test['sub'])
name_onehot = pd.get_dummies(test['name'])

In [18]:
pre_onehot = pd.concat([sub_onehot, name_onehot, test], axis=1)
pre_onehot = pre_onehot.drop('sub', axis=1)
pre_onehot = pre_onehot.drop('name', axis=1)
pre_onehot.dtypes

노트북 전체                                                     uint8
세탁기+건조기세트                                                  uint8
전원기기/어댑터                                                   uint8
APPLE 2020 맥북에어 MGN63KH/A (8GB, SSD 256GB)                 uint8
APPLE 2021 맥북프로14 MKGP3KH/A (M1 PRO 8core, 16GB, 512GB)    uint8
                                                           ...  
한성컴퓨터                                                      uint8
해밀컴                                                        uint8
현대Hmall                                                    uint8
홈&쇼핑                                                       uint8
홈플러스                                                       uint8
Length: 111, dtype: object

In [19]:
test = pre_onehot
num_price = pd.to_numeric(test['price'])
num_ship = pd.to_numeric(test['ship'])
num_label = pd.to_numeric(test['label'])

test = test.drop('price', axis=1)
test = test.drop('ship', axis=1)
test = test.drop('label', axis=1)

test = pd.concat([test, num_price, num_ship, num_label], axis=1)
#test.describe()

### Modeling stage
전에 사용했던 XGboosting model을 사용

df를 모델링의 중점이 되는 데이터프레임으로 할당

In [20]:
df = test

In [21]:
# y: index for data sizes
y = [idx for idx in range(0, df.shape[0])]
# Test-Set 분리
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.5, random_state=10)
x_train.shape, x_test.shape

((635, 111), (635, 111))

In [22]:
x = x_train.drop('label', axis=1)
x_vali = x_test.drop('label', axis=1)

y = x_train['label']
y_vali = x_test['label']

In [23]:
# K-fold로 10개의 generator 생성
kf = KFold(n_splits =  10, shuffle = True, random_state = 96)
# Make Model
xgb = XGBRegressor(random_state = 256)

In [24]:
rmse_list = []
xgb_pred = np.zeros((x_train.shape[0]))

# K(10)-fold validation Loop
for tr_idx, val_idx in kf.split(x, y) :
    tr_x, tr_y = x.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = x.iloc[val_idx], y.iloc[val_idx]

    xgb.fit(tr_x, tr_y)

    pred = np.expm1([0 if x < 0 else x for x in xgb.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(x_vali)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))


    rmse_list.append(rmse)
    xgb_pred += (sub_pred / 10)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [25]:
print(f'MSE: {np.mean(rmse_list)}')

MSE: 0.742183896879351
