# mercari 技術課題

In [44]:
import os
import numpy as np
import pandas as pd
from datetime import datetime as dt
import scipy
import sklearn.model_selection
from sklearn.metrics import f1_score
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials

In [2]:
# Show the working directory: the relative "./datasets/..." paths used when loading the CSVs resolve against it.
os.getcwd()

'/Users/tamurakouichirou/Documents/python/lecture/mercari'

In [3]:
# Load the train / test splits (paths are relative to the working directory)
# and report the (rows, columns) of each.
df_train = pd.read_csv("./datasets/train.csv")
df_test = pd.read_csv("./datasets/test.csv")
for frame in (df_train, df_test):
    print(frame.shape)

(700, 9)
(300, 8)


In [4]:
# Preview the first rows of the raw training data.
df_train.head()

Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
0,445,0,1164,1162,fff,Fair,7,2017-02-01 16:11:18.978516,3ca192bd7558780793444f73366c58d60c9d7775
1,481,0,1005,1004,fff,Fair,3,2017-02-02 14:42:51.693295,fbaacb960902382e4f6c96f2d8f225c24eecadb4
2,327,2,944,944,aaa,Fair,5,2017-02-03 01:55:53.406374,785a7925363bf133a7c5413c563f331c5e02cc69
3,823,0,1068,1067,fff,Fair,7,2017-01-30 23:34:02.268603,f8997252c6e5ae3d950b736e1a81160a2e937a7f
4,718,3,1407,1407,ddd,Good,15,2017-02-03 13:38:37.845010,ca59bcd3da4daac52f3fcfdc0ab963f65cf421bc


In [5]:
# Preview the first rows of the raw test data.
df_test.head()

Unnamed: 0,item_id,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
0,101,1006,1006,aaa,Like New,4,2017-02-01 13:13:59.048372,dd01903921ea24941c26a48f2cec24e0bb0e8cc7
1,499,1149,1147,fff,Fair,3,2017-02-01 08:19:21.532519,784e9240155834852dff458a730cceb50229df32
2,393,1044,1042,ddd,Like New,21,2017-02-02 13:27:40.620084,7c9fe6831f52e30e0ede4f8c54fd9bba673e8d8b
3,215,1547,1545,kkk,Like New,20,2017-02-02 20:59:30.470107,54c1792c99a96a96a2881600f0cce1d81061e8b8
4,578,1015,1014,ddd,Like New,5,2017-01-30 17:43:31.962058,081be7c370bf9e7b4c6e696276c1b2d57623b26b


## purpose: category_classを当てる

In [6]:
# How many distinct category_class values are there, and how many rows does each have?
df_train["category_class"].value_counts()

0    226
2    157
1    149
3     86
4     82
Name: category_class, dtype: int64

category_class
クラスラベル
0: Men
1: Home
2: Women
3: Electronics
4: Kids

数には多少のばらつきがある

#### column分析

In [7]:
# item_id: print max, mean and number of distinct ids, each for train then test.
item_id_summaries = (
    lambda ids: ids.max(),
    lambda ids: ids.mean(),
    lambda ids: len(ids.unique()),
)
for summarise in item_id_summaries:
    for frame in (df_train, df_test):
        print(summarise(frame["item_id"]))

1099
1098
598.4085714285715
602.0466666666666
700
300


平均はほぼ同じで，最大値もほぼ同じ(train: 1099, test: 1098)。idはもともと約1000個あり，train/testはそこからランダムに抽出された可能性が高い

In [8]:
# price: add the gap between the sold price and the listed price as a feature.
for frame in (df_train, df_test):
    frame["price_diff"] = frame["sold_price"] - frame["price"]

In [9]:
# area_name
# Name of the area the seller listed the item from
df_train["area_name"].value_counts()

ggg    78
ccc    76
kkk    74
ddd    74
jjj    71
bbb    68
aaa    66
fff    66
eee    65
hhh    62
Name: area_name, dtype: int64

In [10]:
# Convert area_name to dummy (one-hot) variables.
# - astype(int): keep the indicators numeric 0/1; pandas >= 2.0 returns bool columns,
#   which would make the mixed int/bool feature matrix fall back to object dtype.
# - reindex: give the test frame exactly the train frame's dummy columns, in the same
#   order, so both share one feature layout even if an area is missing from one split
#   (areas seen only in test are dropped; a model trained on train could not use them).
area_dummies_train = pd.get_dummies(df_train["area_name"]).astype(int)
area_dummies_test = (
    pd.get_dummies(df_test["area_name"])
    .reindex(columns=area_dummies_train.columns, fill_value=0)
    .astype(int)
)
df_train = pd.concat([df_train, area_dummies_train], axis=1)
df_test = pd.concat([df_test, area_dummies_test], axis=1)
del df_train["area_name"]
del df_test["area_name"]

In [11]:
# condition
# Count how many training rows fall under each condition value.
df_train["condition"].value_counts()

Good        272
Fair        218
Like New    210
Name: condition, dtype: int64

In [12]:
# Convert condition to dummy (one-hot) variables.
# - astype(int): keep the indicators numeric 0/1; pandas >= 2.0 returns bool columns,
#   which would make the mixed int/bool feature matrix fall back to object dtype.
# - reindex: give the test frame exactly the train frame's dummy columns, in the same
#   order, so both share one feature layout even if a condition is missing from one split.
condition_dummies_train = pd.get_dummies(df_train["condition"]).astype(int)
condition_dummies_test = (
    pd.get_dummies(df_test["condition"])
    .reindex(columns=condition_dummies_train.columns, fill_value=0)
    .astype(int)
)
df_train = pd.concat([df_train, condition_dummies_train], axis=1)
df_test = pd.concat([df_test, condition_dummies_test], axis=1)
del df_train["condition"]
del df_test["condition"]

In [13]:
# listing_at: show the latest and earliest listing timestamps in the training data.
listing_times = df_train["listing_at"]
print("max of listign_at: %s" % listing_times.max())
print("min of listign_at: %s" % listing_times.min())

max of listign_at: 2017-02-05 05:59:49.516616
min of listign_at: 2017-01-27 15:10:14.057584


約一週間分のデータ。時間帯以外のデータはあまり意味がなさそう

時間帯(0〜23時)をそのまま整数の特徴量として入れる。本来は23時と0時が隣り合うという周期性を考慮すべきだが，データセットが小さいので今回は省略する

In [14]:
# Peek at the first 16 characters of a timestamp string ("YYYY-MM-DD HH:MM"),
# the prefix that the hour-extraction step parses.
df_train["listing_at"].max()[0:16]

'2017-02-05 05:59'

In [15]:
# Replace each listing timestamp string with its hour of day.
# Same parsing as before (first 16 characters, "%Y-%m-%d %H:%M"), but vectorised through
# pandas instead of a row-wise strptime, so the cell no longer depends on the `dt`
# datetime alias (a later cell rebinds `dt` to a classifier).
# `.str` fails loudly if the column has already been converted, rather than silently
# re-parsing integers.
df_train["listing_at"] = pd.to_datetime(
    df_train["listing_at"].str[:16], format="%Y-%m-%d %H:%M"
).dt.hour
df_test["listing_at"] = pd.to_datetime(
    df_test["listing_at"].str[:16], format="%Y-%m-%d %H:%M"
).dt.hour

In [16]:
# Check the training frame after the dummy encoding and the hour extraction.
df_train.head()

Unnamed: 0,item_id,category_class,sold_price,price,size,listing_at,item_tag_hash,price_diff,aaa,bbb,...,ddd,eee,fff,ggg,hhh,jjj,kkk,Fair,Good,Like New
0,445,0,1164,1162,7,16,3ca192bd7558780793444f73366c58d60c9d7775,2,0,0,...,0,0,1,0,0,0,0,1,0,0
1,481,0,1005,1004,3,14,fbaacb960902382e4f6c96f2d8f225c24eecadb4,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,327,2,944,944,5,1,785a7925363bf133a7c5413c563f331c5e02cc69,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,823,0,1068,1067,7,23,f8997252c6e5ae3d950b736e1a81160a2e937a7f,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,718,3,1407,1407,15,13,ca59bcd3da4daac52f3fcfdc0ab963f65cf421bc,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [17]:
# Check the test frame after the dummy encoding and the hour extraction.
df_test.head()

Unnamed: 0,item_id,sold_price,price,size,listing_at,item_tag_hash,price_diff,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,jjj,kkk,Fair,Good,Like New
0,101,1006,1006,4,13,dd01903921ea24941c26a48f2cec24e0bb0e8cc7,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,499,1149,1147,3,8,784e9240155834852dff458a730cceb50229df32,2,0,0,0,0,0,1,0,0,0,0,1,0,0
2,393,1044,1042,21,13,7c9fe6831f52e30e0ede4f8c54fd9bba673e8d8b,2,0,0,0,1,0,0,0,0,0,0,0,0,1
3,215,1547,1545,20,20,54c1792c99a96a96a2881600f0cce1d81061e8b8,2,0,0,0,0,0,0,0,0,0,1,0,0,1
4,578,1015,1014,5,17,081be7c370bf9e7b4c6e696276c1b2d57623b26b,1,0,0,0,1,0,0,0,0,0,0,0,0,1


#### 現状 item_tag_hashは使わない

In [18]:
# item_tag_hash is not used as a feature for now, so remove it from both frames.
for frame in (df_train, df_test):
    del frame["item_tag_hash"]

### 統計量など分析

#### groupby

In [19]:
# Mean of every column within each category_class.
df_train.groupby(by="category_class").mean()

Unnamed: 0_level_0,item_id,sold_price,price,size,listing_at,price_diff,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,jjj,kkk,Fair,Good,Like New
category_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,499.221239,1012.814159,1011.800885,5.942478,12.19469,1.013274,0.10177,0.119469,0.168142,0.106195,0.110619,0.115044,0.084071,0.053097,0.057522,0.084071,0.5,0.336283,0.163717
1,575.845638,1280.919463,1335.0,14.355705,10.684564,-54.080537,0.053691,0.080537,0.067114,0.087248,0.073826,0.100671,0.09396,0.107383,0.154362,0.181208,0.073826,0.389262,0.536913
2,644.643312,990.929936,989.917197,5.643312,11.178344,1.012739,0.11465,0.076433,0.070064,0.146497,0.031847,0.076433,0.133758,0.095541,0.146497,0.10828,0.184713,0.433121,0.382166
3,680.372093,1305.674419,1430.523256,20.337209,10.895349,-124.848837,0.069767,0.046512,0.05814,0.05814,0.081395,0.05814,0.186047,0.197674,0.127907,0.116279,0.337209,0.476744,0.186047
4,738.292683,937.439024,936.52439,5.707317,12.060976,0.914634,0.134146,0.158537,0.146341,0.109756,0.207317,0.097561,0.097561,0.02439,0.012195,0.012195,0.439024,0.353659,0.207317


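category_classごとのitem_idの平均が単調に増加している(約499 → 576 → 645 → 680 → 738)。item_idがカテゴリの順番に沿って採番されている可能性があり，特徴量として使うとリークになりうる点に注意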

#### 相関

In [20]:
# Pairwise correlation between the columns of the training frame.
df_train.corr()

Unnamed: 0,item_id,category_class,sold_price,price,size,listing_at,price_diff,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,jjj,kkk,Fair,Good,Like New
item_id,1.0,0.288152,0.036802,0.048034,0.097377,-0.016676,-0.036249,-0.058764,-0.042112,-0.015912,-0.034367,-0.016289,-0.017521,0.011646,0.028902,0.040005,0.101003,0.175722,0.003665,-0.181468
category_class,0.288152,1.0,0.003725,0.045501,0.156726,-0.025043,-0.067387,0.025537,-0.006759,-0.063878,-0.00306,0.045642,-0.046457,0.063891,0.044756,-0.001389,-0.054378,-0.060626,0.054792,0.002984
sold_price,0.036802,0.003725,1.0,0.776618,0.440042,-0.011497,-0.156196,-0.083579,-0.031587,-0.084384,-0.09638,-0.032125,0.039481,0.025001,0.057387,0.092579,0.114904,-0.160421,0.030153,0.130036
price,0.048034,0.045501,0.776618,1.0,0.447971,-0.008191,-0.743543,-0.051342,-0.022174,-0.08899,-0.096702,0.001976,-0.002815,0.04404,0.066974,0.08029,0.071914,-0.140078,0.005702,0.135486
size,0.097377,0.156726,0.440042,0.447971,1.0,-0.018349,-0.235284,-0.021859,-0.049289,-0.044128,-0.027482,-0.010392,0.004469,-0.009173,0.059803,0.012192,0.088097,-0.085744,0.054214,0.028981
listing_at,-0.016676,-0.025043,-0.011497,-0.008191,-0.018349,1.0,0.00064,-0.03999,0.004959,-0.011882,-0.003243,0.067886,0.024971,-0.019,-0.013319,0.016328,-0.023597,0.047093,0.074964,-0.127324
price_diff,-0.036249,-0.067387,-0.156196,-0.743543,-0.235284,0.00064,1.0,-0.008217,0.001239,0.049957,0.049315,-0.037197,0.046321,-0.042513,-0.044094,-0.027618,0.009212,0.049346,0.023066,-0.0744
aaa,-0.058764,0.025537,-0.083579,-0.051342,-0.021859,-0.03999,-0.008217,1.0,-0.105834,-0.112601,-0.110932,-0.103228,-0.104101,-0.114256,-0.10058,-0.1084,-0.110932,-0.037522,0.013583,0.023469
bbb,-0.042112,-0.006759,-0.031587,-0.022174,-0.049289,0.004959,0.001239,-0.105834,1.0,-0.114475,-0.112778,-0.104946,-0.105834,-0.116158,-0.102254,-0.110205,-0.112778,-0.001845,-0.053667,0.058947
ccc,-0.015912,-0.063878,-0.084384,-0.08899,-0.044128,-0.011882,0.049957,-0.112601,-0.114475,1.0,-0.119989,-0.111657,-0.112601,-0.123585,-0.108793,-0.117251,-0.119989,0.0727,-0.023848,-0.048099


#### 統計量

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df_train.describe()

Unnamed: 0,item_id,category_class,sold_price,price,size,listing_at,price_diff,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,jjj,kkk,Fair,Good,Like New
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,598.408571,1.498571,1092.124286,1118.312857,9.407143,11.47,-26.188571,0.094286,0.097143,0.108571,0.105714,0.092857,0.094286,0.111429,0.088571,0.101429,0.105714,0.311429,0.388571,0.3
std,285.625877,1.359015,201.94903,298.3012,7.804199,6.853084,190.256356,0.292435,0.296364,0.311323,0.307691,0.29044,0.292435,0.314887,0.284327,0.302111,0.307691,0.463409,0.487774,0.458585
min,100.0,0.0,646.0,645.0,0.0,0.0,-1815.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,349.5,0.0,948.75,948.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,586.5,1.0,1059.0,1058.0,7.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,849.25,2.0,1188.25,1193.75,12.0,17.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
max,1099.0,4.0,1874.0,3181.0,59.0,23.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 学習用データセットの作成

In [22]:
# Split the training frame into the label vector and the feature matrix.
# - `.values` gives the labels as a 1-D array (Series.ravel() is deprecated in recent pandas).
# - drop() returns a new frame instead of deleting the column from df_train,
#   so this cell can be re-run without a KeyError.
# - test_x is selected with train_x's column list so both share the same feature order;
#   later steps pass raw arrays to the models, where only column position matters.
#   A feature missing from the test frame raises a KeyError here.
train_y = df_train["category_class"].values
train_x = df_train.drop("category_class", axis=1)
test_x = df_test[train_x.columns]

In [23]:
# Confirm the feature frame no longer contains the category_class label.
train_x.head()

Unnamed: 0,item_id,sold_price,price,size,listing_at,price_diff,aaa,bbb,ccc,ddd,eee,fff,ggg,hhh,jjj,kkk,Fair,Good,Like New
0,445,1164,1162,7,16,2,0,0,0,0,0,1,0,0,0,0,1,0,0
1,481,1005,1004,3,14,1,0,0,0,0,0,1,0,0,0,0,1,0,0
2,327,944,944,5,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,823,1068,1067,7,23,1,0,0,0,0,0,1,0,0,0,0,1,0,0
4,718,1407,1407,15,13,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [24]:
# First five target labels.
train_y[:5]

array([0, 0, 2, 0, 3])

### model

#### とりあえずlr

In [25]:
from sklearn.linear_model import LogisticRegression

In [77]:
# Shared settings for model evaluation and xgboost.
scoring = "f1_macro"          # scorer name passed to cross_val_score
objective = "multi:softprob"  # xgboost objective name
eval_metric = "mlogloss"      # xgboost evaluation metric used in the search space
output_dim = 5                # number of category_class labels (0-4)

In [27]:
# Normalisation: min-max scale the features. The scaler is fitted on the training
# features only and then applied unchanged to the test features.
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler().fit(train_x)
_train_x = minmax_scaler.transform(train_x)
_test_x = minmax_scaler.transform(test_x)

In [28]:
# Inspect the scaled training features (columns are shown by integer position here).
pd.DataFrame(_train_x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.345345,0.421824,0.203864,0.118644,0.695652,0.9989,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.381381,0.292345,0.141562,0.050847,0.608696,0.998351,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.227227,0.242671,0.117902,0.084746,0.043478,0.997801,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.723724,0.343648,0.166404,0.118644,1.0,0.998351,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.618619,0.619707,0.300473,0.254237,0.565217,0.997801,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [29]:
# Baseline: logistic regression on the min-max scaled features,
# evaluated with 5-fold cross-validation using the `scoring` metric.
lr = LogisticRegression()
scores = sklearn.model_selection.cross_val_score(
    lr, X=_train_x, y=train_y, cv=5, scoring=scoring)

In [30]:
# Mean score across the 5 cross-validation folds (logistic regression).
scores.mean()

0.50424585607996542

#### とりあえずdecision tree

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
# Baseline: decision tree on the unscaled features, 5-fold CV with the `scoring` metric.
# The estimator is named tree_clf, not dt: `dt` is the datetime alias imported at the top
# of the notebook, and rebinding it breaks any cell that parses timestamps when re-run.
# random_state is fixed so the cross-validation score is reproducible.
tree_clf = DecisionTreeClassifier(random_state=1)
scores = sklearn.model_selection.cross_val_score(
                tree_clf, train_x, train_y,
                scoring=scoring, cv=5)

In [33]:
# Mean score across the 5 cross-validation folds (decision tree).
scores.mean()

0.81839853222007053

決定木の macro F1 は約0.82で，ロジスティック回帰(約0.50)よりかなり高い

#### hyperopt 

In [105]:
class Objective(object):
    """Callable objective for hyperopt's ``fmin``.

    Each call evaluates one hyper-parameter sample with 3-fold stratified
    cross-validation and returns the *negated* mean score, because ``fmin``
    minimises. The best (lowest) negated score seen so far and the parameters
    that produced it are remembered on the instance.
    """

    def __init__(self, train_x, train_y, model_name, space, scoring="f1_macro"):
        """
        :param train_x: feature matrix (DataFrame or array-like)
        :param train_y: label vector (array-like)
        :param model_name: key into ``space``; "xgboost" selects the native xgboost path
        :param space: dict holding, per model name, a 'model' entry with a 'classification' class
        :param scoring: sklearn scorer name used for non-xgboost models. The xgboost path
            always scores with macro F1. The default is "f1_macro", the value that was
            previously read from a notebook global while this argument was ignored.
        """
        # np.asarray accepts DataFrames and arrays alike (DataFrame.as_matrix() was removed in pandas 1.0).
        self.train_x = np.asarray(train_x)
        self.train_y = np.asarray(train_y)
        print(self.train_x.shape)
        print(self.train_y.shape)

        # define model
        self.model_name = model_name
        self.model = space[self.model_name]['model']["classification"]
        self.scoring = scoring

        self.n_jobs = -1

        # xgboost's multi-class objectives need the number of classes; derive it from the
        # labels (assumed to be integers 0..K-1) instead of hard-coding it.
        if self.model_name == "xgboost":
            self.num_class = int(self.train_y.max()) + 1
        else:
            self.num_class = None

        # how to cross-validation
        self.cv = sklearn.model_selection.StratifiedKFold(n_splits=3, random_state=1, shuffle=True)

        # best params
        self.best_params = None
        self.best_score = None

    def __call__(self, space):
        """
        Evaluate one hyper-parameter sample.

        :param space: sampled hyper-parameters (a plain dict of values)
        :return: negated mean cross-validation score (lower is better for fmin)
        """
        if self.model_name == "xgboost":
            # cross validation
            scores = []
            for train_index, test_index in self.cv.split(self.train_x, self.train_y):
                fold_train_x, fold_valid_x = self.train_x[train_index], self.train_x[test_index]
                fold_train_y, fold_valid_y = self.train_y[train_index], self.train_y[test_index]
                dtrain = xgb.DMatrix(fold_train_x, fold_train_y)

                # Inner xgb.cv on the training fold only picks the number of boosting
                # rounds (with early stopping); the held-out fold is used solely for scoring.
                cv_output = xgb.cv(dict(space, silent=1, num_class=self.num_class), dtrain,
                                   num_boost_round=1000, early_stopping_rounds=20)
                num_boost_rounds = len(cv_output)

                model = xgb.train(dict(space, silent=1, num_class=self.num_class), dtrain,
                                  num_boost_round=num_boost_rounds)
                y_pred = model.predict(xgb.DMatrix(fold_valid_x))
                # y_pred holds one score per class per row; argmax picks the predicted label.
                scores.append(f1_score(fold_valid_y, np.argmax(y_pred, axis=1), average='macro'))

        else:
            model = self.model(**space)
            scores = sklearn.model_selection.cross_val_score(
                model, self.train_x, self.train_y,
                scoring=self.scoring, cv=self.cv, n_jobs=self.n_jobs)

        # Higher CV score is better, but fmin minimises: negate the mean.
        score = -np.mean(scores)

        if (self.best_score is None) or (score <= self.best_score):
            self.best_score = score
            self.best_params = space

        print(score)
        print(space)
        # minimize score
        return score

    def get_best_score(self):
        """
        :return: best (highest) mean CV score seen so far, or None if nothing has been evaluated yet
        """
        if self.best_score is None:
            return None
        return self.best_score * -1

    def get_best_params(self):
        """
        :return: hyper-parameters that produced the best score, or None if nothing has been evaluated yet
        """
        return self.best_params


### xgboost

In [106]:
# Search space.
# Hyper-parameters sampled by hyperopt for xgboost's native API; the learning rate is
# passed under the key 'eta' while its hyperopt label stays 'learning_rate'.
xgb_search_params = {
    'eta': hp.quniform('learning_rate', 0.01, 0.2, 0.02),
    'max_depth': hp.choice('max_depth', np.arange(2, 10, dtype=int)),
    "subsample": hp.choice('subsample', [0.7, 0.8, 0.9]),
    "min_child_weight": hp.choice('min_child_weight', np.arange(1, 10, dtype=int)),
    "colsample_bytree": hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    # fixed (not searched) settings
    'objective': 'multi:softprob',
    'eval_metric': eval_metric,
}

# Estimator classes available per task type.
xgb_estimators = {
    'classification': xgb.XGBClassifier,
    'regression': xgb.XGBRegressor,
}

space = {
    'xgboost': {
        'params': xgb_search_params,
        'model': xgb_estimators,
    }
}

In [107]:
# Named xgb_objective so it does not rebind `objective`, the xgboost objective string
# defined with the other settings.
xgb_objective = Objective(train_x, train_y, "xgboost", space)
trials = Trials()

# fmin's return value is deliberately not kept: for hp.choice parameters it reports the
# index of the chosen option, not the value. The Objective instance records the actual
# parameter values of the best trial instead.
fmin(xgb_objective, space["xgboost"]["params"], algo=tpe.suggest,
     max_evals=50, trials=trials)

best_params = xgb_objective.get_best_params()
best_score = xgb_objective.get_best_score()

(700, 19)
(700,)
-0.933206795732
{'colsample_bytree': 1.0, 'min_child_weight': 7, 'subsample': 0.9, 'max_depth': 6, 'eta': 0.08, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
-0.937166513095
{'colsample_bytree': 0.8500000000000001, 'min_child_weight': 2, 'subsample': 0.7, 'max_depth': 5, 'eta': 0.08, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
-0.929201284365
{'colsample_bytree': 0.9, 'min_child_weight': 7, 'subsample': 0.7, 'max_depth': 4, 'eta': 0.1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
-0.933708553309
{'colsample_bytree': 0.75, 'min_child_weight': 6, 'subsample': 0.9, 'max_depth': 2, 'eta': 0.04, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
-0.937114590989
{'colsample_bytree': 0.9, 'min_child_weight': 4, 'subsample': 0.8, 'max_depth': 5, 'eta': 0.06, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
-0.907862471097
{'colsample_bytree': 0.7000000000000001, 'min_child_weight': 9, 'subsample': 0.7, 'max_depth': 4, 'eta

-0.937895776375
{'colsample_bytree': 0.9, 'min_child_weight': 4, 'subsample': 0.7, 'max_depth': 2, 'eta': 0.1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
-0.931546201472
{'colsample_bytree': 0.9500000000000001, 'min_child_weight': 6, 'subsample': 0.8, 'max_depth': 9, 'eta': 0.04, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}
-0.938360021369
{'colsample_bytree': 0.8, 'min_child_weight': 5, 'subsample': 0.9, 'max_depth': 4, 'eta': 0.16, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob'}


In [108]:
# Hyper-parameters of the best trial, as recorded by the Objective instance.
best_params

{'colsample_bytree': 0.8,
 'eta': 0.04,
 'eval_metric': 'mlogloss',
 'max_depth': 2,
 'min_child_weight': 3,
 'objective': 'multi:softprob',
 'subsample': 0.8}

In [109]:
# Best mean cross-validation score found during the search.
best_score

0.94648472509638337

#### train

In [114]:
# Final fit on the full training set with the tuned hyper-parameters.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# num_class comes from output_dim (the number of category_class labels) instead of a repeated literal.
final_params = dict(best_params, num_class=output_dim)
dtrain = xgb.DMatrix(train_x.values, train_y)
# xgb.cv with early stopping is used only to choose the number of boosting rounds.
cv_output = xgb.cv(final_params, dtrain, num_boost_round=1000, early_stopping_rounds=20)
num_boost_rounds = len(cv_output)
model = xgb.train(dict(final_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [117]:
# Score the test set with the final model.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
dtest = xgb.DMatrix(test_x.values)
predict = model.predict(dtest)

In [119]:
# Predicted label per test row: the index of the largest value in each row of `predict`.
np.argmax(predict, axis=1)

array([0, 0, 3, 1, 1, 0, 4, 0, 4, 3, 4, 1, 0, 3, 0, 2, 3, 3, 0, 4, 2, 0, 0,
       3, 0, 2, 2, 4, 0, 4, 4, 1, 0, 0, 1, 4, 2, 1, 0, 2, 2, 2, 3, 2, 1, 1,
       3, 0, 1, 0, 2, 4, 1, 3, 0, 0, 1, 1, 0, 0, 2, 4, 0, 1, 3, 0, 0, 0, 4,
       1, 4, 0, 1, 3, 0, 4, 3, 0, 0, 0, 0, 2, 1, 0, 0, 2, 0, 1, 1, 0, 3, 2,
       1, 4, 4, 2, 0, 0, 2, 0, 1, 2, 1, 1, 4, 2, 2, 1, 3, 1, 2, 0, 1, 2, 2,
       0, 0, 0, 2, 4, 2, 1, 0, 0, 0, 4, 0, 0, 2, 0, 0, 1, 0, 3, 1, 2, 2, 1,
       1, 2, 4, 4, 3, 2, 2, 4, 3, 1, 0, 0, 0, 2, 2, 3, 0, 3, 1, 1, 0, 4, 0,
       3, 1, 4, 0, 2, 4, 1, 2, 4, 0, 0, 2, 0, 3, 1, 0, 1, 3, 1, 4, 2, 3, 0,
       3, 4, 1, 2, 0, 0, 3, 1, 1, 4, 1, 0, 1, 2, 4, 2, 1, 0, 4, 0, 0, 0, 0,
       3, 1, 4, 2, 0, 1, 1, 2, 2, 2, 2, 1, 0, 2, 3, 1, 2, 4, 2, 3, 0, 0, 1,
       0, 1, 1, 2, 1, 4, 1, 1, 2, 0, 4, 2, 0, 2, 1, 0, 3, 0, 0, 1, 3, 2, 0,
       1, 1, 0, 2, 4, 3, 2, 3, 1, 0, 0, 4, 4, 2, 0, 3, 1, 0, 4, 0, 4, 4, 0,
       1, 0, 2, 1, 2, 3, 1, 0, 2, 2, 0, 2, 0, 1, 0, 1, 3, 0, 1, 0, 3, 1, 4,
       3])

In [124]:
# Submission table: one predicted category_class per test item, indexed by item_id.
submission = pd.DataFrame(
    {"category_class": np.argmax(predict, axis=1)},
    index=pd.Index(test_x["item_id"]),
)

In [126]:
# Preview the first 10 rows of the submission.
submission.head(10)

Unnamed: 0_level_0,category_class
item_id,Unnamed: 1_level_1
101,0
499,0
393,3
215,1
578,1
503,0
755,4
455,0
1067,4
356,3


In [127]:
# Write the submission to CSV in the working directory; the item_id index is written as the first column.
submission.to_csv("mercari_submission.csv")