In [1]:
import pandas as pd
import numpy as np

In [2]:
# Explicit column dtypes passed to pd.read_csv for the train and test files.
# The test file has a test_id column instead of train_id and no price column.
types_dict_train = dict(
    train_id='int64',
    item_condition_id='int8',
    price='float64',
    shipping='int8',
)
types_dict_test = dict(
    test_id='int64',
    item_condition_id='int8',
    shipping='int8',
)
 
# Read the tab-separated train and test files into pandas DataFrames,
# using the explicit column dtypes from types_dict_train / types_dict_test.
train_df = pd.read_csv('train.tsv', sep='\t', dtype=types_dict_train, low_memory=True)
test_df = pd.read_csv('test_stg2.tsv', sep='\t', dtype=types_dict_test, low_memory=True)


In [3]:
# Frequency of each item_condition_id value in the training set.
train_df["item_condition_id"].value_counts()

1    640549
3    432161
2    375479
4     31962
5      2384
Name: item_condition_id, dtype: int64

In [4]:
# Frequency of each brand_name in the training set (value_counts omits missing values by default).
train_df["brand_name"].value_counts()

PINK                 54088
Nike                 54043
Victoria's Secret    48036
LuLaRoe              31024
Apple                17322
                     ...  
Howard Miller            1
Christopher Blue         1
Olian                    1
JNCO                     1
Tickled Pink Tees        1
Name: brand_name, Length: 4809, dtype: int64

In [5]:
# Frequency of each full category path in the training set.
train_df["category_name"].value_counts()

Women/Athletic Apparel/Pants, Tights, Leggings    60177
Women/Tops & Blouses/T-Shirts                     46380
Beauty/Makeup/Face                                34335
Beauty/Makeup/Lips                                29910
Electronics/Video Games & Consoles/Games          26557
                                                  ...  
Handmade/Knitting/Cozy                                1
Handmade/Patterns/Baby                                1
Vintage & Collectibles/Furniture/Other                1
Handmade/Knitting/Scarf                               1
Handmade/Books and Zines/Comic                        1
Name: category_name, Length: 1287, dtype: int64

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Apply matplotlib's "ggplot" style sheet to every figure drawn after this cell.
plt.style.use("ggplot")

In [8]:
# Stack train and test rows into one frame with a fresh 0..n-1 index so that
# feature engineering and label encoding are applied to both consistently.
# Columns present in only one input (train_id, price, test_id) become NaN in the other.
all_df = pd.concat([train_df, test_df], sort=False, ignore_index=True)

In [9]:
# Display the combined train+test frame (the notebook renders a truncated preview).
all_df

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,test_id
0,0.0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,
1,1.0,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,
2,2.0,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,
3,3.0,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,
4,4.0,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,
...,...,...,...,...,...,...,...,...,...
4943255,,DARK SAMPLE BYE FOUNDATION MOISTURIZER,1,Beauty/Fragrance/Women,,,1,It cosmetics Bye Bye Foundation Full Coverage ...,3460720.0
4943256,,bundle for @brandystash,2,Women/Pants/Other,,,1,♡2 pairs of omighty trackiez. one m and one s....,3460721.0
4943257,,Toddler High-top Converse,3,Kids/Girls 0-24 Mos/Shoes,Nike,,0,Size 5 (toddler). Pink high top converse shoes...,3460722.0
4943258,,Yoga Sling Sandals,1,Women/Shoes/Sandals,Sanuk,,1,super super comfy. i have a pair but ordered t...,3460723.0


In [10]:
# Names of all object-dtype columns in all_df; these are the columns the
# label-encoding loop iterates over later in the notebook.
categories = all_df.columns[all_df.dtypes == "object"]
print(categories)

Index(['name', 'category_name', 'brand_name', 'item_description'], dtype='object')


In [11]:
# Missing-value count per column, largest first.
all_df.isnull().sum().sort_values(ascending=False)

train_id             3460725
price                3460725
brand_name           2109172
test_id              1482535
category_name          21160
item_description          10
name                       0
item_condition_id          0
shipping                   0
dtype: int64

In [12]:
# Frequency of each full category path across train and test combined.
all_df.category_name.value_counts()

Women/Athletic Apparel/Pants, Tights, Leggings    199589
Women/Tops & Blouses/T-Shirts                     154714
Beauty/Makeup/Face                                114389
Beauty/Makeup/Lips                                 99565
Electronics/Video Games & Consoles/Games           89134
                                                   ...  
Vintage & Collectibles/Furniture/Desk                  1
Handmade/Furniture/Fixture                             1
Handmade/Holidays/Hanukkah                             1
Vintage & Collectibles/Furniture/Bench                 1
Handmade/Ceramics and Pottery/Teapots                  1
Name: category_name, Length: 1353, dtype: int64

In [13]:
# Inspect the raw category_name strings.
all_df["category_name"]

0                                          Men/Tops/T-shirts
1          Electronics/Computers & Tablets/Components & P...
2                                Women/Tops & Blouses/Blouse
3                         Home/Home Décor/Home Décor Accents
4                                    Women/Jewelry/Necklaces
                                 ...                        
4943255                               Beauty/Fragrance/Women
4943256                                    Women/Pants/Other
4943257                            Kids/Girls 0-24 Mos/Shoes
4943258                                  Women/Shoes/Sandals
4943259                          Women/Coats & Jackets/Other
Name: category_name, Length: 4943260, dtype: object

In [14]:
# Boolean row masks over all_df, one per keyword. na=False makes rows whose
# category_name / brand_name is missing evaluate to False instead of NaN, so
# each mask can be used directly for boolean indexing.

# Category masks: the keyword may appear anywhere in the category path.
women_list = all_df['category_name'].str.contains('Women',na=False)
men_list = all_df['category_name'].str.contains('Men',na=False)
kids_list = all_df['category_name'].str.contains('Kids',na=False)
# The '|' separates alternatives: a row matches if any one of the terms occurs.
elec_list = all_df['category_name'].str.contains('Electronics|Computers|Tablets|Cell Phones',na=False)
beauty_list = all_df['category_name'].str.contains('Beauty',na=False)
shoes_list = all_df['category_name'].str.contains('Shoes',na=False)
# Brand masks: the keyword may appear anywhere in brand_name.
chanel_list = all_df['brand_name'].str.contains('Chanel',na=False)
david_list = all_df['brand_name'].str.contains('David Yurman',na=False)
mary_list = all_df['brand_name'].str.contains('Mary Kay',na=False)
apple_list = all_df['brand_name'].str.contains('Apple',na=False)
louis_list = all_df['brand_name'].str.contains('Louis Vuitton',na=False)
celine_list = all_df['brand_name'].str.contains('Celine',na=False)

In [15]:
# Recomputes the same women_list mask that the previous cell already defined,
# then displays it for inspection.
women_list = all_df['category_name'].str.contains('Women',na=False)
women_list

0          False
1          False
2           True
3          False
4           True
           ...  
4943255     True
4943256     True
4943257    False
4943258     True
4943259     True
Name: category_name, Length: 4943260, dtype: bool

In [16]:
# Create the "sex" feature column with 0 as the default for every row.
all_df["sex"] = 0

In [17]:
# Encode the category gender into "sex": 1 where women_list is True, 2 where
# men_list is True (men is written second, so it wins if both masks match).
# .loc[mask, column] writes into all_df itself. The chained form
# all_df["sex"][mask] = v assigns through an intermediate Series, which raises
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
all_df.loc[women_list, "sex"] = 1
all_df.loc[men_list, "sex"] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
# Create the "elec" indicator column with 0 as the default for every row.
all_df["elec"] = 0

In [19]:
# Set "elec" to 1 on rows selected by elec_list. .loc[mask, column] writes
# into all_df itself; the chained form all_df["elec"][mask] = 1 assigns through
# an intermediate Series, which raises SettingWithCopyWarning and has no effect
# under pandas copy-on-write.
all_df.loc[elec_list, "elec"] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
# Create the "apple" indicator column with 0 as the default for every row.
all_df["apple"] = 0

In [23]:
# Set "apple" to 1 on rows selected by apple_list. .loc[mask, column] writes
# into all_df itself; the chained form all_df["apple"][mask] = 1 assigns through
# an intermediate Series, which raises SettingWithCopyWarning and has no effect
# under pandas copy-on-write.
all_df.loc[apple_list, "apple"] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
# Category frequencies across train and test combined (same expression as an earlier cell).
all_df.category_name.value_counts()

Women/Athletic Apparel/Pants, Tights, Leggings    199589
Women/Tops & Blouses/T-Shirts                     154714
Beauty/Makeup/Face                                114389
Beauty/Makeup/Lips                                 99565
Electronics/Video Games & Consoles/Games           89134
                                                   ...  
Vintage & Collectibles/Furniture/Desk                  1
Handmade/Furniture/Fixture                             1
Handmade/Holidays/Hanukkah                             1
Vintage & Collectibles/Furniture/Bench                 1
Handmade/Ceramics and Pottery/Teapots                  1
Name: category_name, Length: 1353, dtype: int64

In [25]:
from sklearn.preprocessing import LabelEncoder

In [36]:
from optuna.integration import lightgbm as lgb

In [26]:
# Label-encode every column listed in `categories`, in place on all_df.
# The encoder is fit on train and test rows together (all_df holds both), so
# each distinct string gets a single integer code shared by both subsets.
for cat in categories:
    le = LabelEncoder()
    print(cat)
    # Assign the filled column back explicitly: calling fillna(inplace=True) on
    # all_df[cat] mutates an intermediate Series and is not guaranteed to reach
    # all_df (it is a no-op under pandas copy-on-write).
    all_df[cat] = all_df[cat].fillna("missing")
    all_df[cat] = le.fit_transform(all_df[cat])
    # Store the integer codes as a pandas categorical column.
    all_df[cat] = all_df[cat].astype("category")

name
category_name
brand_name
item_description


In [27]:
# Display the combined frame again, now with the encoded columns and added features.
all_df

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,test_id,sex,elec,celine,apple
0,0.0,1947660,3,864,6286,10.0,1,2597284,,2,0,0,0
1,1.0,2748291,3,87,4660,52.0,0,3515053,,0,1,0,0
2,2.0,280690,1,1320,5489,10.0,1,368759,,1,0,0,0
3,3.0,1707130,1,536,6286,35.0,1,2518182,,0,0,0,0
4,4.0,139775,1,1247,6286,44.0,0,1174303,,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4943255,,944100,1,11,6286,,1,1834853,3460720.0,1,0,0,0
4943256,,3604650,2,1268,6286,,1,4105029,3460721.0,1,0,0,0
4943257,,3143603,3,679,4020,,0,3080966,3460722.0,0,0,0,0
4943258,,3566603,1,1279,4945,,1,4060816,3460723.0,1,0,0,0


In [28]:
# Split the combined frame back apart using price as the marker:
# rows with a price go to the training subset, rows without one to the test subset.
train_df_le = all_df[all_df["price"].notna()]
test_df_le = all_df[all_df["price"].isna()]

In [29]:
import lightgbm as lgb

In [30]:
from sklearn.model_selection import KFold
# Cross-validation splitter with `folds` splits. No shuffle or random_state
# is passed, so KFold's default behaviour applies.
folds = 3
kf = KFold(folds)

In [31]:
# LightGBM training parameters: regression objective with a fixed seed.
lgbm_params = dict(objective="regression", random_seed=1234)

In [32]:
# Feature matrix: every column except the target (price) and the train row id.
# NOTE(review): test_id is not dropped here, so it stays in train_X; confirm
# whether that is intended before changing it, since later cells may rely on
# the current column layout.
train_X = train_df_le.drop(columns=["price", "train_id"])
# Target vector: the listing price.
train_Y = train_df_le.loc[:, "price"]

In [33]:
def rmsle(y, y0):
    """Root mean squared logarithmic error between two equal-length sequences.

    Computes sqrt(mean((log1p(y) - log1p(y0)) ** 2)). The metric is symmetric,
    so it does not matter which argument holds predictions and which holds
    targets.

    Args:
        y: array-like of numbers (list, ndarray or pandas Series).
        y0: array-like of numbers with the same length as ``y``.

    Returns:
        The RMSLE as a NumPy float.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(y) == len(y0)
    # Work on plain float arrays so the comparison is strictly positional:
    # pandas Series would be aligned on their index, and Series.mean silently
    # skips NaN, which hides invalid entries instead of reporting them.
    # Negative values are clipped to 0 because log1p is undefined below -1 and
    # the metric is only meaningful for non-negative quantities.
    left = np.clip(np.asarray(y, dtype=float), 0, None)
    right = np.clip(np.asarray(y0, dtype=float), 0, None)
    return np.sqrt(np.mean(np.square(np.log1p(left) - np.log1p(right))))

In [34]:
# K-fold cross-validation: train one LightGBM model per fold, score it on the
# held-out part with rmsle, and collect the out-of-fold predictions.
models, rmsles = [], []
oof = np.zeros(len(train_X))  # out-of-fold predictions, filled fold by fold

for train_index, val_index in kf.split(train_X):
    # Positional row selection for this fold's training and validation parts.
    X_train, y_train = train_X.iloc[train_index], train_Y.iloc[train_index]
    X_valid, y_valid = train_X.iloc[val_index], train_Y.iloc[val_index]

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_eval = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

    # Up to 1000 boosting rounds, with early stopping set to 20 rounds and
    # evaluation logging every 100 rounds.
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=1000,
        early_stopping_rounds=20,
        verbose_eval=100,
    )

    # Predict the held-out rows using the best iteration found by early stopping.
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    print(f'y_pred:{y_pred}')

    tmp_rmsle = rmsle(y_pred, y_valid)
    print(f'tmp_rmsle:{tmp_rmsle}')

    oof[val_index] = y_pred
    models.append(model_lgb)
    rmsles.append(tmp_rmsle)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5368
[LightGBM] [Info] Number of data points in the train set: 988356, number of used features: 10




[LightGBM] [Info] Start training from score 26.749067
Training until validation scores don't improve for 20 rounds
[100]	valid_0's l2: 954.732
[200]	valid_0's l2: 949.411
[300]	valid_0's l2: 947.915
Early stopping, best iteration is:
[306]	valid_0's l2: 947.812
y_pred:[11.64994994 57.22877361 10.34934064 ... 13.28803572 21.48476641
 13.38455961]
tmp_rmsle:0.5914071096760947


  This is separate from the ipykernel package so we can avoid doing imports until


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5372
[LightGBM] [Info] Number of data points in the train set: 988357, number of used features: 10
[LightGBM] [Info] Start training from score 26.716268
Training until validation scores don't improve for 20 rounds
[100]	valid_0's l2: 946.45
[200]	valid_0's l2: 942.189
[300]	valid_0's l2: 941.019
Early stopping, best iteration is:
[304]	valid_0's l2: 940.94
y_pred:[13.32664712 17.81217837 24.47762138 ... 28.59880812 16.6029249
 10.22674604]
tmp_rmsle:0.5919853281031677
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5400
[LightGBM] [Info] Number of data points in the train set: 988357, number of used features: 10
[LightGBM] [Info] Start training from score 26.747213
Training until validation scores don't improve for 20 rounds
[100]	valid

In [35]:
# Average of the per-fold RMSLE scores collected during cross-validation.
sum(rmsles)/len(rmsles)

0.591346617962767