In [50]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.float_format', lambda x:'%.5f' % x)
import numpy as np

In [51]:

# Explicit dtypes for the numeric columns of each file.
types_dict_train = {
    'train_id': 'int64',
    'item_condition_id': 'int8',
    'price': 'float64',
    'shipping': 'int8',
}
types_dict_test = {
    'test_id': 'int64',
    'item_condition_id': 'int8',
    'shipping': 'int8',
}

# Load the tab-separated train/test files into pandas DataFrames.
train = pd.read_csv('train.tsv', sep='\t', dtype=types_dict_train, low_memory=True)
test = pd.read_csv('test.tsv', sep='\t', dtype=types_dict_test, low_memory=True)

## train = pd.read_table('train.tsv', low_memory=True, dtype=types_dict_train)
## test = pd.read_table('test.tsv', low_memory=True, dtype=types_dict_test)
上の `pd.read_table` でも同じように読み込める。`low_memory=True`（デフォルト）ではファイルをチャンクに分けて処理するためメモリ使用量は抑えられるが、型推論がチャンクごとに行われるので、dtypeを指定しない列では型が混在することがある。何百万行ものデータでdtypeを推論させるのは重いので、上のように始めにdtypeを指定してあげると、型が統一され、メモリーも少なくて済む。

In [52]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [53]:
# Preview the first rows of the raw test data (same columns as train, minus price).
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [54]:
# Check the (rows, columns) shape of train and test.
train.shape, test.shape
((1482535, 9), (693359, 8))


((1482535, 9), (693359, 8))

In [55]:
def display_all(df):
    """Render ``df`` with pandas' row and column display limits raised to 1000.

    The limits are raised only while ``df`` is being displayed; the previous
    option values are restored when the context manager exits.
    """
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000):
        display(df)

# Show summary statistics for every train column, transposed so that each
# column of train becomes one row of the table.
display_all(train.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
train_id,1482535.0,,,,741267.0,427971.135,0.0,370633.5,741267.0,1111900.5,1482534.0
name,1482535.0,1225273.0,Bundle,2232.0,,,,,,,
item_condition_id,1482535.0,,,,1.90738,0.90316,1.0,1.0,2.0,3.0,5.0
category_name,1476208.0,1287.0,"Women/Athletic Apparel/Pants, Tights, Leggings",60177.0,,,,,,,
brand_name,849853.0,4809.0,PINK,54088.0,,,,,,,
price,1482535.0,,,,26.73752,38.58607,0.0,10.0,17.0,29.0,2009.0
shipping,1482535.0,,,,0.44727,0.49721,0.0,0.0,0.0,1.0,1.0
item_description,1482531.0,1281426.0,No description yet,82489.0,,,,,,,


In [56]:
# Inspect the dtype of each train column as loaded.
train.dtypes

train_id               int64
name                  object
item_condition_id       int8
category_name         object
brand_name            object
price                float64
shipping                int8
item_description      object
dtype: object

In [57]:
# Convert the string columns (category name, item description, listing title,
# brand name) of both train and test to the pandas 'category' dtype.
# A single astype mapping per frame replaces eight attribute-style column
# assignments (e.g. `train.name = ...`), which pandas discourages because
# attribute access can collide with DataFrame attributes and methods.
category_columns = ['category_name', 'item_description', 'name', 'brand_name']
train = train.astype({column: 'category' for column in category_columns})
test = test.astype({column: 'category' for column in category_columns})

# Confirm the resulting dtypes of train.
train.dtypes

train_id                int64
name                 category
item_condition_id        int8
category_name        category
brand_name           category
price                 float64
shipping                 int8
item_description     category
dtype: object

In [58]:
# Confirm the dtypes of test after the category conversion.
test.dtypes

test_id                 int64
name                 category
item_condition_id        int8
category_name        category
brand_name           category
shipping                 int8
item_description     category
dtype: object

In [59]:
def unique(x):
    """Print the result of ``x.nunique()``; nothing is returned."""
    distinct_counts = x.nunique()
    print(distinct_counts)

In [60]:
# Print the number of distinct values in each column of train.
unique(train)

train_id             1482535
name                 1225273
item_condition_id          5
category_name           1287
brand_name              4809
price                    828
shipping                   2
item_description     1281426
dtype: int64


In [61]:
# Number of distinct values in each column of train.
train.nunique()

train_id             1482535
name                 1225273
item_condition_id          5
category_name           1287
brand_name              4809
price                    828
shipping                   2
item_description     1281426
dtype: int64

In [62]:
# Count the missing values in each column of train
# (the matching fraction is computed in the next cell).
train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [63]:
# Fraction of missing values in each column of train.
train.isnull().mean()

train_id            0.00000
name                0.00000
item_condition_id   0.00000
category_name       0.00427
brand_name          0.42676
price               0.00000
shipping            0.00000
item_description    0.00000
dtype: float64

In [64]:
# Fraction of missing values (NaN) in each column of test.
test.isnull().mean()

test_id             0.00000
name                0.00000
item_condition_id   0.00000
category_name       0.00441
brand_name          0.42622
shipping            0.00000
item_description    0.00000
dtype: float64

In [65]:
# List the column labels of train.
train.columns

Index(['train_id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description'],
      dtype='object')

In [67]:
# Preview train again before the id columns are renamed.
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [68]:
# Give both frames the same identifier column name ('id') and tag every row
# with its origin: is_train = 1 for train rows, is_train = 0 for test rows.
train = train.rename(columns={'train_id': 'id'}).assign(is_train=1)
test = test.rename(columns={'test_id': 'id'}).assign(is_train=0)

In [78]:
# Stack train (without its price target) on top of test.
# No ignore_index is passed, so each row keeps the index label it had in its
# source frame.
train_test_combine = pd.concat([train.drop(columns=['price']), test])
# Preview the combined frame as a sanity check.
train_test_combine.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet,1
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,1
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,1
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...,1
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity,1


In [85]:
# Inspect the dtypes of the combined frame before re-encoding the string columns.
train_test_combine.dtypes

id                    int64
name                 object
item_condition_id      int8
category_name        object
brand_name           object
shipping               int8
item_description     object
is_train              int64
dtype: object

In [86]:
# Convert the string columns of the combined frame to the 'category' dtype so
# that train and test rows share one set of categories per column.
# One astype mapping replaces four attribute-style column assignments
# (e.g. `train_test_combine.name = ...`), which pandas discourages because
# attribute access can collide with DataFrame attributes and methods.
category_columns = ['category_name', 'item_description', 'name', 'brand_name']
train_test_combine = train_test_combine.astype(
    {column: 'category' for column in category_columns}
)

In [87]:
# Confirm the string columns of the combined frame are now 'category'.
train_test_combine.dtypes

id                      int64
name                 category
item_condition_id        int8
category_name        category
brand_name           category
shipping                 int8
item_description     category
is_train                int64
dtype: object

In [88]:
# Replace each category column of the combined frame with its integer
# category code (`.cat.codes`); pandas encodes missing values as -1.
# Bracket assignment is used instead of attribute assignment so the column is
# always the thing being written.
for column in ['name', 'category_name', 'brand_name', 'item_description']:
    train_test_combine[column] = train_test_combine[column].cat.codes

# Confirm the resulting dtypes. (A bare `head()` call used to sit above this
# line; only the last expression of a cell is displayed, so it had no effect.)
train_test_combine.dtypes

id                   int64
name                 int32
item_condition_id     int8
category_name        int16
brand_name           int16
shipping              int8
item_description     int32
is_train             int64
dtype: object

In [89]:
# Preview the combined frame after the category columns were replaced by integer codes.
train_test_combine.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train
0,0,916335,3,829,-1,1,1172053,1
1,1,1292428,3,86,3889,0,1585539,1
2,2,131013,1,1277,4588,1,167133,1
3,3,802671,1,503,-1,1,1136643,1
4,4,65051,1,1204,-1,0,531909,1


In [105]:
# Split the combined frame back into test and train rows using the is_train
# flag, dropping the flag column from each result.
df_test = train_test_combine[train_test_combine['is_train'] == 0].drop(columns=['is_train'])
df_train = train_test_combine[train_test_combine['is_train'] == 1].drop(columns=['is_train'])
# Check the resulting shapes.
df_test.shape, df_train.shape
((693359, 7), (1482535, 7))


((693359, 7), (1482535, 7))

In [107]:
# Re-attach the price target to df_train (the assignment aligns on row index).
df_train['price'] = train['price']
# Log-transform the strictly positive prices; any other value (e.g. a price
# of 0) is left unchanged. The masked, vectorised np.log replaces a per-row
# Python lambda and never evaluates log on non-positive values.
positive_price = df_train['price'] > 0
df_train.loc[positive_price, 'price'] = np.log(df_train.loc[positive_price, 'price'])
# Preview df_train to check the transformed target.
df_train.head()


Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,price
0,0,916335,3,829,-1,1,1172053,2.30259
1,1,1292428,3,86,3889,0,1585539,3.95124
2,2,131013,1,1277,4588,1,167133,2.30259
3,3,802671,1,503,-1,1,1136643,3.55535
4,4,65051,1,1204,-1,0,531909,3.78419


In [108]:
# Confirm the dtypes of the final training frame (features plus log-price target).
df_train.dtypes

id                     int64
name                   int32
item_condition_id       int8
category_name          int16
brand_name             int16
shipping                int8
item_description       int32
price                float64
dtype: object

In [3]:
import time

In [118]:
# Split into features (every column except price) and the target (price).
# NOTE(review): x_train still contains the `id` column, so the row identifier
# is used as a feature — confirm this is intended.
x_train, y_train = df_train.drop(['price'], axis=1), df_train.price
# Build and fit the model, timing the fit.
# random_state is pinned so that re-running the notebook grows the same forest;
# perf_counter is a monotonic clock, unlike the wall-clock time.time().
start_time = time.perf_counter()
m = RandomForestRegressor(n_jobs=-1, min_samples_leaf=5, n_estimators=200, random_state=42)
m.fit(x_train, y_train)

show_time = time.perf_counter() - start_time
print('time is:{0:.2f}'.format(show_time))

time is:{0:.2f} 1224.3866579532623


In [1]:
# Score the fitted model on the data it was trained on (for a scikit-learn
# regressor, `score` is R^2). This measures fit to the training set only, not
# performance on unseen data.
m.score(x_train, y_train)

NameError: name 'm' is not defined

In [None]:
# Predict log-prices for the test rows with the fitted random forest `m`.
log_preds = m.predict(df_test)
# Undo the log transform with np.exp. The Series is given df_test's index so
# that ids and prices pair up row-for-row whatever labels df_test carries,
# rather than relying on a default 0..n-1 index happening to match.
preds = pd.Series(np.exp(log_preds), index=df_test.index)
# Combine the test ids and predicted prices under the column names required
# for the Mercari submission.
submit = pd.DataFrame({'test_id': df_test['id'], 'price': preds})
# Write the submission file as CSV.
submit.to_csv('submit_rf_base.csv', index=False)
