In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline



In [2]:
# Explicit dtypes for the id / numeric columns; every other column is left
# for pandas to infer.
types_dict_train = {
    'train_id': 'int64',
    'item_condition_id': 'int8',
    'price': 'float64',
    'shipping': 'int8',
}
types_dict_test = {
    'test_id': 'int64',
    'item_condition_id': 'int8',
    'shipping': 'int8',
}

# Both input files are tab-separated.
train = pd.read_csv(
    'train.tsv',
    delimiter='\t',
    low_memory=True,
    dtype=types_dict_train,
)
test = pd.read_csv(
    'test.tsv',
    delimiter='\t',
    low_memory=True,
    dtype=types_dict_test,
)

In [3]:
# Column the model is trained to predict.
target_col = 'price'

# Columns that are never used as model inputs: the target itself, the row
# identifiers of either frame, and the listing title.
exclude_cols = ['price', 'train_id', 'test_id', 'name']

# Everything else in `train`, in the frame's own column order.
train_feature_cols = [
    column
    for column in train.columns
    if column not in exclude_cols
]

In [4]:
# Convert the category name, item description, listing title and brand name
# columns of both `train` and `test` to the `category` dtype.
#
# The categories of each column are built from train and test together, so a
# given string maps to the same integer code in both frames when `.cat.codes`
# is taken afterwards. Converting the two frames independently numbers their
# categories separately, which makes the codes of `test` incomparable with
# the codes the model was trained on.
text_cols = ['category_name', 'item_description', 'name', 'brand_name']
for text_col in text_cols:
    shared_categories = (
        pd.concat([train[text_col], test[text_col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    train[text_col] = pd.Categorical(train[text_col], categories=shared_categories)
    test[text_col] = pd.Categorical(test[text_col], categories=shared_categories)

# Display the resulting dtypes of both frames.
train.dtypes, test.dtypes

(train_id                int64
 name                 category
 item_condition_id        int8
 category_name        category
 brand_name           category
 price                 float64
 shipping                 int8
 item_description     category
 dtype: object, test_id                 int64
 name                 category
 item_condition_id        int8
 category_name        category
 brand_name           category
 shipping                 int8
 item_description     category
 dtype: object)

In [5]:
# Replace each categorical column of `train` with its integer category codes.
for coded_col in ('name', 'category_name', 'brand_name', 'item_description'):
    train[coded_col] = train[coded_col].cat.codes

In [6]:
# Replace each categorical column of `test` with its integer category codes.
for coded_col in ('name', 'category_name', 'brand_name', 'item_description'):
    test[coded_col] = test[coded_col].cat.codes

In [7]:
def _log_if_positive(value):
    """Return the natural log of `value` when it is positive, else `value` itself."""
    if value > 0:
        return np.log(value)
    return value


# Log-transform the target column; non-positive prices are left unchanged.
train['price'] = train['price'].apply(_log_if_positive)


In [8]:
# Feature matrix and target vector as NumPy arrays.
X = np.array(train.loc[:, train_feature_cols])
y = np.array(train.loc[:, target_col])

In [9]:
# Hold out 30% of the rows. The split is seeded so that re-running the
# notebook yields the same partition (and therefore comparable scores).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234)

In [10]:
# Split the training portion once more (70% / 30%). Seeded so that the
# partition is the same on every run.
X_train1, X_train2, y_train1, y_train2 = train_test_split(
    X_train, y_train, test_size=0.3, random_state=1234)

In [11]:
def get_gfs_feature_indices(X, y, features, clf, test_size=0.3, random_state=1234):
    """Greedy forward selection of feature columns.

    Starting from an empty set, each round fits `clf` once per not-yet-chosen
    feature (together with the features chosen so far) and keeps the feature
    whose addition gives the lowest mean squared error on an internal
    hold-out split. Selection stops when no remaining feature lowers the
    error, or when every feature has been chosen.

    Parameters
    ----------
    X : 2-D array indexable as ``X[:, [i, j, ...]]``
        Feature matrix; column ``i`` corresponds to ``features[i]``.
    y : array-like
        Target values, one per row of `X`.
    features : sequence of hashable
        Feature names, positionally aligned with the columns of `X`.
    clf : estimator with ``fit(X, y)`` and ``predict(X)``
        Model used to score each candidate set. It is re-fitted many times,
        so it is left fitted on the last candidate set that was tried.
    test_size : float, default 0.3
        Passed to ``train_test_split`` for the internal hold-out split.
    random_state : int, default 1234
        Passed to ``train_test_split`` for the internal hold-out split.

    Returns
    -------
    list of int
        Column indices of the chosen features in ascending order, so the
        result is deterministic and follows the column order of `X`.
    """
    X_train_, X_test_, y_train_, y_test_ = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    feature_indices = {feature: idx for idx, feature in enumerate(features)}
    remaining = set(features)
    chosen_features = set()
    last_mse = np.inf
    while remaining:
        mse_features = []
        for feature in remaining:
            # Sorted so the column order handed to the model does not depend
            # on set iteration order. The generator uses its own variable
            # name so the loop variable `feature` is never shadowed.
            indices = sorted(
                feature_indices[name] for name in chosen_features | {feature})
            clf.fit(X_train_[:, indices], y_train_)
            y_pred = clf.predict(X_test_[:, indices])
            mse_features.append((mean_squared_error(y_test_, y_pred), feature))
        mse, feature = min(mse_features)
        # Stop as soon as the best candidate no longer improves the error.
        if mse >= last_mse:
            break
        last_mse = mse
        print('Newly Added Feature: {},\t MSE Score: {}'.format(feature, mse))
        chosen_features.add(feature)
        remaining.discard(feature)
    # A set has no defined order; sort so callers get stable column indices.
    return sorted(feature_indices[feature] for feature in chosen_features)

In [12]:
# Run the greedy forward selection on the training split, scoring candidate
# feature sets with a default random forest.
selected_feature_index = get_gfs_feature_indices(
    X=X_train,
    y=y_train,
    features=train_feature_cols,
    clf=RandomForestRegressor(),
)

Newly Added Feature: category_name,	 MSE Score: 0.47630765199329694
Newly Added Feature: brand_name,	 MSE Score: 0.38958883496698876
Newly Added Feature: item_condition_id,	 MSE Score: 0.37116859046317796
Newly Added Feature: shipping,	 MSE Score: 0.3585427484499377


In [20]:
# Grid-search tree depth and minimum leaf size of a random forest with
# 3-fold cross-validation, scored by negative mean squared error, using only
# the selected feature columns of the training split.
rf = RandomForestRegressor()
params = {
    'max_depth': [5, 10, 20],
    'min_samples_leaf': [100, 200],
}
gscv = GridSearchCV(
    rf,
    param_grid=params,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
gscv.fit(X_train[:, selected_feature_index], y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [5, 10, 15, 20], 'min_samples_leaf': [100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [25]:
# Display the parameter combination picked by the grid search.
gscv.best_params_

{'max_depth': 20, 'min_samples_leaf': 100}

In [26]:
# Take the test-time feature columns from the indices that were actually
# selected, in the same order as the columns the model was fitted on
# (`X_train[:, selected_feature_index]`). A hand-written list can silently
# disagree with that order, or go stale when the selection changes.
test_feature_cols = [train_feature_cols[idx] for idx in selected_feature_index]

In [27]:
# Feature matrix for the test frame (note: this rebinds the name `X`).
X = np.array(test.loc[:, test_feature_cols])

In [28]:
# The model was fitted on log-transformed prices, so exponentiate its
# predictions to return to the price scale. The exponential is computed once
# and kept; no intermediate result is thrown away.
preds = pd.Series(np.exp(gscv.predict(X)))

In [29]:
# Two-column submission frame: test ids next to the predicted prices,
# written out as CSV without the index.
submit = pd.concat(
    [test['test_id'], preds],
    axis=1,
    keys=['test_id', 'price'],
)
submit.to_csv('submit_rf_base1.csv', index=False)

In [31]:
# Display the (rows, columns) shape of the test frame.
test.shape

(693359, 7)