In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Read the training and validation splits from the sibling data directory.
train, val = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/val.csv')
)

In [7]:
# Baseline feature set: three item-level columns only.
cols = [
    'item_price_level',
    'item_sales_level',
    'item_collected_level',
]

# Feature matrix and binary target ('is_trade') for each split.
X_train = train.loc[:, cols]
y_train = train['is_trade']
X_val = val.loc[:, cols]
y_val = val['is_trade']

In [9]:
# Random-forest baseline: 100 trees, out-of-bag scoring enabled, fixed seed
# so the fit is repeatable.
rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=42,
)
# Kept as the last expression so the fitted estimator's repr is displayed.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [22]:
from sklearn.metrics import roc_auc_score, log_loss
# Column 1 of predict_proba is taken as the positive-class score.
y_pred = rf.predict_proba(X_val)[:, 1]

# Report, one per line: validation ROC AUC, validation log loss, and the
# forest's per-feature importances (same order as `cols`).
print(
    roc_auc_score(y_val, y_pred),
    log_loss(y_val, y_pred),
    rf.feature_importances_,
    sep='\n',
)

0.6344680621335345
0.09001688105108706
[0.38821744 0.3482432  0.26353936]


In [30]:
import lightgbm as lgb
# Columns that are not fed to the model, including the target itself.
to_remove = [
    'instance_id',
    'item_id',
    'item_category_list',
    'item_property_list',
    'user_id',
    'context_id',
    'context_timestamp',
    'predict_category_property',
    'shop_id',
    'is_trade',
]

# Keep every remaining training column, preserving the frame's column order.
cols = list(filter(lambda name: name not in to_remove, train.columns))
print(cols)

['item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'context_page_id', 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']


In [56]:
# Rebuild the feature matrices and targets with the wider column list.
X_train, y_train = train.loc[:, cols], train['is_trade']
X_val, y_val = val.loc[:, cols], val['is_trade']

# Wrap each split in a LightGBM Dataset (features + label).
train_dst = lgb.Dataset(data=X_train, label=y_train)
val_dst = lgb.Dataset(data=X_val, label=y_val)

# Sanity check: row/column counts of both feature matrices, one per line.
print(X_train.shape, X_val.shape, sep='\n')

(420693, 17)
(57418, 17)


In [57]:
# Split the feature columns by naming convention: '*level*' columns are
# treated as ordered categoricals, '*id*' columns as unordered categoricals.
# Columns matching neither pattern are left with their original dtype.
ordered_cat = [col for col in cols if 'level' in col]
unordered_cat = [col for col in cols if 'id' in col]
print(ordered_cat)

# NOTE(review): X_train / X_val are modified in place here, and the lgb.Dataset
# objects were created from them in an earlier cell -- confirm the Datasets pick
# up the categorical dtypes (or rebuild them after this cell).
for col in X_train:
    if col in ordered_cat:
        # `astype('category', ordered=True)` relies on a keyword that pandas
        # deprecated and later removed; an explicit CategoricalDtype is the
        # supported way to request an ordered categorical. Categories are
        # inferred from the training values.
        X_train[col] = X_train[col].astype(
            pd.api.types.CategoricalDtype(ordered=True))
    elif col in unordered_cat:
        X_train[col] = X_train[col].astype('category')
    else:
        continue
    # Reuse the dtype learned on the training split so both frames share the
    # same categories (and therefore the same integer codes). Validation values
    # never seen in training become NaN instead of getting their own code.
    X_val[col] = X_val[col].astype(X_train[col].dtype)

# Show the resulting dtype of every feature column.
for col in X_train:
    print(col, X_train[col].dtype)

['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_age_level', 'user_star_level', 'shop_review_num_level', 'shop_star_level']
item_brand_id category
item_city_id category
item_price_level category
item_sales_level category
item_collected_level category
item_pv_level category
user_gender_id category
user_age_level category
user_occupation_id category
user_star_level category
context_page_id category
shop_review_num_level category
shop_review_positive_rate float64
shop_star_level category
shop_score_service float64
shop_score_delivery float64
shop_score_description float64


In [68]:
# LightGBM hyper-parameters for a binary-classification GBDT.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (rather than a set) keeps the metric order stable between runs;
    # 'binary' is reported as 'binary_logloss' in the training log.
    'metric': ['auc', 'binary'],
    'num_leaves': 31,
    'learning_rate': 0.02,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train for at most 500 rounds, evaluating on both the validation and training
# sets. Early stopping (patience of 10 rounds) is requested through the
# callback API: the `early_stopping_rounds=` keyword was removed from
# `lgb.train` in LightGBM 4.0, while the callback is accepted by old and new
# releases alike.
gbm = lgb.train(params,
                train_dst,
                num_boost_round=500,
                valid_sets=[val_dst, train_dst],
                callbacks=[lgb.early_stopping(10)])

[1]	training's auc: 0.606001	training's binary_logloss: 0.674825	valid_0's auc: 0.576595	valid_0's binary_logloss: 0.674723
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.606001	training's binary_logloss: 0.657221	valid_0's auc: 0.576595	valid_0's binary_logloss: 0.657021
[3]	training's auc: 0.610872	training's binary_logloss: 0.640295	valid_0's auc: 0.583148	valid_0's binary_logloss: 0.639989
[4]	training's auc: 0.610682	training's binary_logloss: 0.624011	valid_0's auc: 0.582856	valid_0's binary_logloss: 0.623602
[5]	training's auc: 0.616523	training's binary_logloss: 0.608333	valid_0's auc: 0.587551	valid_0's binary_logloss: 0.607835
[6]	training's auc: 0.620612	training's binary_logloss: 0.593229	valid_0's auc: 0.589766	valid_0's binary_logloss: 0.592628
[7]	training's auc: 0.621719	training's binary_logloss: 0.578671	valid_0's auc: 0.590657	valid_0's binary_logloss: 0.577979
[8]	training's auc: 0.621756	training's binary_logloss: 0.56463	valid_