# Hyper Parameters Tuning

*Based on:* https://www.kaggle.com/garethjns/microsoft-lightgbm-with-parameter-tuning-0-823

In [1]:
%load_ext autoreload

In [2]:
%run ../modules.ipynb


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.


In [3]:
init_data_paths("ilai")

## Feature enrichment

In [None]:
train[['item_seq_number', 'item_seq_number_count']].head()

In [4]:
print('loading data...')
train, test = load_data(DATA_PATH)
train, test = basic_enrichment(train, test, helper_data_path=HELPER_DATA_PATH)
train, test = load_image_features(train, test, helper_data_path=HELPER_DATA_PATH)
train, test = load_text_features(train, test, helper_data_path=HELPER_DATA_PATH)
train, test = add_aggregated_features(train, test, helper_data_path=HELPER_DATA_PATH)
train, test = numeric_features_cleaning(train, test, helper_data_path=HELPER_DATA_PATH)
train, test = complete_image_top_1(train, test, helper_data_path=HELPER_DATA_PATH)
train, test = complete_price(train, test, helper_data_path=HELPER_DATA_PATH)

loading data...
Adding basic features...
Done adding basic features.
Adding image features...




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Done loading image features.
Loading text features...
loading tfidf features...
Done loading text features.
Loading aggregated features...
Done loading aggregated features.
Loading aggregated features...
Done loading aggregated features.
Cleaning and completing numeric features...




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Done cleaning numeric features.
Completing image_top_1 features...
Done loading image_top_1 completions.
Completing price...
Done loading log_price_regression.


In [5]:
categorical_cols = ['user_type', \
                'region', 'city', \
                'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3', \
                'image_top_1_class', 'image_top_1_rounded_regression', \
                'month', 'day', 'weekday', \
                'has_price', 'has_description', 'has_params', 'has_image',
                   ]
numerical_cols = ['image_top_1_regression', \
                     'log_price_regression', \
                     'avg_days_up_user', 'avg_times_up_user', 'n_user_items', 'user_ads_count', \
                     'log_item_seq_number', 'item_seq_number_count', \
                     'img_size', 'img_luminance', 'img_colorfulness', 'img_confidence', 'log_img_sharpness', 'log_img_keypoints', \
                     'title_word_count', 'description_word_count', 'merged_params_word_count', \
                     'description_non_regular_chars_ratio', 'title_capital_letters_ratio','description_capital_letters_ratio', \
                     'title_non_regular_chars_ratio', 'title_adj_to_len_ratio', 'title_noun_to_len_ratio',\
                     'title_sentiment',
                     'title_svd_1_ngram', 'title_svd_2_ngram', 'title_svd_3_ngram', 'title_svd_4_ngram', 'title_svd_5_ngram',
                     'description_svd_1_ngram', 'description_svd_2_ngram', 'description_svd_3_ngram', 'description_svd_4_ngram', 'description_svd_5_ngram',
                 ]

feature_list = categorical_cols + numerical_cols

def categorical_indices(df, categorical_cols):
    return [i for i, col in enumerate(df.columns) if col in categorical_cols]

#### Encoding Labels

In [6]:
import gc
gc.collect()
for col in categorical_cols:
    print ("Encoding "+ str(col) + "...")
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
    train[col] = lbl.transform(list(train[col].values.astype('str')))
    test[col] = lbl.transform(list(test[col].values.astype('str')))

Encoding user_type...
Encoding region...
Encoding city...
Encoding parent_category_name...
Encoding category_name...
Encoding param_1...
Encoding param_2...
Encoding param_3...
Encoding image_top_1_class...
Encoding image_top_1_rounded_regression...
Encoding month...
Encoding day...
Encoding weekday...
Encoding has_price...
Encoding has_description...
Encoding has_params...
Encoding has_image...


#### Splitting data

In [7]:
X_df = train[feature_list].fillna(0)
y_df = train['deal_probability'].values

In [8]:
X_test_df = test[feature_list].fillna(0)
X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

### Hyper Parameters Tuning

In [None]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'verbose': 1,
    'random_state': [501],
}

# Create parameters to search
grid_params_1 = {
    'learning_rate': [0.005, 0.03, 0.05],
    'n_estimators': [8,16,24],
    'num_leaves': [16,32],
    'objective' : ['regression', 'poisson'],
    'max_depth': [10, 15, 20],
    'feature_fraction': [0.4, 0.6, 0.8],
}

grid_params_2 = {
    'learning_rate': [0.08, 0.1, 0.5, 0.7],
    'n_estimators': [32, 38],
    'num_leaves': [32, 38],
    'objective' : ['regression'],
    'max_depth': [15],
    'feature_fraction': [0.6],
}

grid_params_2 = {
    'learning_rate': [0.3],
    'n_estimators': [38, 45],
    'num_leaves': [38],
    'objective' : ['regression'],
    'max_depth': [15],
    'feature_fraction': [0.6],
}

def hyperparam_tuner(grid_params, X_train, y_train):
    # Create classifier to use. Note that parameters have to be input manually, not as a dict!
    mdl = lgb.LGBMRegressor(
        n_jobs = 1, # Updated from 'nthread' 
        silent = True,
        **params
    )

    # To view the default model params:
    mdl.get_params().keys()

    # Create the grid
    grid = GridSearchCV(mdl, grid_params, verbose=1, cv=4, n_jobs=5)
    # Run the grid
    grid.fit(X_train, y_train)

    # Print the best parameters found
    print(grid.best_params_)
    print(grid.best_score_)
          
    return grid.best_params_

best_params = hyperparam_tuner(grid_params, X_train_df, y_train_df)

In [42]:
tuned_params = params.copy()
tuned_params.update(best_params)

Final Results

In [43]:
tuned_params

{'task': 'train',
 'boosting_type': 'gbdt',
 'metric': 'rmse',
 'verbose': 1,
 'random_state': [501],
 'feature_fraction': 0.6,
 'learning_rate': 0.5,
 'max_depth': 15,
 'n_estimators': 38,
 'num_leaves': 38,
 'objective': 'regression'}