# Naive Learning

We first explore the basic algorithms: Linear Regression and Random Forest Regressors.

In [1]:
%load_ext autoreload

In [2]:
# Execute the companion notebook inside this kernel's namespace. The helpers
# and library names used below (load_data, rmse, np, pd, plt, ...) are not
# defined anywhere in this notebook, so presumably they all come from here.
%run modules.ipynb


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.


#### Feature enrichment

In [3]:
# Directory passed to load_data() below.
DATA_PATH = './data'
# Directory passed as `helper_data_path` to every enrichment step below.
HELPER_DATA_PATH = './helper_data'

In [4]:
# Load the raw frames, then pass them through each enrichment step in order.
# Every step takes (train, test) plus the helper-data directory and returns
# the updated (train, test) pair.
# NOTE(review): the output saved with this notebook shows the run stopping in
# the text-feature step with "AttributeError: 'float' object has no attribute
# 'split'" -- confirm the helper handles missing text before relying on it.
print('loading data...')
train, test = load_data(DATA_PATH)

enrichment_steps = (
    basic_enrichment,
    load_image_features,
    load_text_features,
    add_aggregated_features,
    numeric_features_cleaning,
    complete_image_top_1,
    complete_price,
)
for enrichment_step in enrichment_steps:
    train, test = enrichment_step(train, test, helper_data_path=HELPER_DATA_PATH)

loading data...
Adding basic features...
Done adding basic features.
Adding image features...




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Done loading image features.
Loading text features...
loading tfidf features...


AttributeError: 'float' object has no attribute 'split'

In [None]:
# Show every column available after enrichment.
train.columns.tolist()

In [None]:
# Columns that are label-encoded and handed to CatBoost as categorical features.
categorical_cols = [
    'user_type',
    'region', 'city',
    'parent_category_name', 'category_name', 'param_1', 'param_2', 'param_3',
    'image_top_1_class', 'image_top_1_rounded_regression',
    'month', 'day', 'weekday',
    'has_price', 'has_description', 'has_params', 'has_image',
]

# Columns used as plain numeric inputs.
numerical_cols = [
    'image_top_1_regression',
    'log_price_regression',
    'avg_days_up_user', 'avg_times_up_user', 'n_user_items', 'user_ads_count',
    'log_item_seq_number',
    'img_size', 'img_luminance', 'img_colorfulness', 'img_confidence',
    'log_img_sharpness', 'log_img_keypoints',
    'title_word_count', 'description_word_count', 'merged_params_word_count',
    'description_non_regular_chars_ratio',
    'title_capital_letters_ratio', 'description_capital_letters_ratio',
    'title_non_regular_chars_ratio',
    'title_adj_to_len_ratio', 'title_noun_to_len_ratio',
    'title_sentiment',
    'title_svd_1_ngram', 'title_svd_2_ngram', 'title_svd_3_ngram',
    'title_svd_4_ngram', 'title_svd_5_ngram',
    'description_svd_1_ngram', 'description_svd_2_ngram', 'description_svd_3_ngram',
    'description_svd_4_ngram', 'description_svd_5_ngram',
]

# Model input columns: categorical columns first, then numeric ones.
feature_list = categorical_cols + numerical_cols

def categorical_indices(df, categorical_cols):
    """Return the positions of `df`'s columns that appear in `categorical_cols`.

    Positions are zero-based and listed in the order of `df.columns`.
    """
    positions = []
    for position, column_name in enumerate(df.columns):
        if column_name in categorical_cols:
            positions.append(position)
    return positions

#### Encoding Labels

In [None]:
import gc

gc.collect()

# Replace every categorical column, in both frames, with integer codes from a
# LabelEncoder fitted on the string form of the train and test values together,
# so both frames share one code table per column.
for col in categorical_cols:
    print("Encoding " + str(col) + "...")
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

    train_as_str = list(train[col].values.astype('str'))
    test_as_str = list(test[col].values.astype('str'))

    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_as_str + test_as_str)
    train[col] = encoder.transform(train_as_str)
    test[col] = encoder.transform(test_as_str)

#### Splitting data

In [None]:
# Feature matrices: selected columns with missing values replaced by 0.
X_df = train[feature_list].fillna(0)
X_test_df = test[feature_list].fillna(0)

# Target vector.
y_df = train['deal_probability'].values
# NOTE(review): despite its name this is read from `train`, not `test`, and no
# other cell in this notebook uses it -- confirm whether it can be dropped.
y_test_df = train['deal_probability'].values

# Hold out 20% of the training rows for validation (fixed seed).
X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=42,
)

## Naive random prediction

In [None]:
# Maps a model name to a dict of its validation metrics (currently only 'rmse');
# filled in by the model cells below and plotted in the Results section.
results = {}

In [None]:
# Baseline: predictions drawn uniformly at random from [0, 1), the same range
# the CatBoost predictions are clipped to further down in this notebook.
# The previous `np.random.randint(1, size=...)` samples integers from [0, 1),
# i.e. it always returned 0, so the "random" baseline was a constant predictor.
# A dedicated seeded generator keeps the reported score reproducible.
rng = np.random.RandomState(42)
y_pred = rng.uniform(0.0, 1.0, size=y_val_df.shape[0])
res = rmse(y_pred, y_val_df)
print(res)
results['naive'] = { 'rmse': res }

## Linear Regression

In [None]:
# Fit an ordinary linear model on the training split and score it on the
# validation split. `fit` returns the estimator itself, so it can be chained.
l = LinearRegression().fit(X_train_df, y_train_df)
y_pred = l.predict(X_val_df)
res = rmse(y_pred, y_val_df)
results['linear_regression'] = {'rmse': res}
print(res)

## Random Forest Regressor

In [None]:
# Small random forest (25 trees, depth <= 10) trained on all cores.
# `random_state` is fixed so that the bootstrap samples and feature subsets --
# and therefore the reported RMSE -- are the same on every run.
r = RandomForestRegressor(n_estimators=25, max_depth=10, n_jobs=-1, random_state=42)
r.fit(X_train_df, y_train_df)
y_pred = r.predict(X_val_df)
res = rmse(y_pred, y_val_df)
print(res)
results['random_forest_regressor'] = { 'rmse': res }

## CatBoost

In [None]:
# Hyper-parameters for the CatBoost regressor, gathered in one place.
cb_params = dict(
    iterations=100,
    learning_rate=0.05,
    depth=12,
    eval_metric='RMSE',
    random_seed=23,
    bagging_temperature=0.2,
    od_type='Iter',
    metric_period=50,
    od_wait=100,
)
cb_model = CatBoostRegressor(**cb_params)

# CatBoost is told which columns are categorical by position, not by name.
cat_feature_indices = categorical_indices(X_val_df, categorical_cols)

# Train on the training split, evaluating on the validation split and keeping
# the best iteration seen there.
cb_model.fit(
    X_train_df,
    y_train_df,
    eval_set=(X_val_df, y_val_df),
    cat_features=cat_feature_indices,
    use_best_model=True,
    verbose=True,
)

In [None]:
def test_catboost(cb_model, X_test, y_test):
    """Score `cb_model` on (X_test, y_test); print and return the RMSE.

    Predictions are clipped to [0, 1] before the error is computed.
    """
    clipped_pred = np.clip(cb_model.predict(X_test), 0, 1)
    score = rmse(clipped_pred, y_test)
    print(score)
    return score


catboost_rmse = test_catboost(cb_model, X_val_df, y_val_df)
results['catboost'] = {'rmse': catboost_rmse}

### Get Submission Result

In [None]:
# Predict on the test features and clip the predictions to [0, 1].
y_pred = np.clip(cb_model.predict(X_test_df), 0, 1)

In [None]:
# Submission frame: the test item ids next to their predicted probability.
res_df = test[['item_id']].assign(deal_probability=y_pred)

In [None]:
# Write the gzip-compressed submission without the index column.
# The timestamp is formatted explicitly: str(datetime.now()) contains spaces
# and colons, which are invalid in Windows filenames and awkward in shells.
submission_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
res_df.to_csv('./submissions/submit_res_catboost_%s.csv.gz' % submission_stamp, index=False, compression='gzip')

## Results

We can see that the Random Forest regressor does the best job among the basic models, but as we will see later, LGBM, CatBoost and the neural network do a much better job.

In [None]:
# Gather the model names and their validation RMSE in matching order.
# The scores deliberately do NOT go into a variable called `rmse`: that name
# is the metric function used by every evaluation cell above, and rebinding it
# to a list makes those cells fail when they are re-run.
algs = list(results)
rmse_scores = [results[alg]['rmse'] for alg in algs]

# One bar per model.
plt.bar(algs, rmse_scores)
plt.ylabel('RMSE');

### Conclusion

CatBoost is much slower than LGBM and yields inferior results, so we will not continue trials with it.