In [3]:
import os
import csv
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import seaborn as sns

In [4]:
train_frame = pd.read_csv('train.csv')
test_frame = pd.read_csv('test.csv')

In [5]:
test_item_id = test_frame.item_id.values

In [6]:
cat_columns = ['region', 'city', 'parent_category_name', 'category_name', 'param_1', 'user_type']
for col in cat_columns:
    lbl = LabelEncoder()
    lbl.fit(list(train_frame[col].values.astype('str')) + list(test_frame[col].values.astype('str')))
    train_frame[col] = lbl.transform(list(train_frame[col].values.astype('str')))
    test_frame[col] = lbl.transform(list(test_frame[col].values.astype('str')))


In [7]:
train_x = train_frame[['price', 'image_top_1']].values
test_x = test_frame[['price', 'image_top_1']].values
train_arr = [train_x]
test_arr = [test_x]
for col in cat_columns:
    lbl = OneHotEncoder()
    lbl_set = list(train_frame[col].values) + list(test_frame[col].values)
    lbl_set = np.reshape(np.array(lbl_set), (-1, 1))
    lbl.fit(lbl_set)
    lbl_set_train = np.reshape(np.array(list(train_frame[col].values)), (-1, 1))
    train_col_lbl = lbl.transform(lbl_set_train)
    train_arr.append(train_col_lbl.todense())
    lbl_set_test = np.reshape(np.array(list(test_frame[col].values)), (-1, 1))
    test_col_lbl = lbl.transform(lbl_set_test)
    test_arr.append(test_col_lbl.todense())



In [8]:
train_x = np.hstack(train_arr)


In [9]:
test_x = np.hstack(test_arr)

In [10]:
print(train_x.shape)
print(test_x.shape)

(1503424, 2213)
(508438, 2213)


In [11]:
y = train_frame.deal_probability.values

In [12]:
del train_arr, test_arr, train_frame, test_frame

In [13]:
gc.collect()

171

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(train_x, y, test_size=0.2, random_state=42)


In [15]:
print(y_train)

[0.76786 0.      0.12311 ... 0.2     0.80323 0.     ]


In [16]:
lgtrain = lgb.Dataset(np.asarray(X_train), label=y_train)
lgval = lgb.Dataset(np.asarray(X_valid), label=y_valid)

In [17]:
params = {
        "objective" : "regression",
        "metric" : "rmse",
        'num_leaves': 250,
        'feature_fraction': 0.65,
        'bagging_fraction': 0.85,
        'learning_rate': 0.02,
    }
    
evals_result = {}
model = lgb.train(params, lgtrain, 4000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20)

Training until validation scores don't improve for 100 rounds.
[20]	valid_0's rmse: 0.245847
[40]	valid_0's rmse: 0.238456
[60]	valid_0's rmse: 0.234506
[80]	valid_0's rmse: 0.232292
[100]	valid_0's rmse: 0.23093
[120]	valid_0's rmse: 0.230109
[140]	valid_0's rmse: 0.229556
[160]	valid_0's rmse: 0.229165
[180]	valid_0's rmse: 0.228851
[200]	valid_0's rmse: 0.228602
[220]	valid_0's rmse: 0.2284
[240]	valid_0's rmse: 0.228225
[260]	valid_0's rmse: 0.228063
[280]	valid_0's rmse: 0.227937
[300]	valid_0's rmse: 0.227816
[320]	valid_0's rmse: 0.227712
[340]	valid_0's rmse: 0.227629
[360]	valid_0's rmse: 0.227553
[380]	valid_0's rmse: 0.22749
[400]	valid_0's rmse: 0.227442
[420]	valid_0's rmse: 0.227398
[440]	valid_0's rmse: 0.227355
[460]	valid_0's rmse: 0.227314
[480]	valid_0's rmse: 0.227285
[500]	valid_0's rmse: 0.227256
[520]	valid_0's rmse: 0.227223
[540]	valid_0's rmse: 0.227193
[560]	valid_0's rmse: 0.227167
[580]	valid_0's rmse: 0.227138
[600]	valid_0's rmse: 0.227111
[620]	valid_0's

In [None]:
predictions.shape

In [40]:
predictions[predictions > 0.2].shape

NameError: name 'predictions' is not defined

In [22]:
predictions = model.predict(np.asarray(test_x))

In [20]:
predictions[predictions > 0.2] = 1
predictions[predictions <= 0.2] = 0
predictions = predictions.astype(int)

In [25]:
sub_df = pd.DataFrame({"item_id":test_item_id})
sub_df["deal_probability"] = predictions.clip(0,1)
sub_df.to_csv("/tmp/baseline_avito.csv", index=False)

In [None]:
sns.distplot(y[y>0.001])

In [None]:
sns.distplot(y[y< 0.05])

In [None]:
sns.distplot(y[y > 0.95])

In [None]:
sns.distplot(y)