# https://www.kaggle.com/c/mercari-price-suggestion-challenge 
# https://www.kaggle.com/c/home-credit-default-risk

Выбрать одно из двух соревнований. Выбирайте по данным, с которыми вам интереснее работать.

In [1]:
import time
import re
# from __future__ import print_function
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import pandas_profiling

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, MinMaxScaler,  Imputer, LabelBinarizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Ансамбли

import xgboost as xgb
import lightgbm as lgb

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [2]:
df = pd.read_csv('train.tsv', sep='\t')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
train_id             1482535 non-null int64
name                 1482535 non-null object
item_condition_id    1482535 non-null int64
category_name        1476208 non-null object
brand_name           849853 non-null object
price                1482535 non-null float64
shipping             1482535 non-null int64
item_description     1482531 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


train_id or test_id - the id of the listing

name - the title of the listing. Note that we have cleaned the data to remove text that look like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

item_condition_id - the condition of the items provided by the seller

category_name - category of the listing

brand_name

price - the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict.

shipping - 1 if shipping fee is paid by seller and 0 by buyer

item_description - the full description of the item. Note that we have cleaned the data to remove text that look like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]


### Используйте параметр nrows, чтобы уменьшить выборку и сделать базовый разведочный анализ данных

In [4]:
df_p = pd.read_csv('train.tsv', nrows=50000, sep='\t')
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
train_id             50000 non-null int64
name                 50000 non-null object
item_condition_id    50000 non-null int64
category_name        49763 non-null object
brand_name           28416 non-null object
price                50000 non-null float64
shipping             50000 non-null int64
item_description     50000 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 3.1+ MB


In [5]:
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [6]:
!head sample_submission.csv

test_id,price
0,26.738
1,26.738
2,26.738
3,26.738
4,26.738
5,26.738
6,26.738
7,26.738
8,26.738


In [7]:
df_test = pd.read_csv('test.tsv', sep='\t')

In [8]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693359 entries, 0 to 693358
Data columns (total 7 columns):
test_id              693359 non-null int64
name                 693359 non-null object
item_condition_id    693359 non-null int64
category_name        690301 non-null object
brand_name           397834 non-null object
shipping             693359 non-null int64
item_description     693359 non-null object
dtypes: int64(3), object(4)
memory usage: 37.0+ MB


In [9]:
df_test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


# Задание
1. Сделать baseline submission
  * Исследовать признак price. 
  * Исследовать признак price в зависимости от brand_name или других признаков
2. Реализовать цикл анализа
  * признаки -> модель -> настройка параметров -> лучшая модель и ее значение метрики качества на кросс-валидации
3. Провести простые преобразования признаков и добавление простых признаков
  * разделить category_name на уровни
  * посмотреть на наличие числовых значений в описании и имени
  * ...
4. Составить план по применению нескольких моделей на разных признаках
  * спроектируйте эксперимент. Нужно заранее спланировать порядок перебора признаков и моделей. Потом только писать код. Обратный порядок вызывает необходимость переписывать существующий код, это трата времени
5. Просмотреть 1-5 kernel на kaggle. (только в таком порядке. сначала работаем самостоятельно, потом смотрим идеи других. при появлении опыта можно сразу начинать с них)
6. Скорректировать план
  * убрать пункты, которые кажутся неудачными
  * добавить идеи из kernel, кажущиеся удачными
7. Построить модель, выбрать лучшую
8. Построить ансамбль, настрить парамертры. Сравнить с другими моделями.
9. Применить и засабмитить лучшую на cv модель
10. Прислать блокнот и свой ник в лидерборде

# Feature Engineering

In [10]:
pandas_profiling.ProfileReport(df_p)

0,1
Number of variables,8
Number of observations,50000
Total Missing (%),5.5%
Total size in memory,3.1 MiB
Average record size in memory,64.0 B

0,1
Numeric,3
Categorical,4
Boolean,1
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,1539
Unique (%),3.1%
Missing (%),43.2%
Missing (n),21584

0,1
PINK,1876
Nike,1782
Victoria's Secret,1554
Other values (1535),23204
(Missing),21584

Value,Count,Frequency (%),Unnamed: 3
PINK,1876,3.8%,
Nike,1782,3.6%,
Victoria's Secret,1554,3.1%,
LuLaRoe,1056,2.1%,
Apple,591,1.2%,
FOREVER 21,533,1.1%,
Nintendo,499,1.0%,
Lululemon,488,1.0%,
Michael Kors,448,0.9%,
American Eagle,441,0.9%,

0,1
Distinct count,882
Unique (%),1.8%
Missing (%),0.5%
Missing (n),237

0,1
"Women/Athletic Apparel/Pants, Tights, Leggings",2018
Women/Tops & Blouses/T-Shirts,1541
Beauty/Makeup/Face,1161
Other values (878),45043

Value,Count,Frequency (%),Unnamed: 3
"Women/Athletic Apparel/Pants, Tights, Leggings",2018,4.0%,
Women/Tops & Blouses/T-Shirts,1541,3.1%,
Beauty/Makeup/Face,1161,2.3%,
Beauty/Makeup/Lips,1039,2.1%,
"Electronics/Cell Phones & Accessories/Cases, Covers & Skins",888,1.8%,
Beauty/Makeup/Eyes,881,1.8%,
Electronics/Video Games & Consoles/Games,844,1.7%,
Women/Underwear/Bras,728,1.5%,
"Women/Tops & Blouses/Tank, Cami",713,1.4%,
Women/Athletic Apparel/Shorts,688,1.4%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.9087
Minimum,1
Maximum,5
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,2
Q3,3
95-th percentile,3
Maximum,5
Range,4
Interquartile range,2

0,1
Standard deviation,0.90471
Coef of variation,0.474
Kurtosis,-1.0595
Mean,1.9087
MAD,0.7841
Skewness,0.41993
Sum,95433
Variance,0.81849
Memory size,390.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1,21573,43.1%,
3,14532,29.1%,
2,12706,25.4%,
4,1093,2.2%,
5,96,0.2%,

Value,Count,Frequency (%),Unnamed: 3
1,21573,43.1%,
2,12706,25.4%,
3,14532,29.1%,
4,1093,2.2%,
5,96,0.2%,

Value,Count,Frequency (%),Unnamed: 3
1,21573,43.1%,
2,12706,25.4%,
3,14532,29.1%,
4,1093,2.2%,
5,96,0.2%,

0,1
Distinct count,45713
Unique (%),91.4%
Missing (%),0.0%
Missing (n),0

0,1
No description yet,2850
New,130
Brand new,96
Other values (45710),46924

Value,Count,Frequency (%),Unnamed: 3
No description yet,2850,5.7%,
New,130,0.3%,
Brand new,96,0.2%,
Great condition,39,0.1%,
Good condition,38,0.1%,
Never worn,36,0.1%,
Like new,29,0.1%,
NWT,23,0.0%,
Brand New,21,0.0%,
New with tags,20,0.0%,

0,1
Distinct count,47896
Unique (%),95.8%
Missing (%),0.0%
Missing (n),0

0,1
Bundle,69
Dress,17
Lularoe OS leggings,15
Other values (47893),49899

Value,Count,Frequency (%),Unnamed: 3
Bundle,69,0.1%,
Dress,17,0.0%,
Lularoe OS leggings,15,0.0%,
Reserved,14,0.0%,
Converse,14,0.0%,
Coach purse,13,0.0%,
Lularoe Randy,13,0.0%,
Romper,13,0.0%,
Reserved bundle,12,0.0%,
Nike,12,0.0%,

0,1
Distinct count,337
Unique (%),0.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,26.651
Minimum,0
Maximum,1506
Zeros (%),0.1%

0,1
Minimum,0
5-th percentile,6
Q1,10
Median,17
Q3,29
95-th percentile,75
Maximum,1506
Range,1506
Interquartile range,19

0,1
Standard deviation,38.208
Coef of variation,1.4336
Kurtosis,179.2
Mean,26.651
MAD,18.362
Skewness,9.5503
Sum,1332600
Variance,1459.9
Memory size,390.7 KiB

Value,Count,Frequency (%),Unnamed: 3
10.0,3434,6.9%,
12.0,2738,5.5%,
14.0,2580,5.2%,
9.0,2199,4.4%,
16.0,2189,4.4%,
8.0,2091,4.2%,
15.0,2028,4.1%,
20.0,1897,3.8%,
7.0,1722,3.4%,
18.0,1404,2.8%,

Value,Count,Frequency (%),Unnamed: 3
0.0,40,0.1%,
3.0,662,1.3%,
4.0,567,1.1%,
5.0,1007,2.0%,
6.0,1069,2.1%,

Value,Count,Frequency (%),Unnamed: 3
1009.0,1,0.0%,
1100.0,1,0.0%,
1109.0,1,0.0%,
1206.0,1,0.0%,
1506.0,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.44834

0,1
0,27583
1,22417

Value,Count,Frequency (%),Unnamed: 3
0,27583,55.2%,
1,22417,44.8%,

0,1
Distinct count,50000
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,25000
Minimum,0
Maximum,49999
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,2500
Q1,12500
Median,25000
Q3,37499
95-th percentile,47499
Maximum,49999
Range,49999
Interquartile range,25000

0,1
Standard deviation,14434
Coef of variation,0.57737
Kurtosis,-1.2
Mean,25000
MAD,12500
Skewness,0
Sum,1249975000
Variance,208340000
Memory size,390.7 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
36155,1,0.0%,
40281,1,0.0%,
38232,1,0.0%,
11599,1,0.0%,
9550,1,0.0%,
15693,1,0.0%,
13644,1,0.0%,
3403,1,0.0%,
1354,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
49995,1,0.0%,
49996,1,0.0%,
49997,1,0.0%,
49998,1,0.0%,
49999,1,0.0%,

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


item_description и name практически уникальные, поэтому для начала нет смысла их учитывать для анализа

In [11]:
df_train = df_p.copy()

In [12]:
df_train.drop(columns=["item_description","name","train_id"],inplace=True)

In [13]:
df_train["brand_name"].fillna("unknown",inplace=True)
df_train["category_name"].fillna("unknown",inplace=True)
df_train.drop_duplicates(inplace=True)

In [14]:
df_train.head()

Unnamed: 0,item_condition_id,category_name,brand_name,price,shipping
0,3,Men/Tops/T-shirts,unknown,10.0,1
1,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0
2,1,Women/Tops & Blouses/Blouse,Target,10.0,1
3,1,Home/Home Décor/Home Décor Accents,unknown,35.0,1
4,1,Women/Jewelry/Necklaces,unknown,44.0,0


In [15]:
pandas_profiling.ProfileReport(df_train)

0,1
Number of variables,6
Number of observations,37270
Total Missing (%),0.0%
Total size in memory,1.7 MiB
Average record size in memory,48.0 B

0,1
Numeric,3
Categorical,2
Boolean,1
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,1539
Unique (%),4.1%
Missing (%),0.0%
Missing (n),0

0,1
unknown,13191
Nike,1350
PINK,1268
Other values (1536),21461

Value,Count,Frequency (%),Unnamed: 3
unknown,13191,35.4%,
Nike,1350,3.6%,
PINK,1268,3.4%,
Victoria's Secret,1066,2.9%,
LuLaRoe,737,2.0%,
FOREVER 21,459,1.2%,
Apple,438,1.2%,
Michael Kors,405,1.1%,
Lululemon,393,1.1%,
American Eagle,355,1.0%,

0,1
Distinct count,882
Unique (%),2.4%
Missing (%),0.0%
Missing (n),0

0,1
"Women/Athletic Apparel/Pants, Tights, Leggings",1105
Women/Tops & Blouses/T-Shirts,925
Beauty/Makeup/Face,816
Other values (879),34424

Value,Count,Frequency (%),Unnamed: 3
"Women/Athletic Apparel/Pants, Tights, Leggings",1105,3.0%,
Women/Tops & Blouses/T-Shirts,925,2.5%,
Beauty/Makeup/Face,816,2.2%,
Beauty/Makeup/Lips,602,1.6%,
"Women/Tops & Blouses/Tank, Cami",533,1.4%,
Women/Athletic Apparel/Shorts,532,1.4%,
Women/Tops & Blouses/Blouse,517,1.4%,
Electronics/Video Games & Consoles/Games,501,1.3%,
Women/Shoes/Boots,496,1.3%,
"Women/Dresses/Above Knee, Mini",495,1.3%,

0,1
Distinct count,37270
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,23174
Minimum,0
Maximum,49998
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,1916.5
Q1,10361.0
Median,22338.0
Q3,35497.0
95-th percentile,46973.0
Maximum,49998.0
Range,49998.0
Interquartile range,25136.0

0,1
Standard deviation,14499
Coef of variation,0.62563
Kurtosis,-1.1876
Mean,23174
MAD,12561
Skewness,0.1454
Sum,863708666
Variance,210210000
Memory size,291.2 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
23873,1,0.0%,
40217,1,0.0%,
38168,1,0.0%,
9486,1,0.0%,
15629,1,0.0%,
13580,1,0.0%,
1290,1,0.0%,
7433,1,0.0%,
5384,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
49992,1,0.0%,
49993,1,0.0%,
49994,1,0.0%,
49995,1,0.0%,
49998,1,0.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.9973
Minimum,1
Maximum,5
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,2
Q3,3
95-th percentile,3
Maximum,5
Range,4
Interquartile range,2

0,1
Standard deviation,0.90798
Coef of variation,0.4546
Kurtosis,-1.0206
Mean,1.9973
MAD,0.75397
Skewness,0.31069
Sum,74440
Variance,0.82443
Memory size,291.2 KiB

Value,Count,Frequency (%),Unnamed: 3
1,14088,37.8%,
3,11623,31.2%,
2,10424,28.0%,
4,1040,2.8%,
5,95,0.3%,

Value,Count,Frequency (%),Unnamed: 3
1,14088,37.8%,
2,10424,28.0%,
3,11623,31.2%,
4,1040,2.8%,
5,95,0.3%,

Value,Count,Frequency (%),Unnamed: 3
1,14088,37.8%,
2,10424,28.0%,
3,11623,31.2%,
4,1040,2.8%,
5,95,0.3%,

0,1
Distinct count,337
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,29.782
Minimum,0
Maximum,1506
Zeros (%),0.1%

0,1
Minimum,0
5-th percentile,7
Q1,12
Median,19
Q3,32
95-th percentile,85
Maximum,1506
Range,1506
Interquartile range,20

0,1
Standard deviation,42.802
Coef of variation,1.4372
Kurtosis,148.96
Mean,29.782
MAD,20.967
Skewness,8.8037
Sum,1110000
Variance,1832
Memory size,291.2 KiB

Value,Count,Frequency (%),Unnamed: 3
10.0,2184,5.9%,
12.0,1870,5.0%,
14.0,1775,4.8%,
16.0,1560,4.2%,
9.0,1484,4.0%,
15.0,1408,3.8%,
8.0,1393,3.7%,
20.0,1382,3.7%,
7.0,1145,3.1%,
18.0,1060,2.8%,

Value,Count,Frequency (%),Unnamed: 3
0.0,40,0.1%,
3.0,336,0.9%,
4.0,316,0.8%,
5.0,545,1.5%,
6.0,607,1.6%,

Value,Count,Frequency (%),Unnamed: 3
1009.0,1,0.0%,
1100.0,1,0.0%,
1109.0,1,0.0%,
1206.0,1,0.0%,
1506.0,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.41189

0,1
0,21919
1,15351

Value,Count,Frequency (%),Unnamed: 3
0,21919,58.8%,
1,15351,41.2%,

Unnamed: 0,item_condition_id,category_name,brand_name,price,shipping
0,3,Men/Tops/T-shirts,unknown,10.0,1
1,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0
2,1,Women/Tops & Blouses/Blouse,Target,10.0,1
3,1,Home/Home Décor/Home Décor Accents,unknown,35.0,1
4,1,Women/Jewelry/Necklaces,unknown,44.0,0


In [16]:
del df_p, df_train

Таким образом, остановимся на следущих признаках:

* shipping явно влияет на цену
* item_condition_id больше коррелирует с shipping, но может вносить уточнение в цену
* из опыта категория товаров явно влияет на цену
* так же из опыта, товар из одной категории, но с разным брендом оказывает влияние на цену (обувь addidas vs noname made in china)
* нам нужно предсказывать цену с какой-то погрешностью, таким образом выявление цены с точки зрения категорий не целесообразно. Цена так же имеет логарифмическую шкалу.

# Выбор моделей

Для начала предположим, что выбросов нет

В качестве метрики возьмем RMSE, чеем она ниже, тем лучше модель

In [18]:
from sklearn.pipeline import make_union, make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

class LabelEncoderPipelineFriendly(LabelEncoder):
    
    def fit(self, X, y=None):
        """this would allow us to fit the model based on the X input."""
        super(LabelEncoderPipelineFriendly, self).fit(X)
        
    def transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X).reshape(-1, 1)

def get_y(df):
    return np.log(df["price"]+1) # +1 чтобы не было log(0)

def get_nums(df):
    return df[["shipping","item_condition_id"]]

def get_cat_name(df):
    return df[["category_name"]]

def get_brand_name(df):
    return df[["brand_name"]]

vec = make_union(*[
    make_pipeline(FunctionTransformer(get_nums, validate=False)),
    make_pipeline(FunctionTransformer(get_cat_name, validate=False),SimpleImputer(strategy="constant",fill_value="unknown"),LabelEncoderPipelineFriendly()),
    make_pipeline(FunctionTransformer(get_brand_name, validate=False),SimpleImputer(strategy="constant",fill_value="unknown"),LabelEncoderPipelineFriendly()),
])

vec2 = make_union(*[
    make_pipeline(FunctionTransformer(get_nums, validate=False)),
    make_pipeline(FunctionTransformer(get_cat_name, validate=False),SimpleImputer(strategy="constant",fill_value="unknown"),OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_brand_name, validate=False),SimpleImputer(strategy="constant",fill_value="unknown"),OneHotEncoder(sparse=False)),
])

In [28]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV, cross_val_score
from scipy.stats import randint,expon
from time import time
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split

def report(search_res):
    print("Best param: {0}".format(search_res.best_params_))
    print("Best score: {0}".format(search_res.best_score_))
    
def test_model(model,v,df):
    y=get_y(df)
    X=v.fit_transform(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.fit(X_train, y_train)
    print("RMSE:",mean_squared_error(model.predict(X_test), y_test))
    
def gs_model(model,param,v,df,n_iter_search = 20, cv=5, jobs=4, verbose=10):
    y=get_y(df)
    X=v.fit_transform(df)
#     rmse_scorer = make_scorer(mean_squared_error)
#     search = RandomizedSearchCV(model, param_distributions=param, scoring=rmse_scorer,
#                                    n_iter=n_iter_search, cv=cv, n_jobs=4)
    search = RandomizedSearchCV(model, param_distributions=param, scoring="neg_mean_squared_error",
                                   n_iter=n_iter_search, cv=cv, n_jobs=jobs, verbose=verbose, pre_dispatch=4)
#     search = RandomizedSearchCV(model, param_distributions=param, scoring="neg_mean_squared_error",
#                                    n_iter=n_iter_search, cv=cv, verbose=verbose)
#     search = GridSearchCV(model, param, scoring="neg_mean_squared_error", cv=cv, n_jobs=4, verbose=10)
    start=time()
    search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % ((time() - start), n_iter_search))
    report(search)
    return search.best_estimator_

def cv_model(model,v,df,jobs=4):
    y=get_y(df)
    X=v.fit_transform(df)
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error',n_jobs=jobs)
    print(scores)
    print(scores.mean(),scores.std())

Для начала используем линейную регрессию, в качестве самой простой модели.

In [19]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

lin_reg = Ridge()

# test_model(lin_reg,vec,df)
param = {"alpha":[0.001,0.1,1,2,3,5,10,100,1000]}
model = gs_model(lin_reg, param,vec,df)
cv_model(model,vec,df)
del lin_reg,model

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done  21 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done  37 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    5.6s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV took 5.77 seconds for 20 candidates parameter settings.
Best param: {'alpha': 10}
Best score: -0.5176187512524733


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[-0.51557751 -0.51594518 -0.52198992 -0.51647614 -0.51810501]
-0.5176187512524733 0.0023501735330393736


Попробуем улучшить результаты другой моделью

In [20]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()
# test_model(knn,vec,df)
param = {"n_neighbors":[1,2,3,5,10,25,50,80,100],
        "leaf_size":[5,10,15,30,50,100],
        "metric":["minkowski","manhattan",]}
model = gs_model(knn, param,vec,df)
cv_model(model,vec,df)

del knn,model

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:  5.7min
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed: 16.2min
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed: 22.1min
[Parallel(n_jobs=4)]: Done  21 tasks      | elapsed: 33.6min
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed: 40.2min
[Parallel(n_jobs=4)]: Done  37 tasks      | elapsed: 53.8min
[Parallel(n_jobs=4)]: Done  46 tasks      | elapsed: 64.9min
[Parallel(n_jobs=4)]: Done  57 tasks      | elapsed: 80.2min
[Parallel(n_jobs=4)]: Done  68 tasks      | elapsed: 95.0min
[Parallel(n_jobs=4)]: Done  81 tasks      | elapsed: 112.6min
[Parallel(n_jobs=4)]: Done  94 tasks      | elapsed: 129.4min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 136.1min finished


RandomizedSearchCV took 8316.89 seconds for 20 candidates parameter settings.
Best param: {'n_neighbors': 50, 'metric': 'manhattan', 'leaf_size': 5}
Best score: -0.31841112006475225


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[-0.31848366 -0.31802334 -0.31954974 -0.31771065 -0.3182882 ]
-0.31841112006475225 0.0006257569087413023


DecisionTree мне нравятся за их интерпретируемость до определенной глубины. Вдруг неплохо предскажет и можно заложить явным алгоритмом?

In [22]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
# test_model(tree_reg,vec,df)
param = {"max_depth":[None,2,5,10,15],
        "min_samples_split":[2,5,10,20,50,100]}
model = gs_model(tree_reg, param,vec,df)
cv_model(model,vec,df)

del tree_reg,model

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    6.8s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done  21 tasks      | elapsed:   13.2s
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed:   16.1s
[Parallel(n_jobs=4)]: Done  37 tasks      | elapsed:   19.7s
[Parallel(n_jobs=4)]: Done  46 tasks      | elapsed:   24.2s
[Parallel(n_jobs=4)]: Done  57 tasks      | elapsed:   29.0s
[Parallel(n_jobs=4)]: Done  68 tasks      | elapsed:   34.0s
[Parallel(n_jobs=4)]: Done  81 tasks      | elapsed:   41.3s
[Parallel(n_jobs=4)]: Done  94 tasks      | elapsed:   48.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   49.8s finished


RandomizedSearchCV took 51.87 seconds for 20 candidates parameter settings.
Best param: {'min_samples_split': 50, 'max_depth': None}
Best score: -0.31561458180776736


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[-0.31544558 -0.31577905 -0.31595234 -0.31504462 -0.31585522]
-0.3156153622213402 0.00033240038146458665


Попробуем ансамблирование

In [25]:
from sklearn.ensemble import RandomForestRegressor

# rnd_reg = RandomForestRegressor(n_estimators=100, n_jobs=4)
rnd_reg = RandomForestRegressor(n_jobs=1)
# test_model(rnd_reg,vec,df)
param = {"n_estimators":[10,50,100,150,200],
        "max_depth":[2,3,5,10,15],
        "min_samples_split":[2,5,10,20,50,100],
        }
model = gs_model(rnd_reg, param,vec,df,jobs=4)
cv_model(model,vec,df,jobs=4)

del rnd_reg, model

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed: 12.8min
[Parallel(n_jobs=4)]: Done  21 tasks      | elapsed: 18.6min
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed: 24.9min
[Parallel(n_jobs=4)]: Done  37 tasks      | elapsed: 26.1min
[Parallel(n_jobs=4)]: Done  46 tasks      | elapsed: 26.6min
[Parallel(n_jobs=4)]: Done  57 tasks      | elapsed: 27.8min
[Parallel(n_jobs=4)]: Done  68 tasks      | elapsed: 31.3min
[Parallel(n_jobs=4)]: Done  81 tasks      | elapsed: 36.1min
[Parallel(n_jobs=4)]: Done  94 tasks      | elapsed: 39.0min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 40.0min finished


RandomizedSearchCV took 2665.89 seconds for 20 candidates parameter settings.
Best param: {'n_estimators': 200, 'min_samples_split': 20, 'max_depth': 15}
Best score: -0.3276899757561234


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[-0.32791668 -0.32827047 -0.32663361 -0.32713446 -0.32839712]
-0.32767046743447825 0.0006798578999119342


NameError: name 'rnd_clf' is not defined

In [29]:
import xgboost as xgb

# xgb_reg = xgb.XGBRegressor(nthread=4)
xgb_reg = xgb.XGBRegressor(nthread=4)
# test_model(xgb_reg,vec,df)
param = {"n_estimators":[50,100,150,200],
        "max_depth":[5,10,15],
        "learning_rate":[0.1,1,10],
         "reg_lambda":[0.1,1,10]
        }
model = gs_model(xgb_reg, param,vec,df,jobs=1)
cv_model(model,vec,df)

del xgb_reg,model

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1 .
[CV]  reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1, score=-0.3204156322601387, total= 1.0min
[CV] reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1 .


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV]  reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1, score=-0.3204889703940479, total=  59.0s
[CV] reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1 .


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.2min remaining:    0.0s


[CV]  reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1, score=-0.3207226777351061, total= 1.0min
[CV] reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1 .


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.2min remaining:    0.0s


[CV]  reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1, score=-0.3205611371732475, total=  57.4s
[CV] reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1 .


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.3min remaining:    0.0s


[CV]  reg_lambda=0.1, n_estimators=100, max_depth=15, learning_rate=1, score=-0.3208353944919064, total= 1.2min
[CV] reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1 ...


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.5min remaining:    0.0s


[CV]  reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1, score=-0.3098211428861046, total=  39.8s
[CV] reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1 ...


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.2min remaining:    0.0s


[CV]  reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1, score=-0.30911234016429845, total=  38.5s
[CV] reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1 ...


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  6.9min remaining:    0.0s


[CV]  reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1, score=-0.31003623504308303, total=  36.7s
[CV] reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1 ...


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  7.5min remaining:    0.0s


[CV]  reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1, score=-0.30956015870388753, total=  37.2s
[CV] reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1 ...


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  8.2min remaining:    0.0s


[CV]  reg_lambda=1, n_estimators=100, max_depth=10, learning_rate=1, score=-0.30979141991828335, total=  36.0s
[CV] reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1 ...
[CV]  reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1, score=-0.3102606615833585, total=  36.6s
[CV] reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1 ...
[CV]  reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1, score=-0.3101493617604224, total=  36.6s
[CV] reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1 ...
[CV]  reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1, score=-0.310579490757417, total=  36.6s
[CV] reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1 ...
[CV]  reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1, score=-0.30995350045345443, total=  36.8s
[CV] reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1 ...
[CV]  reg_lambda=10, n_estimators=200, max_depth=5, learning_rate=1, score=-0.3100678283246621

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

Вылетает по непонятным причинам. лучшая модель получается reg_lambda=10, n_estimators=150, max_depth=10, learning_rate=1, score=-0.30511102674687024

In [25]:
xgb_reg = xgb.XGBRegressor(reg_lambda=10, n_estimators=150, max_depth=10, learning_rate=1)
cv_model(xgb_reg,vec,df)
del xgb_reg

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[-0.30519446 -0.30484437 -0.30599174 -0.30506423 -0.30511103]
-0.30524116290789144 0.000392741351677596


## Итог

наиболее удачной является модель xgb хоть и незначительно (RMSE = -0.305, против -0.315 у деревьем, например, но для соревнований это важно).

In [30]:
xgb_reg = xgb.XGBRegressor(reg_lambda=10, n_estimators=150, max_depth=10, learning_rate=1, nthread=4)
y = get_y(df)
X = vec.fit_transform(df)
X_test = vec.fit_transform(df_test)
xgb_reg.fit(X,y)
y_pred = xgb_reg.predict(X_test)
y_pred = np.exp(y_pred)-y_pred[np.where(y_pred<0)]=0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([3.5513668, 3.1374931, 2.874524 , ..., 2.966386 , 2.8441763,
       2.8862147], dtype=float32)

In [46]:
df_y_test = pd.DataFrame(df_test['test_id'].copy())
df_y_test['price'] = pd.Series(y_pred, index=df_y_test.index)
df_y_test.to_csv("submission.csv", index=False)

Загруженно на кагл. Очков значительно меньше.

Выводы:
 * по каглу:
      * внимательно читать подсчет очков. В данном случае RMSLE, а не RMSE
 * По моделе:
      * можно попробовать разделить категории товаров и использовать более распространенные
      * в описаниях товаров выделить метки и использовать их для улучшения работы
      * попробовать предварительно кластеризовать данные и учитывать кластеры в качестве признаков