In [13]:
# перед началом работы не забудьте скачать файл train.json.zip с Kaggle и разархивировать его
import json
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# сразу загрузим датасет от Renthop
with open('../data/train.json', 'r') as raw_data:
    data = json.load(raw_data)
    df = pd.DataFrame(data)

In [14]:
import numpy as np
import pandas as pd
import json
from sklearn.base import TransformerMixin

EPSILON = 1e-5


class FeatureEngineer(TransformerMixin):

    def apply(self, df, k, condition):
        df[k] = df['features'].apply(condition)
        df[k] = df[k].astype(np.int8)

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None):
        df = X.copy()

        df.features = df.features.apply(lambda x: ' '.join([y.replace(' ', '_') for y in x]))
        df.features = df.features.apply(lambda x: x.lower())
        df.features = df.features.apply(lambda x: x.replace('-', '_'))

        for k, condition in (('dishwasher', lambda x: 'dishwasher' in x),
                             ('doorman', lambda x: 'doorman' in x or 'concierge' in x),
                             ('pets', lambda x: "pets" in x or "pet" in x or "dog" in x or "cats" in x and "no_pets" not in x),
                             ('air_conditioning', lambda x: 'air_conditioning' in x or 'central' in x),
                             ('parking', lambda x: 'parking' in x),
                             ('balcony', lambda x: 'balcony' in x or 'deck' in x or 'terrace' in x or 'patio' in x),
                             ('bike', lambda x: 'bike' in x),
                             ('storage', lambda x: 'storage' in x),
                             ('outdoor', lambda x: 'outdoor' in x or 'courtyard' in x or 'garden' in x),
                             ('roof', lambda x: 'roof' in x),
                             ('gym', lambda x: 'gym' in x or 'fitness' in x),
                             ('pool', lambda x: 'pool' in x),
                             ('backyard', lambda x: 'backyard' in x),
                             ('laundry', lambda x: 'laundry' in x),
                             ('hardwood_floors', lambda x: 'hardwood_floors' in x),
                             ('new_construction', lambda x: 'new_construction' in x),
                             ('dryer', lambda x: 'dryer' in x),
                             ('elevator', lambda x: 'elevator' in x),
                             ('garage', lambda x: 'garage' in x),
                             ('pre_war', lambda x: 'pre_war' in x or 'prewar' in x),
                             ('post_war', lambda x: 'post_war' in x or 'postwar' in x),
                             ('no_fee', lambda x: 'no_fee' in x),
                             ('low_fee', lambda x: 'reduced_fee' in x or 'low_fee' in x),
                             ('fire', lambda x: 'fireplace' in x),
                             ('private', lambda x: 'private' in x),
                             ('wheelchair', lambda x: 'wheelchair' in x),
                             ('internet', lambda x: 'wifi' in x or 'wi_fi' in x or 'internet' in x),
                             ('yoga', lambda x: 'yoga' in x),
                             ('furnished', lambda x: 'furnished' in x),
                             ('multi_level', lambda x: 'multi_level' in x),
                             ('exclusive', lambda x: 'exclusive' in x),
                             ('high_ceil', lambda x: 'high_ceil' in x),
                             ('green', lambda x: 'green_b' in x),
                             ('stainless', lambda x: 'stainless_' in x),
                             ('simplex', lambda x: 'simplex' in x),
                             ('public', lambda x: 'public' in x),
                             ):
            self.apply(df, k, condition)

        df['bathrooms'] = df['bathrooms'].apply(lambda x: x if x < 5 else 5)
        df['bedrooms'] = df['bedrooms'].apply(lambda x: x if x < 5 else 5)
        df["num_photos"] = df["photos"].apply(len)
        df["num_features"] = df["features"].apply(len)
        created = pd.to_datetime(df.pop("created"))
        df["listing_age"] = (pd.to_datetime('today') - created).apply(lambda x: x.days)
        df["room_dif"] = df["bedrooms"] - df["bathrooms"]
        df["room_sum"] = df["bedrooms"] + df["bathrooms"]
        df["price_per_room"] = df["price"] / df["room_sum"].apply(lambda x: max(x, .5))
        df["bedrooms_share"] = df["bedrooms"] / df["room_sum"].apply(lambda x: max(x, .5))
        df['price'] = df['price'].apply(lambda x: np.log(x + EPSILON))

        key_types = df.dtypes.to_dict()
        for k in key_types:
            if key_types[k].name not in ('int64', 'float64', 'int8'):
                df.pop(k)

        for k in ('latitude', 'longitude', 'listing_id'):
            df.pop(k)
        return df


def encode(x):
    if x == 'low':
        return 0
    elif x == 'medium':
        return 1
    elif x == 'high':
        return 2


def get_data():
    with open('../data/train.json', 'r') as raw_data:
        data = json.load(raw_data)

    df = pd.DataFrame(data)
    target = df.pop('interest_level').apply(encode)

    df = FeatureEngineer().fit_transform(df)
    return df, target

In [15]:
x_data, y_data = get_data()

In [16]:
x_data.head(5)

Unnamed: 0,bathrooms,bedrooms,price,dishwasher,doorman,pets,air_conditioning,parking,balcony,bike,...,stainless,simplex,public,num_photos,num_features,listing_age,room_dif,room_sum,price_per_room,bedrooms_share
4,1.0,1,7.783224,1,0,1,0,0,0,0,...,0,0,0,12,92,1507,0.0,2.0,1200.0,0.5
6,1.0,2,8.242756,1,1,0,0,0,0,0,...,0,0,0,6,70,1522,1.0,3.0,1266.666667,0.666667
9,1.0,2,8.159089,1,1,0,0,0,0,0,...,0,0,0,6,79,1508,1.0,3.0,1165.0,0.666667
10,1.5,3,8.006368,0,0,0,0,0,0,0,...,0,0,0,5,0,1499,1.5,4.5,666.666667,0.666667
15,1.0,0,7.935587,0,1,0,0,0,0,0,...,0,0,0,4,51,1495,-1.0,1.0,2795.0,0.0


In [17]:
x_data = x_data.values

In [18]:
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import SelectFromModel

cross_val_score(LogisticRegression(), x_data, y_data, scoring='neg_log_loss').mean()

-0.6922702822749353

In [21]:
from sklearn.preprocessing import StandardScaler

cross_val_score(LogisticRegression(), StandardScaler().fit_transform(x_data), y_data, scoring='neg_log_loss').mean()

-0.6725480668269141

In [22]:
from sklearn.preprocessing import MinMaxScaler

cross_val_score(LogisticRegression(), MinMaxScaler().fit_transform(x_data), y_data, scoring='neg_log_loss').mean()

-0.687707738631698

In [23]:
rooms = df["bedrooms"].apply(lambda x: max(x, .5))
# избегаем деления на ноль; .5 выбран более или менее произвольно
df["price_per_bedroom"] = df["price"] / rooms

In [24]:
from sklearn.pipeline import make_pipeline


x_data, y_data = get_data()
x_data = x_data.values

pipe1 = make_pipeline(StandardScaler(),
                      SelectFromModel(estimator=RandomForestClassifier()),
                      LogisticRegression())

pipe2 = make_pipeline(StandardScaler(),
                      LogisticRegression())

rf = RandomForestClassifier()

print('LR + selection: ', cross_val_score(pipe1, x_data, y_data, scoring='neg_log_loss').mean())
print('LR: ', cross_val_score(pipe2, x_data, y_data, scoring='neg_log_loss').mean())
print('RF: ', cross_val_score(rf, x_data, y_data, scoring='neg_log_loss').mean())


LR + selection:  -0.71360527429795
LR:  -0.6723768575473414
RF:  -1.9525704072211914


In [25]:
from sklearn.feature_selection import VarianceThreshold

from sklearn.datasets import make_classification

x_data_generated, y_data_generated = make_classification()

In [26]:
x_data_generated.shape

(100, 20)

In [27]:
VarianceThreshold(.7).fit_transform(x_data_generated).shape

(100, 20)

In [28]:
VarianceThreshold(.8).fit_transform(x_data_generated).shape

(100, 17)

In [29]:
VarianceThreshold(.9).fit_transform(x_data_generated).shape

(100, 13)

In [30]:
from sklearn.feature_selection import SelectKBest, f_classif
x_data_kbest = SelectKBest(f_classif, k=5).fit_transform(x_data_generated, y_data_generated)
x_data_varth = VarianceThreshold(.9).fit_transform(x_data_generated)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [32]:
cross_val_score(LogisticRegression(), x_data_generated, y_data_generated, scoring='neg_log_loss').mean()

-0.35442239256480884

In [33]:
cross_val_score(LogisticRegression(), x_data_kbest, y_data_generated, scoring='neg_log_loss').mean()

-0.238571437075628

In [34]:
cross_val_score(LogisticRegression(), x_data_varth, y_data_generated, scoring='neg_log_loss').mean()

-0.2928767357872344

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [35]:
#selector = SFS(LogisticRegression(), scoring='neg_log_loss', verbose=2, k_features=3, forward=False, n_jobs=-1)

In [None]:
%%time
#selector.fit(StandardScaler().fit_transform(x_data), y_data)
#ооооочень долго

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  46 out of  46 | elapsed:  6.3min finished

[2020-08-01 09:32:40] Features: 45/3 -- score: -0.6671071695131933[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  6.0min finished

[2020-08-01 09:38:40] Features: 44/3 -- score: -0.6669186131812352[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  44 out of  44 | elapsed:  5.8min finished

[2020-08-01 09:44:29] Features: 43/3 -- score: -0.6666909829567097[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  43 out of  43 | elapsed:  5.6min finished

[2020-08-01 09:50:07] Features: 42/3 -- score: -0.6665929241958064[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:  5.4min finished

[2020-08-01 09:55:34] Features: 41/3 -- score: -0.6665474652803584[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  41 out of  41 | elapsed:  5.3min finished

[2020-08-01 10:00:50] Features: 40/3 -- score: -0.666452774472609[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  5.1min finished

[2020-08-01 10:05:58] Features: 39/3 -- score: -0.6665123116703842[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 out of  39 | elapsed:  4.9min finished

[2020-08-01 10:10:54] Features: 38/3 -- score: -0.6664838616229884[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  38 | elapsed:  4.7min finished

[2020-08-01 10:15:38] Features: 37/3 -- score: -0.6664108299623923[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 out of  37 | elapsed:  4.6min finished

[2020-08-01 10:20:12] Features: 36/3 -- score: -0.6663448857380193[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  4.4min finished

[2020-08-01 10:24:38] Features: 35/3 -- score: -0.6663469245658475[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  4.3min finished

[2020-08-01 10:28:59] Features: 34/3 -- score: -0.6664097738467556[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:  4.1min finished

[2020-08-01 10:33:06] Features: 33/3 -- score: -0.6664377807453471[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:  4.0min finished

[2020-08-01 10:37:05] Features: 32/3 -- score: -0.6663957474119856[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  3.9min finished

[2020-08-01 10:40:58] Features: 31/3 -- score: -0.6663968068388562[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:  3.6min finished

[2020-08-01 10:44:34] Features: 30/3 -- score: -0.6664970280385335[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.4min finished

[2020-08-01 10:47:58] Features: 29/3 -- score: -0.6665157982241675[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:  3.3min finished

[2020-08-01 10:51:14] Features: 28/3 -- score: -0.666533753366545[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 out of  28 | elapsed:  3.1min finished

[2020-08-01 10:54:21] Features: 27/3 -- score: -0.6664039018911334[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  2.9min finished

[2020-08-01 10:57:14] Features: 26/3 -- score: -0.665940983239518[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  26 | elapsed:  2.7min finished

[2020-08-01 10:59:59] Features: 25/3 -- score: -0.6659307403183865[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  2.6min finished

[2020-08-01 11:02:36] Features: 24/3 -- score: -0.6659090348139476[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished

[2020-08-01 11:05:05] Features: 23/3 -- score: -0.6659980864410959[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  23 | elapsed:  2.3min finished

[2020-08-01 11:07:25] Features: 22/3 -- score: -0.6660761346564674[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  22 out of  22 | elapsed:  2.2min finished

[2020-08-01 11:09:36] Features: 21/3 -- score: -0.6661555051305353[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.1min finished

[2020-08-01 11:11:41] Features: 20/3 -- score: -0.6662619375189207[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.0min finished

[2020-08-01 11:13:39] Features: 19/3 -- score: -0.6664648371296751[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:  1.9min finished

[2020-08-01 11:15:30] Features: 18/3 -- score: -0.6666365680644947[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  1.7min finished

[2020-08-01 11:17:15] Features: 17/3 -- score: -0.6667954689370459[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:  1.5min finished

[2020-08-01 11:18:46] Features: 16/3 -- score: -0.6670044422548986[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  1.3min finished

[2020-08-01 11:20:05] Features: 15/3 -- score: -0.6672425946905274[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished

[2020-08-01 11:21:16] Features: 14/3 -- score: -0.6675018487665072[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:  1.1min finished

[2020-08-01 11:22:21] Features: 13/3 -- score: -0.6680390324817685[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   58.5s finished

[2020-08-01 11:23:20] Features: 12/3 -- score: -0.6686668350603517[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   49.6s finished

[2020-08-01 11:24:09] Features: 11/3 -- score: -0.6696584336555628[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:   44.3s finished

[2020-08-01 11:24:54] Features: 10/3 -- score: -0.6718507838894807[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   38.3s finished

[2020-08-01 11:25:32] Features: 9/3 -- score: -0.674430235379908[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   32.3s finished

[2020-08-01 11:26:04] Features: 8/3 -- score: -0.6778778386719821[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   26.5s finished

[2020-08-01 11:26:31] Features: 7/3 -- score: -0.6815594187948015[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:   21.0s finished

[2020-08-01 11:26:52] Features: 6/3 -- score: -0.6891743523846027[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   15.8s finished

[2020-08-01 11:27:08] Features: 5/3 -- score: -0.6978513510617517[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.2s finished

[2020-08-01 11:27:18] Features: 4/3 -- score: -0.6997647604172826[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

CPU times: user 55.1 s, sys: 12.1 s, total: 1min 7s
Wall time: 2h 1min 15s

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    6.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    6.4s finished

[2020-08-01 11:27:25] Features: 3/3 -- score: -0.707677861049149

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=LogisticRegression(C=1.0, class_weight=None,
                                                       dual=False,
                                                       fit_intercept=True,
                                                       intercept_scaling=1,
                                                       l1_ratio=None,
                                                       max_iter=100,
                                                       multi_class='auto',
                                                       n_jobs=None,
                                                       penalty='l2',
                                                       random_state=None,
                                                       solver='lbfgs',
                                                       tol=0.0001, verbose=0,
                                                       warm_start=False),
                          floating=False, forward=False, k_features=3,
                          n_jobs=-1, pre_dispatch='2*n_jobs',
                          scoring='neg_log_loss', verbose=2)