# Предсказание популярности объявлений о продаже домов в Нью-Йорке

In [165]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from collections import Counter

In [117]:
df = pd.read_csv("train.csv", dtype={"photos": object})

In [118]:
df.head()

Unnamed: 0,Id,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,TARGET
0,57094,1.0,3,0,2016-05-19 18:06:27,A FABULOUS 3BR IN MIDTOWN WEST! PERFECT APAR...,HOW AMAZING IS THIS MIDTOWN WEST STEAL!! NO FE...,"['Laundry In Unit', 'No Fee', 'Elevator']",40.7647,7039994,-73.9918,4bdc3d8c1aaa90d997ce2cb77680679b,['https://photos.renthop.com/2/7039994_07be01b...,4495,W 50 & AVE 10,medium
1,33389,1.0,1,9225efdfb57a50bf3ec17ebab082f94a,2016-06-16 02:01:49,Renovated Kitchen and Bathroom!,55 River Drive South,"['Dogs Allowed', 'Cats Allowed', 'No Fee']",40.7275,7166774,-74.0322,e5808a5e6cc13988fe596704428d38d5,['https://photos.renthop.com/2/7166774_03cf63a...,2570,55 River Drive South,medium
2,60458,1.0,0,320de7d3cc88e50a7fbbcfde1e825d21,2016-05-04 02:42:50,RARE AND BEST DEAL ON THE MARKET!!!! PERFECT S...,W 77 Street,"['Elevator', 'Hardwood Floors']",40.7798,6962716,-73.9751,d69d4e111612dd12ef864031c1148543,['https://photos.renthop.com/2/6962716_ec7f56f...,1795,22 W 77 Street,low
3,53048,1.0,2,ce6d18bf3238e668b2bf23f4110b7b67,2016-05-12 05:57:56,Newly renovated flex 2 apartment offers the ne...,John Street,"['Swimming Pool', 'Doorman', 'Elevator', 'Fitn...",40.7081,7002458,-74.0065,e6472c7237327dd3903b3d6f6a94515a,['https://photos.renthop.com/2/7002458_93f4010...,3400,100 John Street,low
4,592,1.0,3,fee4d465932160318364d9d48d272879,2016-06-16 06:06:15,LOW FEE apartments do not come around like thi...,West 16th Street,"['Laundry in Building', 'Laundry in Unit', 'Di...",40.7416,7170465,-74.0025,6fba9b3a8327c607b8b043716efee684,['https://photos.renthop.com/2/7170465_9c3f173...,5695,321 West 16th Street,low


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34546 entries, 0 to 34545
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               34546 non-null  int64  
 1   bathrooms        34546 non-null  float64
 2   bedrooms         34546 non-null  int64  
 3   building_id      34546 non-null  object 
 4   created          34546 non-null  object 
 5   description      33509 non-null  object 
 6   display_address  34458 non-null  object 
 7   features         34546 non-null  object 
 8   latitude         34546 non-null  float64
 9   listing_id       34546 non-null  int64  
 10  longitude        34546 non-null  float64
 11  manager_id       34546 non-null  object 
 12  photos           34546 non-null  object 
 13  price            34546 non-null  int64  
 14  street_address   34542 non-null  object 
 15  TARGET           34546 non-null  object 
dtypes: float64(3), int64(4), object(9)
memory usage: 4.2+ MB


In [120]:
df.shape

(34546, 16)

In [121]:
df.isnull().sum()

Id                    0
bathrooms             0
bedrooms              0
building_id           0
created               0
description        1037
display_address      88
features              0
latitude              0
listing_id            0
longitude             0
manager_id            0
photos                0
price                 0
street_address        4
TARGET                0
dtype: int64

Для начала создадим набор базовых признаков: число ванн, число спален, широта, долгота, цена, а также добавим в него число фотографий в объявлении.

In [122]:
df.photos = df.photos.apply(lambda x: x[1: -2])
df.photos = df.photos.str.split(", ")

In [None]:
df_base = df[["bathrooms", "bedrooms", "latitude", "longitude", "price"]]
df_base.loc[:, "bathrooms"] = df_base.loc[:, "bathrooms"].astype("int")
df_base["photos"] = df.photos.str.len()

In [124]:
df_base

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,photos
0,1,3,40.7647,-73.9918,4495,3
1,1,1,40.7275,-74.0322,2570,13
2,1,0,40.7798,-73.9751,1795,6
3,1,2,40.7081,-74.0065,3400,4
4,1,3,40.7416,-74.0025,5695,3
...,...,...,...,...,...,...
34541,1,1,40.7653,-73.9248,1950,4
34542,1,0,40.7239,-73.9901,2600,6
34543,2,2,40.7437,-73.9585,5110,15
34544,1,2,40.7673,-73.9866,3400,3


Посмотрим на распределение целевой переменной 

In [125]:
X_base = df_base
y = df.TARGET

In [126]:
y.value_counts(normalize=True)

low       0.694697
medium    0.227523
high      0.077780
Name: TARGET, dtype: float64

Обучим логистическую регрессию на базовом наборе признаков.

In [127]:
logreg = LogisticRegression(max_iter=500)
np.mean(cross_val_score(logreg, X_base, y, cv=5, scoring="balanced_accuracy"))

0.34387553700879925

Качество так себе.

Сбалансируем веса для классов.

In [128]:
logreg = LogisticRegression(class_weight="balanced", max_iter=500)
np.mean(cross_val_score(logreg, X_base, y, cv=5, scoring="balanced_accuracy"))

0.4935001496017944

Стало лучше

Добавим день недели и месяц создания объявления.

In [129]:
df.created = pd.to_datetime(df.created)
X = pd.DataFrame(df_base, copy=True)
X["day"] = df.created.dt.day_of_week

In [131]:
logreg = LogisticRegression(class_weight="balanced", max_iter=700)
np.mean(cross_val_score(logreg, X, y, cv=5, scoring="balanced_accuracy"))

0.49230673564115535

In [132]:
X = X.drop(columns="day")
X["month"] = df.created.dt.month

In [133]:
logreg = LogisticRegression(class_weight="balanced", max_iter=700)
np.mean(cross_val_score(logreg, X, y, cv=5, scoring="balanced_accuracy"))

0.4929380114838165

В обоих случаях стало хуже, выкинем оба признака.

In [None]:
X = X.drop(columns="month")

In [135]:
X.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,photos
0,1,3,40.7647,-73.9918,4495,3
1,1,1,40.7275,-74.0322,2570,13
2,1,0,40.7798,-73.9751,1795,6
3,1,2,40.7081,-74.0065,3400,4
4,1,3,40.7416,-74.0025,5695,3


Добавим расстояние до Центрального Парка

Кооординаты центрального парка в Нью-Йорке: latitude - 40.785091, longitude - -73.968285

In [136]:
CentPark_lattitude = 40.785091
CentPark_longitude = -73.968285
X["CentPark_closeness"] = ((CentPark_lattitude - X.latitude)**2 + (CentPark_longitude - X.longitude)**2)**0.5

In [137]:
logreg = LogisticRegression(class_weight="balanced", max_iter=700)
np.mean(cross_val_score(logreg, X, y, cv=5, scoring="balanced_accuracy"))

0.4943577828505159

Качество модели немного подросло, добавим признак общего числа комнат в квартире

In [138]:
X["rooms"] = X.bathrooms + X.bedrooms

In [139]:
logreg = LogisticRegression(class_weight="balanced", max_iter=700)
np.mean(cross_val_score(logreg, X, y, cv=5, scoring="balanced_accuracy"))

0.49796530260226746

Качество выросло

Теперь добавим 50 самых распространенных текстовых тегов в качестве фичей

In [140]:
df["features_list"] = df.features.apply(lambda x: x[1: -2])
df.features_list = df.features_list.str.split(", ")
features = []
for i in df.features_list:
    features += i
feat_count = Counter(features)
feat_count = sorted(feat_count.items(), key=lambda x: x[1])
chosen_features = [i[0] for i in feat_count[-51:] if i[0] != ""]
for feature in chosen_features:
    X[feature] = 0
    X.loc[df.features.str.contains(feature), feature] = 1

In [141]:
logreg = LogisticRegression(class_weight="balanced", max_iter=3000)
np.mean(cross_val_score(logreg, X, y, cv=5, scoring="balanced_accuracy"))

0.5324450724736923

Проведем стандартизацию значений

In [142]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

Теперь попробуем отобрать лучшие признаки, используя дисперсионный анализ

In [143]:
X.shape

(34546, 58)

In [144]:
for K in range(5, 59, 5):
    X_selected = SelectKBest(f_classif, k=K).fit_transform(X_scaled, y)
    logreg = LogisticRegression(class_weight="balanced", max_iter=3000)
    print(K, np.mean(cross_val_score(logreg, X_selected, y, cv=3, scoring="balanced_accuracy")))

5 0.4049702690377666
10 0.4295923350666167
15 0.42416377829832036
20 0.4283947301000622
25 0.4349549856240304
30 0.4442953199081068
35 0.45212701058984894
40 0.45053787323749583
45 0.44971995127759223
50 0.5289577826580546
55 0.5330511206201027


Переберем более детально интервал от 55 до 58 призаков.

In [145]:
for K in range(55, 59, 1):
    X_selected = SelectKBest(f_classif, k=K).fit_transform(X_scaled, y)
    logreg = LogisticRegression(class_weight="balanced", max_iter=3000)
    print(K, np.mean(cross_val_score(logreg, X_selected, y, cv=3, scoring="balanced_accuracy")))

55 0.5330511206201027
56 0.5332933764289706
57 0.5347773859491859
58 0.539052183526874


In [151]:
logreg = LogisticRegression(class_weight="balanced", max_iter=3000)
np.mean(cross_val_score(logreg, X_scaled, y, cv=3, scoring="balanced_accuracy"))

0.539052183526874

Оптимальным вариантом оказалось не перебирать признаки, а брать все

Переберем коэффициенты регуляризации

In [152]:
regulars = [0.1, 0.2, 0.4, 0.5, 0.7, 1, 2, 4, 5, 7, 10, 11, 13, 15, 20]
for regular in regulars:
    logreg = LogisticRegression(class_weight="balanced", C=regular, max_iter=3000)
    print(regular, np.mean(cross_val_score(logreg, X_scaled, y, cv=3, scoring="balanced_accuracy")))

0.1 0.5056662921291181
0.2 0.5182832514712911
0.4 0.5281160141788831
0.5 0.5312600585850515
0.7 0.535005757400434
1 0.539052183526874
2 0.5417504887174229
4 0.5437694237315908
5 0.5439352455727596
7 0.544833227056686
10 0.5448168722135378
11 0.5447923060170438
13 0.5447360100434482
15 0.5442766835379859
20 0.544039919630678


In [153]:
regulars = np.arange(7, 11.5, 0.5)
for regular in regulars:
    logreg = LogisticRegression(class_weight="balanced", C=regular, max_iter=3000)
    print(regular, np.mean(cross_val_score(logreg, X_scaled, y, cv=3, scoring="balanced_accuracy")))

7.0 0.544833227056686
7.5 0.5451116408450462
8.0 0.5446219320511928
8.5 0.5446668155081277
9.0 0.5443359261344548
9.5 0.544737609177611
10.0 0.5448168722135378
10.5 0.5446797123335245
11.0 0.5447923060170438


Лучше всего модель показывает себя с коэффициентом регуляризации равным 7.5.

Сделаем предсказание на тесте

In [161]:
 test_df = pd.read_csv("test.csv")

In [162]:
test_df.head()

Unnamed: 0,Id,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,low,2.0,3,87c4e08083ac83618dd9fdbf849331fe,2016-06-21 03:21:59,Rent listed is net effective after one month f...,E 77 St.,"['Pre-War', 'Laundry in Unit', 'Dishwasher', '...",40.7716,7189305,-73.9544,e6472c7237327dd3903b3d6f6a94515a,['https://photos.renthop.com/2/7189305_f47ed5c...,4675,339 E 77 St.
1,low,1.0,2,850c3d4412aebb3335273ed95f3e4bd4,2016-06-11 03:47:06,A FABULOUS 2BR IN THE UPPER EAST SIDE! PERFECT...,E 112 Street,[],40.7944,7140670,-73.9401,3e9f0fa34d67e5d61aae56776ecb8cee,['https://photos.renthop.com/2/7140670_6c5824f...,2100,252 E 112 Street
2,low,1.0,2,84ddf917a091828ab9baedc0c926470e,2016-04-17 02:45:19,This is a great true 2 bedroom in a prime Midt...,Ninth Avenue,"['Dogs Allowed', 'Cats Allowed']",40.765,6886239,-73.9882,10b1ae0a38d50b7ba0cee612d14af9eb,['https://photos.renthop.com/2/6886239_533bbee...,2800,783 Ninth Avenue
3,high,2.0,3,315a2f868a195b076e5be73411110da2,2016-05-07 11:44:35,Brand new custom renovation in beautiful three...,East 12th street,"['Laundry In Unit', 'No Fee', 'Washer/Dryer in...",40.7288,6981146,-73.9808,e9920062e07ee893c10e38d0259665b0,['https://photos.renthop.com/2/6981146_f4a7400...,5500,186 Avenue B
4,medium,1.0,3,9b525bacc8d294728c35f27c04e22f46,2016-06-24 07:33:26,"Welcome Home! Located on a Prime UWS Block, Ba...",West 86th&Columbus,"['Private Outdoor Space', 'Elevator', 'Laundry...",40.7864,7210864,-73.9713,6d389fbe372d4d30b17733caa9370f95,['https://photos.renthop.com/2/7210864_e8528f4...,3995,47 West 86th street


In [None]:
test_df.photos = test_df.photos.apply(lambda x: x[1: -2])
test_df.photos = test_df.photos.str.split(", ")
X_test = test_df[["bathrooms", "bedrooms", "latitude", "longitude", "price"]]
X_test.loc[:, "bathrooms"] = X_test.loc[:, "bathrooms"].astype("int")
X_test["photos"] = test_df.photos.str.len()
X_test["CentPark_closeness"] = ((CentPark_lattitude - X_test.latitude)**2 + (CentPark_longitude - X_test.longitude)**2)**0.5
X_test["rooms"] = X_test.bathrooms + X_test.bedrooms
for feature in chosen_features:
    X_test[feature] = 0
    X_test.loc[test_df.features.str.contains(feature), feature] = 1
X_test_scaled = scaler.transform(X_test)

In [164]:
logreg = LogisticRegression(class_weight="balanced", C=7.5, max_iter=3000).fit(X_scaled, y)
prediction = logreg.predict(X_test_scaled)

In [170]:
ids = pd.read_csv("sample_submission.csv")
prediction_df = pd.DataFrame({"Id": ids.Id, "TARGET": prediction})
prediction_df.to_csv("prediction.csv", index=False)

Результаты Kaggle: Score: 0.54034, Public score: 0.55212

Качество близко к оценке по кросс-валидации