In [422]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import Pool, cv, CatBoostRegressor, CatBoostClassifier
from modAL.models import ActiveLearner
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score, mean_squared_error, mean_absolute_percentage_error, f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper, DeltaYStopper
from skopt.space import Integer, Real

%matplotlib inline

# Notebook-wide display defaults: show up to 50 DataFrame columns and draw
# 16x8 figures with a 12pt font.
pd.options.display.max_columns = 50
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['font.size'] = 12

In [443]:
# Load the preprocessed train and test CSVs.
TRAIN_CSV = './preprocessed_data/train_data.csv'
TEST_CSV = './preprocessed_data/test_data.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [444]:
# Keep only rows that have both an address and a nearest-city population.
has_required_fields = (
    train_data['address'].notna()
    & train_data['osm_city_nearest_population'].notna()
)
train_data = train_data[has_required_fields]

# Classifier inputs: every column except `id` and any column whose name
# contains 'price_type' (the label).
cols = [col for col in train_data.columns if 'price_type' not in col and col != 'id']

# Balance the two price types by down-sampling type 0 to the size of type 1.
# random_state pins the sample so the balanced set, and every result derived
# from it, is reproducible after a kernel restart.
train_data_1 = train_data[train_data['price_type'] == 1]
train_data_0 = train_data[train_data['price_type'] == 0].sample(
    train_data_1.shape[0], random_state=42
)
train_data_strip = pd.concat([train_data_0, train_data_1])

In [445]:
# Rows of train_data that were not selected for the balanced subset; the
# price-type classifier is applied to these later.
data_to_predict_type = train_data.drop(index=train_data_strip.index)

In [446]:
# Stratified 90/10 split of the balanced subset.
strip_features = train_data_strip[cols]
strip_labels = train_data_strip['price_type']
x_train, x_valid, y_train, y_valid = train_test_split(
    strip_features,
    strip_labels,
    test_size=0.1,
    random_state=42,
    stratify=strip_labels,
)

# CatBoost text pipeline: split on spaces, build a letter-level bigram
# dictionary, then derive BoW, NaiveBayes and BM25 features from it.
text_processing_config = {
    "tokenizers": [{
        "tokenizer_id": "Space",
        "separator_type": "ByDelimiter",
        "delimiter": " ",
    }],
    "dictionaries": [{
        "dictionary_id": "BiGram",
        "token_level_type": "Letter",
        "max_dictionary_size": "50000",
        "occurrence_lower_bound": "1",
        "gram_order": "2",
    }],
    "feature_processing": {
        "default": [
            {
                "dictionaries_names": ["BiGram"],
                "feature_calcers": [calcer],
                "tokenizers_names": ["Space"],
            }
            for calcer in ("BoW", "NaiveBayes", "BM25")
        ],
    },
}

# Price-type classifier over categorical, text and remaining columns.
type_classifier = CatBoostClassifier(
    cat_features=['osm_city_nearest_name', 'day', 'realty_type', 'region'],
    text_features=['text_floor', 'address'],
    loss_function='Logloss',
    eval_metric='AUC',
    task_type='CPU',
    text_processing=text_processing_config,
)

# The learner is constructed with the training split as its initial data.
learner = ActiveLearner(
    estimator=type_classifier,
    X_training=x_train,
    y_training=y_train,
)

Learning rate set to 0.024401
0:	total: 125ms	remaining: 2m 4s
1:	total: 293ms	remaining: 2m 26s
2:	total: 537ms	remaining: 2m 58s
3:	total: 710ms	remaining: 2m 56s
4:	total: 858ms	remaining: 2m 50s
5:	total: 985ms	remaining: 2m 43s
6:	total: 1.11s	remaining: 2m 38s
7:	total: 1.25s	remaining: 2m 34s
8:	total: 1.37s	remaining: 2m 30s
9:	total: 1.55s	remaining: 2m 33s
10:	total: 1.67s	remaining: 2m 30s
11:	total: 1.81s	remaining: 2m 29s
12:	total: 2.01s	remaining: 2m 32s
13:	total: 2.15s	remaining: 2m 31s
14:	total: 2.27s	remaining: 2m 29s
15:	total: 2.39s	remaining: 2m 27s
16:	total: 2.54s	remaining: 2m 26s
17:	total: 2.66s	remaining: 2m 24s
18:	total: 2.79s	remaining: 2m 24s
19:	total: 2.92s	remaining: 2m 22s
20:	total: 3.08s	remaining: 2m 23s
21:	total: 3.24s	remaining: 2m 24s
22:	total: 3.38s	remaining: 2m 23s
23:	total: 3.5s	remaining: 2m 22s
24:	total: 3.63s	remaining: 2m 21s
25:	total: 3.75s	remaining: 2m 20s
26:	total: 3.9s	remaining: 2m 20s
27:	total: 4.03s	remaining: 2m 19s
28:

233:	total: 31.9s	remaining: 1m 44s
234:	total: 32.1s	remaining: 1m 44s
235:	total: 32.2s	remaining: 1m 44s
236:	total: 32.3s	remaining: 1m 44s
237:	total: 32.4s	remaining: 1m 43s
238:	total: 32.5s	remaining: 1m 43s
239:	total: 32.7s	remaining: 1m 43s
240:	total: 32.9s	remaining: 1m 43s
241:	total: 33s	remaining: 1m 43s
242:	total: 33.1s	remaining: 1m 43s
243:	total: 33.3s	remaining: 1m 43s
244:	total: 33.4s	remaining: 1m 42s
245:	total: 33.5s	remaining: 1m 42s
246:	total: 33.6s	remaining: 1m 42s
247:	total: 33.8s	remaining: 1m 42s
248:	total: 33.9s	remaining: 1m 42s
249:	total: 34s	remaining: 1m 42s
250:	total: 34.2s	remaining: 1m 41s
251:	total: 34.3s	remaining: 1m 41s
252:	total: 34.4s	remaining: 1m 41s
253:	total: 34.6s	remaining: 1m 41s
254:	total: 34.7s	remaining: 1m 41s
255:	total: 34.8s	remaining: 1m 41s
256:	total: 34.9s	remaining: 1m 40s
257:	total: 35s	remaining: 1m 40s
258:	total: 35.1s	remaining: 1m 40s
259:	total: 35.3s	remaining: 1m 40s
260:	total: 35.4s	remaining: 1m 40

463:	total: 1m 1s	remaining: 1m 11s
464:	total: 1m 2s	remaining: 1m 11s
465:	total: 1m 2s	remaining: 1m 11s
466:	total: 1m 2s	remaining: 1m 11s
467:	total: 1m 2s	remaining: 1m 10s
468:	total: 1m 2s	remaining: 1m 10s
469:	total: 1m 2s	remaining: 1m 10s
470:	total: 1m 2s	remaining: 1m 10s
471:	total: 1m 2s	remaining: 1m 10s
472:	total: 1m 3s	remaining: 1m 10s
473:	total: 1m 3s	remaining: 1m 10s
474:	total: 1m 3s	remaining: 1m 10s
475:	total: 1m 3s	remaining: 1m 9s
476:	total: 1m 3s	remaining: 1m 9s
477:	total: 1m 3s	remaining: 1m 9s
478:	total: 1m 3s	remaining: 1m 9s
479:	total: 1m 3s	remaining: 1m 9s
480:	total: 1m 4s	remaining: 1m 9s
481:	total: 1m 4s	remaining: 1m 9s
482:	total: 1m 4s	remaining: 1m 8s
483:	total: 1m 4s	remaining: 1m 8s
484:	total: 1m 4s	remaining: 1m 8s
485:	total: 1m 4s	remaining: 1m 8s
486:	total: 1m 4s	remaining: 1m 8s
487:	total: 1m 4s	remaining: 1m 8s
488:	total: 1m 5s	remaining: 1m 8s
489:	total: 1m 5s	remaining: 1m 7s
490:	total: 1m 5s	remaining: 1m 7s
491:	tot

695:	total: 1m 31s	remaining: 40.1s
696:	total: 1m 31s	remaining: 40s
697:	total: 1m 32s	remaining: 39.8s
698:	total: 1m 32s	remaining: 39.7s
699:	total: 1m 32s	remaining: 39.6s
700:	total: 1m 32s	remaining: 39.5s
701:	total: 1m 32s	remaining: 39.3s
702:	total: 1m 32s	remaining: 39.2s
703:	total: 1m 32s	remaining: 39.1s
704:	total: 1m 33s	remaining: 38.9s
705:	total: 1m 33s	remaining: 38.8s
706:	total: 1m 33s	remaining: 38.7s
707:	total: 1m 33s	remaining: 38.5s
708:	total: 1m 33s	remaining: 38.4s
709:	total: 1m 33s	remaining: 38.3s
710:	total: 1m 33s	remaining: 38.1s
711:	total: 1m 33s	remaining: 38s
712:	total: 1m 34s	remaining: 37.9s
713:	total: 1m 34s	remaining: 37.7s
714:	total: 1m 34s	remaining: 37.6s
715:	total: 1m 34s	remaining: 37.5s
716:	total: 1m 34s	remaining: 37.3s
717:	total: 1m 34s	remaining: 37.2s
718:	total: 1m 34s	remaining: 37.1s
719:	total: 1m 35s	remaining: 36.9s
720:	total: 1m 35s	remaining: 36.8s
721:	total: 1m 35s	remaining: 36.7s
722:	total: 1m 35s	remaining: 36

926:	total: 2m 1s	remaining: 9.6s
927:	total: 2m 2s	remaining: 9.47s
928:	total: 2m 2s	remaining: 9.34s
929:	total: 2m 2s	remaining: 9.2s
930:	total: 2m 2s	remaining: 9.07s
931:	total: 2m 2s	remaining: 8.94s
932:	total: 2m 2s	remaining: 8.81s
933:	total: 2m 2s	remaining: 8.67s
934:	total: 2m 2s	remaining: 8.54s
935:	total: 2m 3s	remaining: 8.41s
936:	total: 2m 3s	remaining: 8.28s
937:	total: 2m 3s	remaining: 8.15s
938:	total: 2m 3s	remaining: 8.02s
939:	total: 2m 3s	remaining: 7.88s
940:	total: 2m 3s	remaining: 7.75s
941:	total: 2m 3s	remaining: 7.62s
942:	total: 2m 3s	remaining: 7.49s
943:	total: 2m 4s	remaining: 7.36s
944:	total: 2m 4s	remaining: 7.23s
945:	total: 2m 4s	remaining: 7.1s
946:	total: 2m 4s	remaining: 6.97s
947:	total: 2m 4s	remaining: 6.83s
948:	total: 2m 4s	remaining: 6.71s
949:	total: 2m 4s	remaining: 6.57s
950:	total: 2m 5s	remaining: 6.44s
951:	total: 2m 5s	remaining: 6.31s
952:	total: 2m 5s	remaining: 6.18s
953:	total: 2m 5s	remaining: 6.05s
954:	total: 2m 5s	remai

In [447]:
# Predict labels for the validation split and report the F1 score.
y_pred = learner.predict(x_valid)
f1 = f1_score(y_true=y_valid, y_pred=y_pred)

print(f'F1 = {f1:.2f}')

F1 = 0.94


In [448]:
# Per-class precision / recall / F1 on the validation split.
validation_report = classification_report(y_valid, y_pred)
print(validation_report)

              precision    recall  f1-score   support

           0       0.97      0.90      0.93       419
           1       0.91      0.97      0.94       418

    accuracy                           0.94       837
   macro avg       0.94      0.94      0.94       837
weighted avg       0.94      0.94      0.94       837



In [449]:
# Sweep decision thresholds for class 1 and record validation accuracy and F1
# for each one as (threshold, score) pairs.
tresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
accuracies = []
f1_scores = []

# The predicted probabilities do not depend on the threshold, so the
# validation split is scored once instead of once per iteration.
y_probs = learner.predict_proba(x_valid)

for treshold in tresholds:
    y_preds = np.where(y_probs[:, 1] > treshold, 1, 0)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracies.append((treshold, accuracy_score(y_valid, y_preds)))
    f1_scores.append((treshold, f1_score(y_valid, y_preds)))

In [450]:
# Display the (threshold, accuracy) pairs collected for the validation split.
accuracies

[(0.1, 0.899641577060932),
 (0.2, 0.9199522102747909),
 (0.3, 0.9307048984468339),
 (0.4, 0.9366786140979689),
 (0.5, 0.9366786140979689),
 (0.6, 0.9390681003584229),
 (0.7, 0.9223416965352449),
 (0.8, 0.899641577060932),
 (0.9, 0.8136200716845878)]

In [451]:
# Score the held-back rows and mark as type 1 only those whose class-1
# probability exceeds 0.85.
predict_type_probs = learner.predict_proba(data_to_predict_type[cols])
predict_type_preds = (predict_type_probs[:, 1] > 0.85).astype(int)

In [452]:
# Positional indices of the rows predicted as type 1 and type 0.
ind_predicted_1_type = np.flatnonzero(predict_type_preds == 1)
ind_predicted_0_type = np.flatnonzero(predict_type_preds == 0)

In [453]:
# Rows the classifier assigned to price type 1. `.copy()` makes this an
# independent frame, so the label assignment below does not write into a
# slice of data_to_predict_type (which raised SettingWithCopyWarning).
predicted_data_1 = data_to_predict_type.iloc[ind_predicted_1_type].copy()
predicted_data_1['price_type'] = 1

# `predicted_data_0` was referenced below but never defined (its definition
# had been commented out), so this cell failed with NameError on a fresh
# kernel. Restored from the commented-out code: sample as many predicted
# type-0 rows as there are predicted type-1 rows, capped at the number
# available, with a fixed seed for reproducibility.
# NOTE(review): confirm that sampling (rather than keeping every predicted
# type-0 row) is still the intended behaviour.
predicted_data_0 = data_to_predict_type.iloc[ind_predicted_0_type]
n_predicted_0 = min(predicted_data_1.shape[0], predicted_data_0.shape[0])
predicted_data_0 = predicted_data_0.sample(n_predicted_0, random_state=42)

predicted_data = pd.concat([predicted_data_0, predicted_data_1], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [454]:
# Re-read the preprocessed test CSV so `test_data` is rebound to a freshly loaded frame.
test_data = pd.read_csv('./preprocessed_data/test_data.csv')

def vectorize_text_features(train_data, test_data, col):
    """Replace a text column with character-level TF-IDF features.

    A ``TfidfVectorizer`` (character 1- and 2-grams, at most 50 features) is
    fitted on ``train_data[col]`` only and then applied to ``test_data[col]``.
    The resulting feature columns are appended to each frame and ``col``
    itself is dropped.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames that both contain ``col``. Neither input is modified.
    col : str
        Name of the text column to vectorize.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` with ``col`` replaced by one column per
        n-gram. Each output keeps the row index of its input.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer='char', max_features=50)

    train_matrix = vectorizer.fit_transform(train_data[col])
    test_matrix = vectorizer.transform(test_data[col])

    # vocabulary_ maps each n-gram to its column position, but iterating the
    # dict is not guaranteed to follow that position. Sort by position so each
    # label names the column that actually holds its values.
    vocabulary = vectorizer.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)

    def _append_features(frame, matrix):
        # Reuse the frame's own index so concat pastes the rows side by side.
        # A fresh 0..n-1 index would be aligned against the frame's labels and
        # produce NaN-padded rows for any frame without a default RangeIndex.
        text_df = pd.DataFrame(matrix.toarray(), columns=feature_names, index=frame.index)
        return pd.concat([frame.drop(columns=[col]), text_df], axis=1)

    return _append_features(train_data, train_matrix), _append_features(test_data, test_matrix)

# Fill missing test addresses from the next non-missing row. The result is
# assigned back to the column: calling fillna(..., inplace=True) on a selected
# column is not guaranteed to update the parent frame (it never does under
# pandas Copy-on-Write), and fillna(method=...) is deprecated.
test_data['address'] = test_data['address'].bfill()

# Rebind to a frame with a fresh 0..n-1 index instead of mutating in place.
predicted_data_1 = predicted_data_1.reset_index(drop=True)
predicted_data, test_data = vectorize_text_features(predicted_data_1, test_data, 'address')
#predicted_data, test_data = vectorize_text_features(predicted_data, test_data, 'text_floor')

In [455]:
# Display the shapes of the vectorized training and test frames.
predicted_data.shape, test_data.shape

((7887, 238), (2974, 238))

In [468]:
# baseline

# Regressor inputs: drop `id` and every column whose name contains one of
# these substrings (binned / aggregated features, currency rates, the target,
# raw text columns and the price-type label).
excluded_name_parts = (
    'bin_',
    'agg',
    'price_eur_rub',
    'price_usd_rub',
    'per_square_meter_price',
    'address',
    'text_floor',
    'price_type',
)
cols_pr = [
    col for col in list(predicted_data)
    if col != 'id' and not any(part in col for part in excluded_name_parts)
]

# 80/20 hold-out split with a fixed seed.
pr_x_train, pr_x_test, pr_y_train, pr_y_test = train_test_split(
    predicted_data[cols_pr],
    predicted_data['per_square_meter_price'],
    test_size=0.2,
    random_state=42,
)

# The regressor used to be bound to `estimator` while the rest of the cell
# called `model`, so the cell only ran if a `model` was left over from some
# other cell. Bind the new regressor to `model` so this cell fits and scores
# the object it defines.
model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='R2',
    task_type='CPU',
    cat_features=['osm_city_nearest_name',
                  'day',
                  'realty_type',
                  'region']
)
estimator = model  # keep the original name bound to the same object

model.fit(pr_x_train, pr_y_train)

# Hold-out metrics.
pr_y_pred = model.predict(pr_x_test)
r2 = r2_score(pr_y_test, pr_y_pred)
mae = mean_absolute_error(pr_y_test, pr_y_pred)
mse = mean_squared_error(pr_y_test, pr_y_pred)
rmse = np.sqrt(mse)

print(f'R2 = {r2:.2f} | MAE = {mae:.0f} | RMSE = {rmse:.0f}')

Learning rate set to 0.054174
0:	learn: 0.0220733	total: 68.7ms	remaining: 1m 8s
1:	learn: 0.0418128	total: 139ms	remaining: 1m 9s
2:	learn: 0.0614290	total: 225ms	remaining: 1m 14s
3:	learn: 0.0783156	total: 300ms	remaining: 1m 14s
4:	learn: 0.0900572	total: 378ms	remaining: 1m 15s
5:	learn: 0.1048255	total: 454ms	remaining: 1m 15s
6:	learn: 0.1215898	total: 528ms	remaining: 1m 14s
7:	learn: 0.1358973	total: 609ms	remaining: 1m 15s
8:	learn: 0.1493462	total: 708ms	remaining: 1m 17s
9:	learn: 0.1621230	total: 794ms	remaining: 1m 18s
10:	learn: 0.1723586	total: 886ms	remaining: 1m 19s
11:	learn: 0.1821834	total: 982ms	remaining: 1m 20s
12:	learn: 0.1925376	total: 1.07s	remaining: 1m 21s
13:	learn: 0.2006073	total: 1.14s	remaining: 1m 20s
14:	learn: 0.2123058	total: 1.22s	remaining: 1m 20s
15:	learn: 0.2227612	total: 1.3s	remaining: 1m 19s
16:	learn: 0.2332351	total: 1.39s	remaining: 1m 20s
17:	learn: 0.2403030	total: 1.47s	remaining: 1m 20s
18:	learn: 0.2455384	total: 1.55s	remaining: 1

159:	learn: 0.5315370	total: 13.7s	remaining: 1m 11s
160:	learn: 0.5316380	total: 13.8s	remaining: 1m 11s
161:	learn: 0.5324368	total: 13.9s	remaining: 1m 11s
162:	learn: 0.5330160	total: 14s	remaining: 1m 11s
163:	learn: 0.5343610	total: 14s	remaining: 1m 11s
164:	learn: 0.5381645	total: 14.1s	remaining: 1m 11s
165:	learn: 0.5396675	total: 14.2s	remaining: 1m 11s
166:	learn: 0.5402070	total: 14.3s	remaining: 1m 11s
167:	learn: 0.5403013	total: 14.4s	remaining: 1m 11s
168:	learn: 0.5419127	total: 14.4s	remaining: 1m 10s
169:	learn: 0.5424325	total: 14.5s	remaining: 1m 10s
170:	learn: 0.5432679	total: 14.6s	remaining: 1m 10s
171:	learn: 0.5438115	total: 14.7s	remaining: 1m 10s
172:	learn: 0.5438841	total: 14.8s	remaining: 1m 10s
173:	learn: 0.5443269	total: 14.8s	remaining: 1m 10s
174:	learn: 0.5451858	total: 14.9s	remaining: 1m 10s
175:	learn: 0.5479713	total: 15s	remaining: 1m 10s
176:	learn: 0.5504594	total: 15.1s	remaining: 1m 10s
177:	learn: 0.5506045	total: 15.2s	remaining: 1m 10s

320:	learn: 0.6736593	total: 27.6s	remaining: 58.4s
321:	learn: 0.6739086	total: 27.7s	remaining: 58.3s
322:	learn: 0.6741770	total: 27.8s	remaining: 58.2s
323:	learn: 0.6744517	total: 27.9s	remaining: 58.1s
324:	learn: 0.6757854	total: 28s	remaining: 58.1s
325:	learn: 0.6764944	total: 28.1s	remaining: 58s
326:	learn: 0.6770804	total: 28.2s	remaining: 58s
327:	learn: 0.6776347	total: 28.3s	remaining: 57.9s
328:	learn: 0.6777008	total: 28.4s	remaining: 57.8s
329:	learn: 0.6789855	total: 28.4s	remaining: 57.8s
330:	learn: 0.6795653	total: 28.5s	remaining: 57.7s
331:	learn: 0.6797793	total: 28.6s	remaining: 57.6s
332:	learn: 0.6800158	total: 28.7s	remaining: 57.6s
333:	learn: 0.6810866	total: 28.8s	remaining: 57.5s
334:	learn: 0.6817607	total: 28.9s	remaining: 57.4s
335:	learn: 0.6818237	total: 29s	remaining: 57.3s
336:	learn: 0.6821380	total: 29.1s	remaining: 57.2s
337:	learn: 0.6823517	total: 29.2s	remaining: 57.1s
338:	learn: 0.6833599	total: 29.2s	remaining: 57s
339:	learn: 0.6840249	

479:	learn: 0.7620707	total: 41.5s	remaining: 45s
480:	learn: 0.7620883	total: 41.6s	remaining: 44.9s
481:	learn: 0.7624352	total: 41.7s	remaining: 44.8s
482:	learn: 0.7631735	total: 41.8s	remaining: 44.7s
483:	learn: 0.7636052	total: 41.8s	remaining: 44.6s
484:	learn: 0.7639367	total: 41.9s	remaining: 44.5s
485:	learn: 0.7639514	total: 42s	remaining: 44.4s
486:	learn: 0.7645957	total: 42.1s	remaining: 44.3s
487:	learn: 0.7648435	total: 42.1s	remaining: 44.2s
488:	learn: 0.7649272	total: 42.2s	remaining: 44.1s
489:	learn: 0.7651281	total: 42.3s	remaining: 44s
490:	learn: 0.7654669	total: 42.4s	remaining: 43.9s
491:	learn: 0.7658555	total: 42.5s	remaining: 43.8s
492:	learn: 0.7662516	total: 42.5s	remaining: 43.7s
493:	learn: 0.7664472	total: 42.6s	remaining: 43.7s
494:	learn: 0.7668598	total: 42.7s	remaining: 43.6s
495:	learn: 0.7670676	total: 42.8s	remaining: 43.5s
496:	learn: 0.7676291	total: 42.9s	remaining: 43.4s
497:	learn: 0.7681376	total: 43s	remaining: 43.3s
498:	learn: 0.768412

640:	learn: 0.8090310	total: 55.9s	remaining: 31.3s
641:	learn: 0.8091892	total: 55.9s	remaining: 31.2s
642:	learn: 0.8092771	total: 56s	remaining: 31.1s
643:	learn: 0.8095438	total: 56.1s	remaining: 31s
644:	learn: 0.8097322	total: 56.2s	remaining: 30.9s
645:	learn: 0.8099850	total: 56.3s	remaining: 30.8s
646:	learn: 0.8103209	total: 56.4s	remaining: 30.8s
647:	learn: 0.8106795	total: 56.5s	remaining: 30.7s
648:	learn: 0.8108026	total: 56.6s	remaining: 30.6s
649:	learn: 0.8109295	total: 56.7s	remaining: 30.5s
650:	learn: 0.8110425	total: 56.7s	remaining: 30.4s
651:	learn: 0.8113760	total: 56.8s	remaining: 30.3s
652:	learn: 0.8115964	total: 56.9s	remaining: 30.2s
653:	learn: 0.8117419	total: 57s	remaining: 30.2s
654:	learn: 0.8119683	total: 57.1s	remaining: 30.1s
655:	learn: 0.8128650	total: 57.2s	remaining: 30s
656:	learn: 0.8131099	total: 57.3s	remaining: 29.9s
657:	learn: 0.8134444	total: 57.4s	remaining: 29.8s
658:	learn: 0.8139464	total: 57.4s	remaining: 29.7s
659:	learn: 0.814075

799:	learn: 0.8426868	total: 1m 9s	remaining: 17.5s
800:	learn: 0.8428141	total: 1m 9s	remaining: 17.4s
801:	learn: 0.8429202	total: 1m 9s	remaining: 17.3s
802:	learn: 0.8430250	total: 1m 10s	remaining: 17.2s
803:	learn: 0.8431121	total: 1m 10s	remaining: 17.1s
804:	learn: 0.8432640	total: 1m 10s	remaining: 17s
805:	learn: 0.8432911	total: 1m 10s	remaining: 16.9s
806:	learn: 0.8434725	total: 1m 10s	remaining: 16.8s
807:	learn: 0.8436099	total: 1m 10s	remaining: 16.7s
808:	learn: 0.8437912	total: 1m 10s	remaining: 16.6s
809:	learn: 0.8439118	total: 1m 10s	remaining: 16.6s
810:	learn: 0.8441070	total: 1m 10s	remaining: 16.5s
811:	learn: 0.8442049	total: 1m 10s	remaining: 16.4s
812:	learn: 0.8443093	total: 1m 10s	remaining: 16.3s
813:	learn: 0.8445092	total: 1m 10s	remaining: 16.2s
814:	learn: 0.8447554	total: 1m 11s	remaining: 16.1s
815:	learn: 0.8447631	total: 1m 11s	remaining: 16s
816:	learn: 0.8449226	total: 1m 11s	remaining: 16s
817:	learn: 0.8449259	total: 1m 11s	remaining: 15.9s
81

957:	learn: 0.8651476	total: 1m 23s	remaining: 3.65s
958:	learn: 0.8652437	total: 1m 23s	remaining: 3.56s
959:	learn: 0.8653517	total: 1m 23s	remaining: 3.48s
960:	learn: 0.8653973	total: 1m 23s	remaining: 3.39s
961:	learn: 0.8654221	total: 1m 23s	remaining: 3.3s
962:	learn: 0.8655048	total: 1m 23s	remaining: 3.21s
963:	learn: 0.8656786	total: 1m 23s	remaining: 3.13s
964:	learn: 0.8657658	total: 1m 23s	remaining: 3.04s
965:	learn: 0.8658199	total: 1m 23s	remaining: 2.95s
966:	learn: 0.8659735	total: 1m 23s	remaining: 2.87s
967:	learn: 0.8660339	total: 1m 24s	remaining: 2.78s
968:	learn: 0.8662165	total: 1m 24s	remaining: 2.69s
969:	learn: 0.8663020	total: 1m 24s	remaining: 2.61s
970:	learn: 0.8663073	total: 1m 24s	remaining: 2.52s
971:	learn: 0.8663790	total: 1m 24s	remaining: 2.43s
972:	learn: 0.8663970	total: 1m 24s	remaining: 2.35s
973:	learn: 0.8665607	total: 1m 24s	remaining: 2.26s
974:	learn: 0.8667354	total: 1m 24s	remaining: 2.17s
975:	learn: 0.8668303	total: 1m 24s	remaining: 

ValueError: Input contains NaN

In [457]:
import sys

# Add the baseline code directory to the module search path.
RAIF_BASELINE_DIR = './raif/notebooks/baseline/raifhack_ds/'
sys.path.append(RAIF_BASELINE_DIR)

In [458]:
from metrics import metrics_stat, deviation_metric

# Compute the metric bundle on the hold-out targets and predictions, then
# report its 'raif_metric' entry.
holdout_true = np.array(pr_y_test)
holdout_pred = np.array(pr_y_pred)
raif_metric = metrics_stat(holdout_true, holdout_pred)['raif_metric']
print(f'Raif = {raif_metric}')

Raif = 1.5765103412381927


In [370]:
# Predict the target for the test frame and write the submission file.
test_pred = model.predict(test_data[cols_pr])

# Build the frame column by column so `id` and the predictions each keep
# their own dtype. Stacking them with np.c_ forced both into one common dtype
# before they were written out.
submission = pd.DataFrame({
    'id': test_data['id'].to_numpy(),
    'per_square_meter_price': test_pred,
})
submission.to_csv('submission.csv', index=False)

In [485]:
# Train the price regressor on the rows labelled price_type == 1.
# reset_index returns a new frame with a 0..n-1 index; the previous in-place
# reset was applied to a filtered slice of train_data.
train_data_1 = train_data[train_data['price_type'] == 1].reset_index(drop=True)
test_data = pd.read_csv('./preprocessed_data/test_data.csv')

# Fill missing test addresses from the next non-missing row. The result is
# assigned back to the column: calling fillna(..., inplace=True) on a selected
# column is not guaranteed to update the parent frame (it never does under
# pandas Copy-on-Write), and fillna(method=...) is deprecated.
test_data['address'] = test_data['address'].bfill()

train_data_1, test_data = vectorize_text_features(train_data_1, test_data, 'address')

# Regressor inputs: drop `id` and every column whose name contains one of
# these substrings (currency rates, the target, raw text columns and the
# price-type label).
excluded_name_parts = (
    'price_eur_rub',
    'price_usd_rub',
    'per_square_meter_price',
    'address',
    'text_floor',
    'price_type',
)
cols_pr = [
    col for col in list(train_data_1)
    if col != 'id' and not any(part in col for part in excluded_name_parts)
]

# 80/20 hold-out split with a fixed seed.
pr_x_train, pr_x_test, pr_y_train, pr_y_test = train_test_split(
    train_data_1[cols_pr],
    train_data_1['per_square_meter_price'],
    test_size=0.2,
    random_state=42,
)

model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='R2',
    task_type='CPU',
    cat_features=['osm_city_nearest_name',
                  'day',
                  'realty_type',
                  'region']
)

model.fit(pr_x_train, pr_y_train)

# Hold-out metrics.
pr_y_pred = model.predict(pr_x_test)
r2 = r2_score(pr_y_test, pr_y_pred)
mae = mean_absolute_error(pr_y_test, pr_y_pred)
mse = mean_squared_error(pr_y_test, pr_y_pred)
rmse = np.sqrt(mse)

print(f'R2 = {r2:.2f} | MAE = {mae:.0f} | RMSE = {rmse:.0f}')

Learning rate set to 0.048396
0:	learn: 0.0134186	total: 74.1ms	remaining: 1m 14s
1:	learn: 0.0298240	total: 156ms	remaining: 1m 17s
2:	learn: 0.0457090	total: 231ms	remaining: 1m 16s
3:	learn: 0.0603941	total: 335ms	remaining: 1m 23s
4:	learn: 0.0728311	total: 449ms	remaining: 1m 29s
5:	learn: 0.0852050	total: 545ms	remaining: 1m 30s
6:	learn: 0.0988385	total: 633ms	remaining: 1m 29s
7:	learn: 0.1083143	total: 722ms	remaining: 1m 29s
8:	learn: 0.1187948	total: 813ms	remaining: 1m 29s
9:	learn: 0.1284053	total: 921ms	remaining: 1m 31s
10:	learn: 0.1423921	total: 1.02s	remaining: 1m 31s
11:	learn: 0.1543725	total: 1.11s	remaining: 1m 31s
12:	learn: 0.1664349	total: 1.21s	remaining: 1m 31s
13:	learn: 0.1733565	total: 1.3s	remaining: 1m 31s
14:	learn: 0.1807271	total: 1.39s	remaining: 1m 31s
15:	learn: 0.1872555	total: 1.49s	remaining: 1m 31s
16:	learn: 0.1970028	total: 1.59s	remaining: 1m 31s
17:	learn: 0.2028767	total: 1.7s	remaining: 1m 32s
18:	learn: 0.2090089	total: 1.83s	remaining: 

159:	learn: 0.6500556	total: 15.6s	remaining: 1m 21s
160:	learn: 0.6512386	total: 15.7s	remaining: 1m 21s
161:	learn: 0.6516306	total: 15.8s	remaining: 1m 21s
162:	learn: 0.6522667	total: 15.9s	remaining: 1m 21s
163:	learn: 0.6529781	total: 16s	remaining: 1m 21s
164:	learn: 0.6617467	total: 16.1s	remaining: 1m 21s
165:	learn: 0.6634800	total: 16.2s	remaining: 1m 21s
166:	learn: 0.6650252	total: 16.3s	remaining: 1m 21s
167:	learn: 0.6652651	total: 16.5s	remaining: 1m 21s
168:	learn: 0.6657197	total: 16.6s	remaining: 1m 21s
169:	learn: 0.6663479	total: 16.7s	remaining: 1m 21s
170:	learn: 0.6668315	total: 16.8s	remaining: 1m 21s
171:	learn: 0.6670835	total: 16.9s	remaining: 1m 21s
172:	learn: 0.6682191	total: 17s	remaining: 1m 21s
173:	learn: 0.6686230	total: 17.1s	remaining: 1m 21s
174:	learn: 0.6722603	total: 17.2s	remaining: 1m 20s
175:	learn: 0.6726738	total: 17.3s	remaining: 1m 20s
176:	learn: 0.6730895	total: 17.3s	remaining: 1m 20s
177:	learn: 0.6759826	total: 17.4s	remaining: 1m 2

316:	learn: 0.8331624	total: 31.5s	remaining: 1m 7s
317:	learn: 0.8348740	total: 31.6s	remaining: 1m 7s
318:	learn: 0.8355181	total: 31.7s	remaining: 1m 7s
319:	learn: 0.8359377	total: 31.8s	remaining: 1m 7s
320:	learn: 0.8362394	total: 31.9s	remaining: 1m 7s
321:	learn: 0.8373791	total: 32s	remaining: 1m 7s
322:	learn: 0.8376609	total: 32.1s	remaining: 1m 7s
323:	learn: 0.8379439	total: 32.2s	remaining: 1m 7s
324:	learn: 0.8386006	total: 32.3s	remaining: 1m 7s
325:	learn: 0.8391167	total: 32.4s	remaining: 1m 6s
326:	learn: 0.8394613	total: 32.5s	remaining: 1m 6s
327:	learn: 0.8403769	total: 32.6s	remaining: 1m 6s
328:	learn: 0.8410098	total: 32.7s	remaining: 1m 6s
329:	learn: 0.8412971	total: 32.8s	remaining: 1m 6s
330:	learn: 0.8414270	total: 32.9s	remaining: 1m 6s
331:	learn: 0.8420104	total: 33s	remaining: 1m 6s
332:	learn: 0.8421819	total: 33.1s	remaining: 1m 6s
333:	learn: 0.8425774	total: 33.2s	remaining: 1m 6s
334:	learn: 0.8443717	total: 33.3s	remaining: 1m 6s
335:	learn: 0.84

478:	learn: 0.9035514	total: 48.2s	remaining: 52.4s
479:	learn: 0.9036754	total: 48.3s	remaining: 52.3s
480:	learn: 0.9038235	total: 48.4s	remaining: 52.2s
481:	learn: 0.9041845	total: 48.5s	remaining: 52.1s
482:	learn: 0.9047084	total: 48.6s	remaining: 52s
483:	learn: 0.9047601	total: 48.7s	remaining: 51.9s
484:	learn: 0.9049719	total: 48.8s	remaining: 51.8s
485:	learn: 0.9050655	total: 48.9s	remaining: 51.7s
486:	learn: 0.9058616	total: 49s	remaining: 51.6s
487:	learn: 0.9060291	total: 49.1s	remaining: 51.5s
488:	learn: 0.9065294	total: 49.2s	remaining: 51.4s
489:	learn: 0.9067414	total: 49.3s	remaining: 51.3s
490:	learn: 0.9068564	total: 49.4s	remaining: 51.2s
491:	learn: 0.9073018	total: 49.5s	remaining: 51.1s
492:	learn: 0.9074441	total: 49.6s	remaining: 51s
493:	learn: 0.9077089	total: 49.7s	remaining: 50.9s
494:	learn: 0.9078169	total: 49.8s	remaining: 50.8s
495:	learn: 0.9079291	total: 49.9s	remaining: 50.7s
496:	learn: 0.9082892	total: 50s	remaining: 50.6s
497:	learn: 0.908345

639:	learn: 0.9356263	total: 1m 4s	remaining: 36.4s
640:	learn: 0.9359812	total: 1m 4s	remaining: 36.3s
641:	learn: 0.9360954	total: 1m 4s	remaining: 36.2s
642:	learn: 0.9362501	total: 1m 5s	remaining: 36.1s
643:	learn: 0.9364438	total: 1m 5s	remaining: 36s
644:	learn: 0.9365849	total: 1m 5s	remaining: 35.9s
645:	learn: 0.9366055	total: 1m 5s	remaining: 35.8s
646:	learn: 0.9366275	total: 1m 5s	remaining: 35.7s
647:	learn: 0.9367062	total: 1m 5s	remaining: 35.6s
648:	learn: 0.9368177	total: 1m 5s	remaining: 35.5s
649:	learn: 0.9369126	total: 1m 5s	remaining: 35.4s
650:	learn: 0.9369483	total: 1m 5s	remaining: 35.3s
651:	learn: 0.9370638	total: 1m 5s	remaining: 35.2s
652:	learn: 0.9371662	total: 1m 6s	remaining: 35.1s
653:	learn: 0.9372858	total: 1m 6s	remaining: 35s
654:	learn: 0.9374209	total: 1m 6s	remaining: 34.9s
655:	learn: 0.9374502	total: 1m 6s	remaining: 34.8s
656:	learn: 0.9374841	total: 1m 6s	remaining: 34.7s
657:	learn: 0.9376209	total: 1m 6s	remaining: 34.6s
658:	learn: 0.93

796:	learn: 0.9522746	total: 1m 20s	remaining: 20.6s
797:	learn: 0.9524035	total: 1m 20s	remaining: 20.5s
798:	learn: 0.9524178	total: 1m 20s	remaining: 20.4s
799:	learn: 0.9525174	total: 1m 21s	remaining: 20.3s
800:	learn: 0.9526160	total: 1m 21s	remaining: 20.2s
801:	learn: 0.9526803	total: 1m 21s	remaining: 20.1s
802:	learn: 0.9527559	total: 1m 21s	remaining: 20s
803:	learn: 0.9528475	total: 1m 21s	remaining: 19.9s
804:	learn: 0.9529275	total: 1m 21s	remaining: 19.8s
805:	learn: 0.9530002	total: 1m 21s	remaining: 19.7s
806:	learn: 0.9530432	total: 1m 21s	remaining: 19.5s
807:	learn: 0.9532402	total: 1m 21s	remaining: 19.4s
808:	learn: 0.9532987	total: 1m 21s	remaining: 19.3s
809:	learn: 0.9533538	total: 1m 22s	remaining: 19.2s
810:	learn: 0.9535871	total: 1m 22s	remaining: 19.1s
811:	learn: 0.9536623	total: 1m 22s	remaining: 19s
812:	learn: 0.9537341	total: 1m 22s	remaining: 18.9s
813:	learn: 0.9538176	total: 1m 22s	remaining: 18.8s
814:	learn: 0.9539235	total: 1m 22s	remaining: 18.

954:	learn: 0.9626755	total: 1m 36s	remaining: 4.56s
955:	learn: 0.9626780	total: 1m 36s	remaining: 4.46s
956:	learn: 0.9627318	total: 1m 36s	remaining: 4.36s
957:	learn: 0.9629100	total: 1m 37s	remaining: 4.26s
958:	learn: 0.9629777	total: 1m 37s	remaining: 4.16s
959:	learn: 0.9629887	total: 1m 37s	remaining: 4.05s
960:	learn: 0.9630097	total: 1m 37s	remaining: 3.95s
961:	learn: 0.9630201	total: 1m 37s	remaining: 3.85s
962:	learn: 0.9630949	total: 1m 37s	remaining: 3.75s
963:	learn: 0.9631055	total: 1m 37s	remaining: 3.65s
964:	learn: 0.9631415	total: 1m 37s	remaining: 3.55s
965:	learn: 0.9632137	total: 1m 37s	remaining: 3.44s
966:	learn: 0.9632831	total: 1m 37s	remaining: 3.34s
967:	learn: 0.9633523	total: 1m 38s	remaining: 3.24s
968:	learn: 0.9634081	total: 1m 38s	remaining: 3.14s
969:	learn: 0.9634126	total: 1m 38s	remaining: 3.04s
970:	learn: 0.9634879	total: 1m 38s	remaining: 2.94s
971:	learn: 0.9635375	total: 1m 38s	remaining: 2.84s
972:	learn: 0.9635682	total: 1m 38s	remaining:

In [486]:
from metrics import metrics_stat, deviation_metric

# Report the 'raif_metric' entry of the metric bundle for the current
# hold-out targets and predictions.
stats = metrics_stat(np.array(pr_y_test), np.array(pr_y_pred))
raif_metric = stats['raif_metric']
print(f'Raif = {raif_metric}')

Raif = 1.7240512634164107
