<a href="https://colab.research.google.com/github/klosow/ocr-forms/blob/master/house_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [85]:
!pip install keras-bert catboost xgboost lightgbm transformers



In [86]:
import pandas as pd
import numpy as np
import gc
import glob
import re
import os
import joblib
import math

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

import catboost as ctb
import xgboost as xgb
import lightgbm as lgb

os.environ['TF_KERAS'] = '1'
from keras_bert import load_trained_model_from_checkpoint
from transformers import BertForMaskedLM, BertTokenizer, TFBertEmbeddings, AutoModel, AutoTokenizer

In [87]:
proj_dir = r'/content/drive/MyDrive/Data Workshop/'
model_dir = r'/content/drive/MyDrive/Data Workshop/model/bert/bg_cs_pl_ru_cased_L-12_H-768_A-12'
config_path = os.path.join(model_dir, 'bert_config.json')
checkpoint_path = os.path.join(model_dir, 'bert_model.ckpt')
vocab_path = os.path.join(model_dir, 'vocab.txt')

In [None]:
# model_name = "djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12"
# model_name = "http://huggingface.co/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12"
# model_name = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# bert_model = AutoModel.from_pretrained(model_name)

In [None]:
tokenizer = BertTokenizer(vocab_path)
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=False)

In [88]:
df = joblib.load('/content/drive/MyDrive/Data Workshop/df.joblib')

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29580 entries, 1 to 29575
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                29580 non-null  int64  
 1   text_description  29580 non-null  object 
 2   price             14790 non-null  object 
 3   price_value       14790 non-null  float64
 4   text              29580 non-null  object 
 5   tokens            29580 non-null  object 
 6   input_ids         29580 non-null  object 
 7   token_type_ids    29580 non-null  object 
 8   attention_mask    29580 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 2.3+ MB


Ponowna tokenizacja

In [None]:
tokenize = lambda sent: tokenizer.encode_plus(sent, max_length=512, padding='max_length', truncation=True)

In [None]:
%time df['tokens'] = df['text'].map(tokenize)

CPU times: user 3min 48s, sys: 1.09 s, total: 3min 49s
Wall time: 3min 49s


In [None]:
df['input_ids'] = df['tokens'].map(lambda t: t['input_ids'])
df['token_type_ids'] = df['tokens'].map(lambda t: t['token_type_ids'])
df['attention_mask'] = df['tokens'].map(lambda t: t['attention_mask'])

input_ids = np.stack(df['input_ids'])
token_type_ids = np.stack(df['token_type_ids'])
attention_mask = np.stack(df['attention_mask'])

In [None]:
input_ids = np.stack(df['input_ids'])
token_type_ids = np.stack(df['token_type_ids'])
attention_mask = np.stack(df['attention_mask'])

In [None]:
input_ids.shape, token_type_ids.shape, attention_mask.shape

((29580, 512), (29580, 512), (29580, 512))

In [None]:
gc.collect()

1381

In [7]:
output_dir = r'/content/drive/MyDrive/Data Workshop/output/'

In [None]:
max_len = 29580
chunk = 32*50
liczba_plikow = math.ceil(max_len/chunk)
for i in range(liczba_plikow):
    chunk_start = i*chunk
    chunk_end = min((i+1)*chunk,max_len)
    print(f'predykcja chunku: {chunk_start}:{chunk_end}')
    %time predicts = bert_model.predict([input_ids[chunk_start:chunk_end], token_type_ids[chunk_start:chunk_end]], verbose=1)
    X = predicts[:, 0, :]
    fn = f'pred_{i:02d}_{chunk_start:05d}_{chunk_end:05d}.npz'
    print(f'chunk: {i} - {chunk_start}:{chunk_end} - {X.shape} - {fn}')
    np.savez_compressed(os.path.join(output_dir, fn), x=X)
    gc.collect()

predykcja chunku: 0:1600
CPU times: user 20.6 s, sys: 4.96 s, total: 25.6 s
Wall time: 2min 17s
chunk: 0 - 0:1600 - (1600, 768) - pred_00_00000_01600.npz
predykcja chunku: 1600:3200
CPU times: user 15.9 s, sys: 2.95 s, total: 18.8 s
Wall time: 2min 10s
chunk: 1 - 1600:3200 - (1600, 768) - pred_01_01600_03200.npz
predykcja chunku: 3200:4800
CPU times: user 15.9 s, sys: 2.84 s, total: 18.7 s
Wall time: 2min 21s
chunk: 2 - 3200:4800 - (1600, 768) - pred_02_03200_04800.npz
predykcja chunku: 4800:6400
CPU times: user 15.9 s, sys: 3.01 s, total: 18.9 s
Wall time: 2min 10s
chunk: 3 - 4800:6400 - (1600, 768) - pred_03_04800_06400.npz
predykcja chunku: 6400:8000
CPU times: user 16 s, sys: 2.94 s, total: 19 s
Wall time: 2min 10s
chunk: 4 - 6400:8000 - (1600, 768) - pred_04_06400_08000.npz
predykcja chunku: 8000:9600
CPU times: user 16.3 s, sys: 2.95 s, total: 19.3 s
Wall time: 2min 10s
chunk: 5 - 8000:9600 - (1600, 768) - pred_05_08000_09600.npz
predykcja chunku: 9600:11200
CPU times: user 15.7 

Zaczynamy uczenie :-)

In [90]:
fn_list = glob.glob(os.path.join(output_dir, '*.npz'))

In [91]:
fn_list

['/content/drive/MyDrive/Data Workshop/output/pred_00_00000_01600.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_01_01600_03200.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_02_03200_04800.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_03_04800_06400.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_04_06400_08000.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_05_08000_09600.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_06_09600_11200.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_07_11200_12800.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_08_12800_14400.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_09_14400_16000.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_10_16000_17600.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_11_17600_19200.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_12_19200_20800.npz',
 '/content/drive/MyDrive/Data Workshop/output/pred_13_20800_2240

In [92]:
arr_list = []
for fn in fn_list:
    loaded = np.load(fn)
    x = loaded['x']
    arr_list.append(x)
    loaded.close()

In [93]:
X = np.vstack(arr_list)

In [94]:
X.shape

(29580, 768)

In [95]:
y = df['price_value'].values

In [96]:
train_mask = ~df['price_value'].isnull().values
test_mask = df['price_value'].isnull().values

X_train = X[train_mask]
X_test = X[test_mask]
y_train = df[train_mask]['price_value'].values

In [111]:
def mae_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    return 'mae', mean_absolute_error(y_true, y_pred)


def testuj_model(X, y, model, cv=3):
    cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    scores = []

    for train_idx, test_idx in cv.split(X):
        X_train_cv, X_test_cv = X[train_idx], X[test_idx]
        y_train_cv, y_test_cv = y[train_idx], y[test_idx]

        # model.fit(X_train_cv, y_train_cv)

        eval_set = [(X_train_cv, y_train_cv),
                    (X_test_cv, y_test_cv)]
        
        model.fit(X_train_cv, y_train_cv,
                  eval_metric=mae_eval,
                  eval_set=eval_set,
                  verbose=10)

        # model.fit(X_train_cv, y_train_cv,
        #           eval_set=eval_set,
        #           verbose=10)

        # model.fit(X_train_cv, y_train_cv,
        #           eval_metric='mae',                  
        #           eval_set=eval_set,
        #           verbose=10)

        y_pred = model.predict(X_test_cv)
        error = mean_absolute_error(y_test_cv, y_pred)
        print(f'Mean absolute error: {error:.5f}')
        scores.append(error)

    print('Model:')
    print(model)
    print(f'Średni błąd:\t{np.mean(scores):.05f}')
    print(f'StdDev błąd:\t{np.std(scores):.05f}')

    return np.mean(scores)

In [81]:
ctb_params = {
    # 'loss_function' : 'MAE',
    }

In [82]:
 model = ctb.CatBoostRegressor(**ctb_params)

In [None]:
model = ctb.CatBoostRegressor()

In [109]:
xgb_params = {'max_depth:': 7,
              'learning_rate' : 0.07,
              'n_estimators': 950, 
              'reg_alpha' : 1.0,
              'min_child_weight': 3.0,
              'objective': 'reg:squarederror',
              'random_state': 42}

In [110]:
model = lgb.LGBMRegressor(n_estimators=900)
# model = xgb.XGBRegressor(**xgb_params)
# model = ctb.CatBoostClassifier(max_depth=7, n_estimators=50, verbose=0, random_state=0)

In [108]:
testuj_model(X_train, y_train, model)

[10]	training's l2: 4.93779e+09	training's l1: 59032.9	valid_1's l2: 5.46441e+09	valid_1's l1: 62190.3
[20]	training's l2: 4.344e+09	training's l1: 55126.3	valid_1's l2: 5.28241e+09	valid_1's l1: 60804.3
[30]	training's l2: 3.88694e+09	training's l1: 51951.1	valid_1's l2: 5.18044e+09	valid_1's l1: 59991.6
[40]	training's l2: 3.51357e+09	training's l1: 49334	valid_1's l2: 5.11245e+09	valid_1's l1: 59485.1
[50]	training's l2: 3.1906e+09	training's l1: 46892.7	valid_1's l2: 5.05491e+09	valid_1's l1: 58998.3
[60]	training's l2: 2.91985e+09	training's l1: 44768.6	valid_1's l2: 5.03001e+09	valid_1's l1: 58771.3
[70]	training's l2: 2.68061e+09	training's l1: 42806.8	valid_1's l2: 4.99698e+09	valid_1's l1: 58490.3
[80]	training's l2: 2.45759e+09	training's l1: 40908.4	valid_1's l2: 4.97782e+09	valid_1's l1: 58317.5
[90]	training's l2: 2.27546e+09	training's l1: 39267.8	valid_1's l2: 4.97416e+09	valid_1's l1: 58226.5
[100]	training's l2: 2.1036e+09	training's l1: 37678	valid_1's l2: 4.96137e+09

56133.57538832893

In [112]:
model = xgb.XGBRegressor(**xgb_params)

In [None]:
testuj_model(X_train, y_train, model)

[0]	validation_0-rmse:283530	validation_1-rmse:284612	validation_0-mae:273130	validation_1-mae:274159
[10]	validation_0-rmse:151693	validation_1-rmse:153227	validation_0-mae:132382	validation_1-mae:133572
[20]	validation_0-rmse:97142.8	validation_1-rmse:99081.6	validation_0-mae:78304.7	validation_1-mae:80303.6
[30]	validation_0-rmse:78491.8	validation_1-rmse:80839.5	validation_0-mae:64010.4	validation_1-mae:66441
[40]	validation_0-rmse:72897.5	validation_1-rmse:75576.4	validation_0-mae:60417.9	validation_1-mae:62949.7
[50]	validation_0-rmse:71007.8	validation_1-rmse:74009.2	validation_0-mae:59253.7	validation_1-mae:61948.6
[60]	validation_0-rmse:70095.6	validation_1-rmse:73480.4	validation_0-mae:58633.5	validation_1-mae:61640.6
[70]	validation_0-rmse:69358.3	validation_1-rmse:73177.8	validation_0-mae:58053.8	validation_1-mae:61394.8
[80]	validation_0-rmse:68841.8	validation_1-rmse:72983.5	validation_0-mae:57622.6	validation_1-mae:61252.4
[90]	validation_0-rmse:68326.5	validation_1-rmse

In [None]:
model = ctb.CatBoostRegressor()

In [83]:
model.get_all_params()

CatBoostError: ignored

In [72]:
model.fit(X_train, y_train, verbose=10)

Learning rate set to 0.062666
0:	learn: 76255.2971050	total: 297ms	remaining: 4m 56s
10:	learn: 74788.3515404	total: 2.62s	remaining: 3m 55s
20:	learn: 73795.9498258	total: 4.92s	remaining: 3m 49s
30:	learn: 73102.6126839	total: 7.2s	remaining: 3m 45s
40:	learn: 72426.0960823	total: 9.46s	remaining: 3m 41s
50:	learn: 71846.2732604	total: 11.7s	remaining: 3m 37s
60:	learn: 71314.0906395	total: 13.9s	remaining: 3m 34s
70:	learn: 70852.0132051	total: 16.2s	remaining: 3m 31s
80:	learn: 70354.4022388	total: 18.4s	remaining: 3m 28s
90:	learn: 69877.4006733	total: 20.6s	remaining: 3m 26s
100:	learn: 69495.0590963	total: 22.8s	remaining: 3m 23s
110:	learn: 69064.4174210	total: 25s	remaining: 3m 20s
120:	learn: 68665.9110187	total: 27.2s	remaining: 3m 17s
130:	learn: 68238.1022092	total: 29.5s	remaining: 3m 15s
140:	learn: 67832.2917151	total: 31.8s	remaining: 3m 13s
150:	learn: 67472.2466704	total: 34s	remaining: 3m 10s
160:	learn: 67043.3845841	total: 36.2s	remaining: 3m 8s
170:	learn: 66629.

<catboost.core.CatBoostRegressor at 0x7fcc8318d190>

In [73]:
preds = model.predict(X_test)

In [74]:
df_valid = df[test_mask].copy()

In [75]:
df_valid['price_value'] = preds

In [76]:
df_valid[['id', 'price_value']].to_csv(r'/content/drive/MyDrive/Data Workshop/output_pred/pred_2021_10_15_16_25_catboost.csv', index=False)