In [1]:
import os

from sqlalchemy import create_engine

import pandas as pd
import datetime as dt

from catboost import CatBoostClassifier

In [2]:
def batch_load_sql(query: str) -> pd.DataFrame:
    CHUNKSIZE = 200000
    engine = create_engine("postgresql://robot-startml-ro:pheiph0hahj1Vaif\
@postgres.lab.karpov.courses:6432/startml")
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    for chunk_dataframe in pd.read_sql(query, conn, chunksize=CHUNKSIZE):
        chunks.append(chunk_dataframe)
    conn.close()
    return pd.concat(chunks, ignore_index=True)

In [3]:
user_df = batch_load_sql('SELECT * FROM public.user_data')
post_df = batch_load_sql('SELECT * FROM public.post_text_df')
feed_df = batch_load_sql('SELECT * FROM public.feed_data LIMIT 100000')

In [4]:
def transform_data(user_df, post_df, feed_df):
    united_data = feed_df.merge(post_df, on='post_id')
    united_data = pd.merge(united_data, user_df, on='user_id')
    united_data.drop(['city', 'text'], axis=1, inplace=True)
    ohe_cols = ['action', 'topic', 'country', 'os', 'source']
    for col in ohe_cols:
        tmp = pd.get_dummies(united_data[col])
        united_data = pd.concat((united_data, tmp), axis=1)
        united_data.drop(col, axis=1, inplace=True)
    united_data.drop('post_id', axis=1, inplace=True)
    return united_data

In [5]:
X = transform_data(user_df, post_df, feed_df)

In [6]:
X_train = X.sort_values(by='timestamp').iloc[:-int(X.shape[0]*.25)]
X_test = X.sort_values(by='timestamp').iloc[-int(X.shape[0]*.25):]
y_train = X_train['target']
y_test = X_test['target']

In [7]:
cat = CatBoostClassifier()

cat.fit(X_train, y_train)

cat.save_model('catboost_model', format='cbm')

Learning rate set to 0.065101
0:	learn: 0.4531249	total: 74.2ms	remaining: 1m 14s
1:	learn: 0.2795675	total: 83.5ms	remaining: 41.7s
2:	learn: 0.1814023	total: 90.9ms	remaining: 30.2s
3:	learn: 0.1161174	total: 98ms	remaining: 24.4s
4:	learn: 0.0751772	total: 105ms	remaining: 20.8s
5:	learn: 0.0515316	total: 111ms	remaining: 18.4s
6:	learn: 0.0347471	total: 117ms	remaining: 16.6s
7:	learn: 0.0243696	total: 124ms	remaining: 15.3s
8:	learn: 0.0172663	total: 130ms	remaining: 14.3s
9:	learn: 0.0122832	total: 136ms	remaining: 13.4s
10:	learn: 0.0094818	total: 142ms	remaining: 12.8s
11:	learn: 0.0072983	total: 148ms	remaining: 12.2s
12:	learn: 0.0056430	total: 154ms	remaining: 11.7s
13:	learn: 0.0045081	total: 161ms	remaining: 11.4s
14:	learn: 0.0036004	total: 167ms	remaining: 11s
15:	learn: 0.0028890	total: 174ms	remaining: 10.7s
16:	learn: 0.0023370	total: 180ms	remaining: 10.4s
17:	learn: 0.0019306	total: 186ms	remaining: 10.1s
18:	learn: 0.0016413	total: 192ms	remaining: 9.91s
19:	learn:

190:	learn: 0.0001162	total: 1.19s	remaining: 5.04s
191:	learn: 0.0001162	total: 1.2s	remaining: 5.03s
192:	learn: 0.0001162	total: 1.2s	remaining: 5.03s
193:	learn: 0.0001162	total: 1.21s	remaining: 5.02s
194:	learn: 0.0001162	total: 1.21s	remaining: 5.01s
195:	learn: 0.0001162	total: 1.22s	remaining: 5s
196:	learn: 0.0001162	total: 1.23s	remaining: 4.99s
197:	learn: 0.0001162	total: 1.23s	remaining: 4.99s
198:	learn: 0.0001162	total: 1.24s	remaining: 4.98s
199:	learn: 0.0001162	total: 1.24s	remaining: 4.97s
200:	learn: 0.0001162	total: 1.25s	remaining: 4.96s
201:	learn: 0.0001162	total: 1.25s	remaining: 4.95s
202:	learn: 0.0001162	total: 1.26s	remaining: 4.94s
203:	learn: 0.0001162	total: 1.26s	remaining: 4.93s
204:	learn: 0.0001162	total: 1.27s	remaining: 4.92s
205:	learn: 0.0001162	total: 1.27s	remaining: 4.91s
206:	learn: 0.0001162	total: 1.28s	remaining: 4.9s
207:	learn: 0.0001162	total: 1.28s	remaining: 4.89s
208:	learn: 0.0001162	total: 1.29s	remaining: 4.88s
209:	learn: 0.0001

377:	learn: 0.0001162	total: 2.17s	remaining: 3.57s
378:	learn: 0.0001162	total: 2.18s	remaining: 3.57s
379:	learn: 0.0001162	total: 2.18s	remaining: 3.56s
380:	learn: 0.0001162	total: 2.19s	remaining: 3.56s
381:	learn: 0.0001162	total: 2.19s	remaining: 3.55s
382:	learn: 0.0001162	total: 2.2s	remaining: 3.54s
383:	learn: 0.0001162	total: 2.2s	remaining: 3.54s
384:	learn: 0.0001162	total: 2.21s	remaining: 3.53s
385:	learn: 0.0001162	total: 2.21s	remaining: 3.52s
386:	learn: 0.0001162	total: 2.22s	remaining: 3.52s
387:	learn: 0.0001162	total: 2.22s	remaining: 3.51s
388:	learn: 0.0001162	total: 2.23s	remaining: 3.5s
389:	learn: 0.0001162	total: 2.23s	remaining: 3.5s
390:	learn: 0.0001162	total: 2.24s	remaining: 3.49s
391:	learn: 0.0001162	total: 2.25s	remaining: 3.48s
392:	learn: 0.0001162	total: 2.25s	remaining: 3.48s
393:	learn: 0.0001162	total: 2.26s	remaining: 3.47s
394:	learn: 0.0001162	total: 2.26s	remaining: 3.46s
395:	learn: 0.0001162	total: 2.27s	remaining: 3.46s
396:	learn: 0.00

563:	learn: 0.0001162	total: 3.16s	remaining: 2.44s
564:	learn: 0.0001162	total: 3.17s	remaining: 2.44s
565:	learn: 0.0001162	total: 3.17s	remaining: 2.43s
566:	learn: 0.0001162	total: 3.18s	remaining: 2.43s
567:	learn: 0.0001162	total: 3.18s	remaining: 2.42s
568:	learn: 0.0001162	total: 3.19s	remaining: 2.42s
569:	learn: 0.0001162	total: 3.19s	remaining: 2.41s
570:	learn: 0.0001162	total: 3.2s	remaining: 2.4s
571:	learn: 0.0001162	total: 3.21s	remaining: 2.4s
572:	learn: 0.0001162	total: 3.21s	remaining: 2.39s
573:	learn: 0.0001162	total: 3.21s	remaining: 2.39s
574:	learn: 0.0001162	total: 3.22s	remaining: 2.38s
575:	learn: 0.0001162	total: 3.23s	remaining: 2.38s
576:	learn: 0.0001162	total: 3.23s	remaining: 2.37s
577:	learn: 0.0001162	total: 3.24s	remaining: 2.36s
578:	learn: 0.0001162	total: 3.24s	remaining: 2.36s
579:	learn: 0.0001162	total: 3.25s	remaining: 2.35s
580:	learn: 0.0001162	total: 3.25s	remaining: 2.35s
581:	learn: 0.0001162	total: 3.26s	remaining: 2.34s
582:	learn: 0.0

752:	learn: 0.0001162	total: 4.15s	remaining: 1.36s
753:	learn: 0.0001162	total: 4.16s	remaining: 1.36s
754:	learn: 0.0001162	total: 4.17s	remaining: 1.35s
755:	learn: 0.0001162	total: 4.17s	remaining: 1.35s
756:	learn: 0.0001162	total: 4.18s	remaining: 1.34s
757:	learn: 0.0001162	total: 4.18s	remaining: 1.33s
758:	learn: 0.0001162	total: 4.19s	remaining: 1.33s
759:	learn: 0.0001162	total: 4.19s	remaining: 1.32s
760:	learn: 0.0001162	total: 4.2s	remaining: 1.32s
761:	learn: 0.0001162	total: 4.2s	remaining: 1.31s
762:	learn: 0.0001162	total: 4.21s	remaining: 1.31s
763:	learn: 0.0001162	total: 4.21s	remaining: 1.3s
764:	learn: 0.0001162	total: 4.22s	remaining: 1.29s
765:	learn: 0.0001162	total: 4.22s	remaining: 1.29s
766:	learn: 0.0001162	total: 4.23s	remaining: 1.28s
767:	learn: 0.0001162	total: 4.23s	remaining: 1.28s
768:	learn: 0.0001162	total: 4.24s	remaining: 1.27s
769:	learn: 0.0001162	total: 4.24s	remaining: 1.27s
770:	learn: 0.0001162	total: 4.25s	remaining: 1.26s
771:	learn: 0.0

940:	learn: 0.0001162	total: 5.14s	remaining: 322ms
941:	learn: 0.0001162	total: 5.15s	remaining: 317ms
942:	learn: 0.0001162	total: 5.15s	remaining: 311ms
943:	learn: 0.0001162	total: 5.16s	remaining: 306ms
944:	learn: 0.0001162	total: 5.16s	remaining: 300ms
945:	learn: 0.0001162	total: 5.17s	remaining: 295ms
946:	learn: 0.0001162	total: 5.17s	remaining: 290ms
947:	learn: 0.0001162	total: 5.18s	remaining: 284ms
948:	learn: 0.0001162	total: 5.18s	remaining: 279ms
949:	learn: 0.0001162	total: 5.19s	remaining: 273ms
950:	learn: 0.0001162	total: 5.2s	remaining: 268ms
951:	learn: 0.0001162	total: 5.2s	remaining: 262ms
952:	learn: 0.0001162	total: 5.21s	remaining: 257ms
953:	learn: 0.0001162	total: 5.21s	remaining: 251ms
954:	learn: 0.0001162	total: 5.21s	remaining: 246ms
955:	learn: 0.0001162	total: 5.22s	remaining: 240ms
956:	learn: 0.0001162	total: 5.23s	remaining: 235ms
957:	learn: 0.0001162	total: 5.23s	remaining: 229ms
958:	learn: 0.0001162	total: 5.24s	remaining: 224ms
959:	learn: 0.

In [8]:
def get_model_path(path: str) -> str:
    if os.environ.get("IS_LMS") == "1":  # проверяем где выполняется код в лмс, или локально. Немного магии
        MODEL_PATH = '/workdir/user_input/model'
    else:
        MODEL_PATH = path
    return MODEL_PATH

def load_models():
    model_path = get_model_path("catboost_model")
    from_file = CatBoostClassifier()
    from_file.load_model(model_path)
    return from_file

In [9]:
# def load_features() -> pd.DataFrame:
    

In [10]:
cb_model = load_models()

In [11]:
from sklearn.metrics import accuracy_score

In [12]:
accuracy_score(y_test, cb_model.predict(X_test))

1.0