## Загрузим нужные библиотеки

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.metrics import recall_score
from catboost import CatBoostClassifier

from src.seed_ import seed_everything
from src.dicision_stats_features import create_decision

from src.usr_t_p import create_place_and_score
from src.tfidf_features import create_tfidf_features
from src.inference import calc_result, get_ans
from src.search_params import start_search

from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# Reload imported modules automatically before each cell executes, so edits to the
# project's src.* helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Fix seed

In [3]:
# Fix the random seeds up front, using the helper's default arguments.
# NOTE(review): which libraries get seeded is decided inside src.seed_ — confirm there.
seed_everything()

In [4]:
# Training table; its `id` column is the key used for every merge below.
df_train = pd.read_csv("data/train_dataset_train.csv")
# Decision log with `user_id`, `period` and `decision_id` columns.
df_user = pd.read_csv("data/user_decision.csv")

### Добавим новые значения

In [5]:
# Keep only the decision-log rows whose user appears in the training table.
list_id = df_train["id"].unique().tolist()

is_train_user = df_user["user_id"].isin(list_id)
df_user_train = df_user[is_train_user]
df_user_train

Unnamed: 0,user_id,period,decision_id
0,10625,1,409
2,10775,4,420
3,10236,2,284
4,10130,3,72
5,10273,2,140
...,...,...,...
60220,10528,1,88
60222,10832,3,315
60224,10189,1,33
60226,10424,2,140


In [6]:
# Build the decision-based feature tables for the training users. The result is
# iterated in the following cell and each element is merged on `user_id`; the
# features themselves are defined in src.dicision_stats_features.
new_datas = create_decision(df_user_train)

In [7]:
# Left-join every decision feature table onto a copy of the training table.
# The right-hand join key (`user_id`) duplicates `id`, so it is dropped after each merge.
train = df_train.copy()

for decision_features in new_datas:
    merged = train.merge(decision_features, how="left", left_on="id", right_on="user_id")
    train = merged.drop("user_id", axis=1)


In [8]:
# Inspect (rows, columns) of the training frame after the decision-feature merges.
train.shape

(680, 33)

### W2V

In [9]:
# datas = create_w2v_features()

In [10]:
# per1, per2, per3, per4 = datas["period_1"], datas["period_2"], datas["period_3"], datas["period_4"]

In [11]:
# train_w_vec = train.merge(per1, how="left", left_on="id", right_on="user_id").drop("user_id", axis=1)
# train_w_vec = train_w_vec.merge(per2, how="left", left_on="id", right_on="user_id").drop("user_id", axis=1)
# train_w_vec = train_w_vec.merge(per3, how="left", left_on="id", right_on="user_id").drop("user_id", axis=1)
# train_w_vec = train_w_vec.merge(per4, how="left", left_on="id", right_on="user_id").drop("user_id", axis=1)

In [12]:
# train_w_vec

### tfidf

In [13]:
# TF-IDF features from src.tfidf_features. Downstream the result is reset_index()-ed
# and joined on `user_id`.
tfidf_features = create_tfidf_features()
# a, b, c, d  = create_tfidf_features()

In [14]:
# tmp = a.drop("period", axis=1).merge(b.drop("period", axis=1), how='inner', left_on="user_id", right_on='user_id')
# tmp = tmp.merge(c.drop("period", axis=1), how='inner', left_on="user_id", right_on='user_id')
# tfidf_features = tmp.merge(d.drop("period", axis=1), how='inner', left_on="user_id", right_on='user_id')
# tfidf_features

In [15]:
# Attach the TF-IDF features to the training frame. reset_index() first turns the
# index of tfidf_features into a regular column — presumably that is where `user_id`
# lives (TODO confirm in src.tfidf_features).
tfidf_with_key = tfidf_features.reset_index()
train_w_vec = train.merge(tfidf_with_key, how="left", left_on="id", right_on="user_id")
train_w_vec = train_w_vec.drop("user_id", axis=1)

In [16]:
# Persist the TF-IDF features as an artifact (the index is written too, since
# index=False is not passed).
tfidf_features.to_csv("artifacts/tfidf_features.csv")

### User points

In [18]:
# Place/score features from src.usr_t_p; they are joined on `user_id` below.
place_score = create_place_and_score()

In [19]:
# Join the place/score features. Unlike the earlier merges, `user_id` is NOT dropped
# here; it stays in the frame until the feature matrix X is built.
train_vec_scores = train_w_vec.merge(place_score, how="left", left_on="id", right_on="user_id")

In [20]:
# Persist the place/score features as an artifact (without the index).
place_score.to_csv("artifacts/place_score.csv", index=False)

### Replace NaN

In [21]:
# train_vec_scores = train_vec_scores.fillna(0)

In [22]:
# Replace every NaN in the merged frame with the mean of its column.
# NOTE(review): the imputer is fitted on ALL columns, which at this point still
# include `id`, `user_id` and the four target columns. Confirm the targets contain
# no NaNs, otherwise they are filled with a (non-integer) column mean.
imp_mean = SimpleImputer(strategy="mean", missing_values=np.nan)
imputed_values = imp_mean.fit_transform(train_vec_scores)

# The imputer output is wrapped back into a DataFrame with the original column
# labels; the index is a fresh default one.
train_vec_scores_nans = pd.DataFrame(data=imputed_values, columns=train_vec_scores.columns)

In [23]:
# Save the fitted imputer so the same fill values can be reused outside this notebook
# (presumably by src.inference — TODO confirm).
pd.to_pickle(imp_mean, "artifacts/imp_mean.pkl")

## Выделим выборки

In [24]:
# Persist the imputed training frame (features, keys and targets) without the index.
train_vec_scores_nans.to_csv("artifacts/train_vec_scores_nans.csv", index=False)

In [25]:
# The four target columns; a separate classifier is trained for each of them below.
targets = ["Analytical thinking", "Systemic thinking", "Adaptability", "Focus"]

In [26]:
# Rows with index labels 592 and 544 are excluded from both X and y.
# NOTE(review): the notebook does not record why these two rows are left out.
excluded_rows = [592, 544]
train_rows = train_vec_scores_nans.drop(excluded_rows)

# Features are everything except the targets and the two key columns.
X = train_rows.drop(targets, axis=1).drop(["id", "user_id"], axis=1)
y = train_rows[targets]

In [27]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [28]:
# One (train, test) pair of single-column label frames per target.
# The double brackets keep each of them a DataFrame rather than a Series.
y_train_ada, y_test_ada = y_train[["Adaptability"]], y_test[["Adaptability"]]
y_train_sys, y_test_sys = y_train[["Systemic thinking"]], y_test[["Systemic thinking"]]
y_train_ana, y_test_ana = y_train[["Analytical thinking"]], y_test[["Analytical thinking"]]
y_train_foc, y_test_foc = y_train[["Focus"]], y_test[["Focus"]]

## Обучение модели

In [29]:
# Baseline CatBoost settings shared by all four per-target classifiers.
params = dict(
    verbose=250,  # logging period, in iterations
    od_wait=50,  # overfitting detector: iterations to wait before stopping
    od_type="Iter",
    iterations=2000,
    depth=4,
    learning_rate=0.007,
    loss_function="MultiClass",
    eval_metric="AUC",
)

# One independent, not-yet-fitted model per target.
model_ada, model_sys, model_ana, model_foc = (CatBoostClassifier(**params) for _ in range(4))

In [195]:
# Fit each per-target model. The hold-out split is passed as eval_set, so it drives
# the overfitting detector and the best-iteration shrinking reported in the log.
# NOTE(review): the same hold-out rows are used to score the models afterwards.
baseline_fits = (
    (model_ada, y_train_ada, y_test_ada),
    (model_sys, y_train_sys, y_test_sys),
    (model_ana, y_train_ana, y_test_ana),
    (model_foc, y_train_foc, y_test_foc),
)
for clf, y_fit, y_eval in baseline_fits:
    clf.fit(X_train, y_fit, eval_set=(X_test, y_eval))

# Display the last fitted model as the cell output.
model_foc

0:	test: 0.2389365	best: 0.2389365 (0)	total: 14.9ms	remaining: 29.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.2638173211
bestIteration = 4

Shrink model to first 5 iterations.
0:	test: 0.2910379	best: 0.2910379 (0)	total: 23.7ms	remaining: 47.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.438056338
bestIteration = 88

Shrink model to first 89 iterations.
0:	test: 0.3281970	best: 0.3281970 (0)	total: 29.9ms	remaining: 59.8s
250:	test: 0.4164001	best: 0.4175647 (227)	total: 3.53s	remaining: 24.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4175647254
bestIteration = 227

Shrink model to first 228 iterations.
0:	test: 0.0899306	best: 0.0899306 (0)	total: 17.8ms	remaining: 35.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1202060932
bestIteration = 143

Shrink model to first 144 iterations.


<catboost.core.CatBoostClassifier at 0x7ff24d9988b0>

## Оценка точности

In [196]:
# Predict the hold-out rows with each per-target model.
pred_ada = model_ada.predict(X_test)
pred_sys = model_sys.predict(X_test)
pred_ana = model_ana.predict(X_test)
pred_foc = model_foc.predict(X_test)

# Collect the predictions in one frame aligned with X_test, one column per target.
pred = pd.DataFrame(index=X_test.index)
predictions_by_target = (
    ("Adaptability", pred_ada),
    ("Systemic thinking", pred_sys),
    ("Analytical thinking", pred_ana),
    ("Focus", pred_foc),
)
for target_name, target_pred in predictions_by_target:
    pred[target_name] = target_pred

In [197]:
# Score: macro-averaged recall per target, then averaged over the targets.
result = 0

for col in y_test.columns:
    result += recall_score(y_test[col], pred[col], average='macro', zero_division=True)

# Divide by the actual number of target columns rather than a hard-coded 4,
# so the score stays correct if the target list changes.
print("Recall score", result / len(y_test.columns))

Recall score 0.28815841892880867


| local | public | submission |
|---|---|---|
| 0.30658 | 0.261382 | |
| 0.30897 | 0.258336 | |
| 0.31063 | 0.271379 | ans6 |
| 0.30201 | 0.263627 | ans7 |
| 0.31552 | 0.291058 | ans_ |

### Search Hyperparams

In [208]:
# Per-target hyper-parameter search (src.search_params.start_search). Each return value
# is used further down as a dict of CatBoostClassifier keyword arguments.
# NOTE(review): the hold-out split is handed to the search and the tuned models are
# later scored on that same split, so that score is likely optimistic — confirm how
# start_search uses X_test / y_test.
params_ada = start_search(X_train, X_test, y_train_ada, y_test_ada)
params_sys = start_search(X_train, X_test, y_train_sys, y_test_sys)
params_ana = start_search(X_train, X_test, y_train_ana, y_test_ana)
params_foc = start_search(X_train, X_test, y_train_foc, y_test_foc)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already speci

In [209]:
# Retrain with the tuned parameters and re-score on the hold-out split.

# Force the tree-growing policy and silence the training log for every tuned set.
for tuned_params in (params_ada, params_sys, params_ana, params_foc):
    tuned_params["grow_policy"] = "Lossguide"
    tuned_params["verbose"] = 0

model_ada = CatBoostClassifier(**params_ada)
model_sys = CatBoostClassifier(**params_sys)
model_ana = CatBoostClassifier(**params_ana)
model_foc = CatBoostClassifier(**params_foc)

# Fit each model on its own target; the hold-out split is passed as eval_set.
tuned_fits = (
    (model_ada, y_train_ada, y_test_ada),
    (model_sys, y_train_sys, y_test_sys),
    (model_ana, y_train_ana, y_test_ana),
    (model_foc, y_train_foc, y_test_foc),
)
for clf, y_fit, y_eval in tuned_fits:
    clf.fit(X_train, y_fit, eval_set=(X_test, y_eval))

pred_ada = model_ada.predict(X_test)
pred_sys = model_sys.predict(X_test)
pred_ana = model_ana.predict(X_test)
pred_foc = model_foc.predict(X_test)

# Collect the predictions in one frame aligned with X_test, one column per target.
pred = pd.DataFrame(index=X_test.index)
predictions_by_target = (
    ("Adaptability", pred_ada),
    ("Systemic thinking", pred_sys),
    ("Analytical thinking", pred_ana),
    ("Focus", pred_foc),
)
for target_name, target_pred in predictions_by_target:
    pred[target_name] = target_pred

# Macro-averaged recall per target, averaged over the actual number of target
# columns (previously a hard-coded 4).
result = 0

for col in y_test.columns:
    result += recall_score(y_test[col], pred[col], average='macro', zero_division=True)

print("Recall score", result / len(y_test.columns))

Recall score 0.3158540584736246


In [210]:
# Final models: fresh classifiers built from the tuned parameters and refit on ALL
# labelled rows (no eval_set). To fall back to the baseline settings, construct them
# from `params` instead of the per-target `params_*` dicts.
model_ada = CatBoostClassifier(**params_ada)
model_sys = CatBoostClassifier(**params_sys)
model_ana = CatBoostClassifier(**params_ana)
model_foc = CatBoostClassifier(**params_foc)

final_fits = (
    (model_ada, "Adaptability"),
    (model_sys, "Systemic thinking"),
    (model_ana, "Analytical thinking"),
    (model_foc, "Focus"),
)
for clf, target_name in final_fits:
    clf.fit(X, y[target_name])

# Display the last fitted model as the cell output.
model_foc

<catboost.core.CatBoostClassifier at 0x7ff2496482e0>

In [211]:
# Pickle each final model to its own file under models/.
model_files = (
    (model_ada, "models/model_ada.pkl"),
    (model_sys, "models/model_sys.pkl"),
    (model_ana, "models/model_ana.pkl"),
    (model_foc, "models/model_foc.pkl"),
)
for clf, model_path in model_files:
    pd.to_pickle(clf, model_path)

In [212]:
# FOLDS = 4
#
# ada_pool = Pool(X, y["Adaptability"])
#
# scores_ada = cv(ada_pool,
#                 params,
#                 fold_count=FOLDS, return_models=True)
#
# sys_pool = Pool(X, y["Systemic thinking"])
#
# scores_sys = cv(sys_pool,
#                 params,
#                 fold_count=FOLDS, return_models=True)
#
# ana_pool = Pool(X, y["Analytical thinking"])
#
# scores_ana = cv(ana_pool,
#                 params,
#                 fold_count=FOLDS, return_models=True)
#
# foc_pool = Pool(X, y["Focus"])
#
# scores_foc = cv(foc_pool,
#                 params,
#                 fold_count=FOLDS, return_models=True)

### Inference

In [30]:
# Build the test-set frame via src.inference.calc_result. The following cell expects
# it to contain the target columns plus `id` and `user_id`, like the training frame.
test_vec_scores_nans = calc_result()

In [31]:
# NOTE: X and y are rebound to the TEST frame here; the training X / y are no longer
# reachable under these names once this cell has run.
non_feature_columns = targets + ["id", "user_id"]
X = test_vec_scores_nans.drop(non_feature_columns, axis=1)
y = test_vec_scores_nans[targets]

# Single-column label frames per target, mirroring the training-time naming.
y_test_ada, y_test_sys, y_test_ana, y_test_foc = (
    y[["Adaptability"]],
    y[["Systemic thinking"]],
    y[["Analytical thinking"]],
    y[["Focus"]],
)

In [215]:
# Predict the test rows with each of the four final models, in the same order as before.
pred_ada, pred_sys, pred_ana, pred_foc = (
    clf.predict(X) for clf in (model_ada, model_sys, model_ana, model_foc)
)



In [216]:
# Assemble the per-target predictions into the answer frame (src.inference.get_ans).
# Note the argument order: Adaptability, Systemic, Analytical, Focus.
ans = get_ans(pred_ada, pred_sys, pred_ana, pred_foc)

In [217]:
# Preview the answer frame.
ans

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus
0,10199,4.0,4.0,5.0,5.0
1,10539,5.0,4.0,4.0,5.0
2,10174,5.0,4.0,5.0,5.0
3,10465,4.0,4.0,4.0,4.0
4,10066,4.0,4.0,5.0,4.0
...,...,...,...,...,...
288,10433,4.0,4.0,4.0,4.0
289,10893,4.0,4.0,5.0,5.0
290,10909,4.0,4.0,4.0,4.0
291,10889,5.0,4.0,4.0,4.0
