In [None]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import optuna
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score

# Seed both global RNGs (stdlib `random` and NumPy's legacy global state)
# from one named constant.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the train and test CSVs; the first column of each file is used as the
# DataFrame index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Hold out 30% of the labelled data for validation.
# random_state is pinned explicitly: with random_state=None the split draws
# from the global NumPy RNG, so re-running this cell (or running anything
# random before it) would silently produce a different train/validation split.
X_train, X_val, y_train, y_val = train_test_split(
    df_train[['text']],
    df_train['label'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Test features are kept as a one-column DataFrame (double brackets) while the
# labels are a plain Series, mirroring how the training split is built.
X_test, y_test = df_test[['text']], df_test['label']

In [None]:
# Column names handed to CatBoost as text features (and as feature names).
text_features = ["text"]

In [None]:
# Build the training and validation Pools with identical settings: the same
# column list is declared both as the text features and as the feature names.
train_pool, valid_pool = (
    Pool(
        data=features,
        label=labels,
        text_features=text_features,
        feature_names=text_features,
    )
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
def objective(trial):
    """Optuna objective: train a CatBoost text classifier and score it on validation data.

    Reads the notebook-level ``train_pool``, ``valid_pool``, ``X_val`` and
    ``y_val``.

    Args:
        trial: Optuna trial used to sample ``iterations``, ``depth``,
            ``min_data_in_leaf``, ``learning_rate`` and
            ``early_stopping_rounds``.

    Returns:
        Macro-averaged F1 score of the fitted model's predictions on ``X_val``.
    """
    # Search space; the suggest calls are kept in this fixed order.
    n_iterations = trial.suggest_int("iterations", 1000, 3000)
    tree_depth = trial.suggest_int("depth", 2, 12)
    leaf_min_samples = trial.suggest_int("min_data_in_leaf", 0, 1000)
    step_size = trial.suggest_float("learning_rate", 0.01, 0.1)
    patience = trial.suggest_int("early_stopping_rounds", 100, 1000)

    boosting_kwargs = dict(
        loss_function='MultiClass',
        iterations=n_iterations,
        depth=tree_depth,
        min_data_in_leaf=leaf_min_samples,
        learning_rate=step_size,
        task_type='GPU',
        early_stopping_rounds=patience,
        eval_metric='TotalF1:average=Macro',
        verbose=300,
    )

    def ngram_dictionary(name, order):
        # Every dictionary shares the same size and frequency limits; only the
        # identifier and the n-gram order differ.
        return {
            "dictionary_id": name,
            "max_dictionary_size": "50000",
            "occurrence_lower_bound": "100",
            "gram_order": order,
        }

    def space_tokenized(calcer, dictionary_names):
        # One feature-processing entry: a single calcer over the listed
        # dictionaries, always fed by the "Space" tokenizer.
        return {
            "dictionaries_names": dictionary_names,
            "feature_calcers": [calcer],
            "tokenizers_names": ["Space"],
        }

    # NOTE(review): the "Sense" tokenizer is declared but no feature_processing
    # entry below references it (both use "Space") - confirm this is intended.
    sense_tokenizer = {
        'tokenizer_id': 'Sense',
        'separator_type': 'BySense',
        'lowercasing': 'True',
        'token_types': ['Word', 'Number', 'SentenceBreak'],
        'sub_tokens_policy': 'SeveralTokens',
    }
    space_tokenizer = {
        "tokenizer_id": "Space",
        "separator_type": "ByDelimiter",
        "lowercasing": "True",
        "delimiter": " ",
    }

    text_config = {
        "tokenizers": [sense_tokenizer, space_tokenizer],
        "dictionaries": [
            ngram_dictionary("BiGram", "2"),
            ngram_dictionary("Word", "1"),
            ngram_dictionary("3-Gram", "3"),
        ],
        "feature_processing": {
            "default": [
                space_tokenized("BoW", ["BiGram", "Word", "3-Gram"]),
                space_tokenized("NaiveBayes", ["Word", "BiGram", "3-Gram"]),
            ],
        },
    }

    classifier = CatBoostClassifier(**boosting_kwargs, text_processing=text_config)
    classifier.fit(train_pool, eval_set=valid_pool)

    # Score with scikit-learn's macro F1 on the same validation frame.
    return f1_score(y_val, classifier.predict(X_val), average='macro')

In [None]:
# Seed the sampler explicitly: Optuna's default TPE sampler owns its own RNG,
# which the global random / NumPy seeds set at the top of the notebook do not
# control, so an unseeded study proposes different trials on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# The search stops after 15 trials or 6000 seconds, whichever comes first.
study.optimize(objective, n_trials=15, timeout=6000)

In [None]:
# Display the hyper-parameters of the best trial found by the study.
study.best_params

In [None]:
# Parameters for the final model: the tuned values come from the best Optuna
# trial, the fixed ones match what the objective used during the search.
best_params = study.best_trial.params
catboost_params = {
    'iterations': best_params['iterations'],
    # Use the tuned depth rather than a hard-coded value, so the final model is
    # trained with the combination of hyper-parameters that the study scored.
    'depth': best_params['depth'],
    'min_data_in_leaf': best_params['min_data_in_leaf'],
    'learning_rate': best_params['learning_rate'],
    'early_stopping_rounds': best_params['early_stopping_rounds'],
    'eval_metric': 'TotalF1:average=Macro',
    'loss_function': 'MultiClass',
    'task_type': 'GPU',
    'verbose': 300,
}

In [None]:
# Text-processing configuration for the final model. It restates the settings
# used inside objective(): the tuned hyper-parameters were selected with this
# tokenizer / dictionary / calcer setup, so training the final model with
# CatBoost's default text processing would not reproduce the tuned result.
# Keep the two definitions in sync.
text_processing = {
    "tokenizers": [{
        'tokenizer_id': 'Sense',
        'separator_type': 'BySense',
        'lowercasing': 'True',
        'token_types': ['Word', 'Number', 'SentenceBreak'],
        'sub_tokens_policy': 'SeveralTokens'
    }, {
        "tokenizer_id": "Space",
        "separator_type": "ByDelimiter",
        "lowercasing": "True",
        "delimiter": " "
    }],
    "dictionaries": [{
        "dictionary_id": "BiGram",
        "max_dictionary_size": "50000",
        "occurrence_lower_bound": "100",
        "gram_order": "2"
    }, {
        "dictionary_id": "Word",
        "max_dictionary_size": "50000",
        "occurrence_lower_bound": "100",
        "gram_order": "1"
    }, {
        "dictionary_id": "3-Gram",
        "max_dictionary_size": "50000",
        "occurrence_lower_bound": "100",
        "gram_order": "3"
    }],
    "feature_processing": {
        "default": [{
            "dictionaries_names": ["BiGram", "Word", "3-Gram"],
            "feature_calcers": ["BoW"],
            "tokenizers_names": ["Space"]
        }, {
            "dictionaries_names": ["Word", "BiGram", "3-Gram"],
            "feature_calcers": ["NaiveBayes"],
            "tokenizers_names": ["Space"]
        }],
    }
}

model = CatBoostClassifier(**catboost_params, text_processing=text_processing)
model.fit(train_pool, eval_set=valid_pool)

In [None]:
# Predicted class labels for the test set ('Class' is predict's default type).
preds = model.predict(X_test, prediction_type='Class')

In [None]:
# Per-class probabilities for the test set (same output as predict_proba).
probs = model.predict(X_test, prediction_type='Probability')