### Хакатон: классификация токсичных комментариев

In [1]:
import pandas as pd
from catboost import Pool, CatBoostClassifier
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split 

%matplotlib inline

In [2]:
# Directory with the hackathon CSV files, relative to the notebook location.
data_path = "../data/"
# Full path to the labelled comments dataset.
file_data = os.path.join(data_path, "labeled.csv")

In [3]:
# Load the labelled comments and print the frame summary
# (column names, dtypes, non-null counts, memory usage).
data = pd.read_csv(file_data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14412 entries, 0 to 14411
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   comment  14412 non-null  object 
 1   toxic    14412 non-null  float64
dtypes: float64(1), object(1)
memory usage: 225.3+ KB


In [4]:
# Preview the first rows of the raw dataset.
data.head()

Unnamed: 0,comment,toxic
0,"Верблюдов-то за что? Дебилы, бл...\n",1.0
1,"Хохлы, это отдушина затюканого россиянина, мол...",1.0
2,Собаке - собачья смерть\n,1.0
3,"Страницу обнови, дебил. Это тоже не оскорблени...",1.0
4,"тебя не убедил 6-страничный пдф в том, что Скр...",1.0


In [5]:
# Class balance of the target label.
data["toxic"].value_counts()

0.0    9586
1.0    4826
Name: toxic, dtype: int64

In [6]:
# Hold out 20% of the rows as the test split; stratifying on the label keeps
# the class ratio the same in both parts, and the fixed seed makes the split
# repeatable.
df_train, df_test = train_test_split(
    data,
    stratify=data.toxic,
    test_size=0.2,
    random_state=42,
)


In [7]:
# (rows, columns) of the train and test splits, in that order.
tuple(frame.shape for frame in (df_train, df_test))

((11529, 2), (2883, 2))

In [8]:
# Label counts per split (train first, then test) to confirm the stratification.
tuple(frame["toxic"].value_counts() for frame in (df_train, df_test))

(0.0    7668
 1.0    3861
 Name: toxic, dtype: int64,
 0.0    1918
 1.0     965
 Name: toxic, dtype: int64)

In [9]:
# Persist both splits next to the source data; the row index is not written.
df_train.to_csv(os.path.join(data_path, "train.csv"), index=False)
df_test.to_csv(os.path.join(data_path, "test.csv"), index=False)

### Обучение модели

In [10]:
# Read the training split back from train.csv in the data directory
# and preview its first rows.
train = pd.read_csv(os.path.join(data_path, "train.csv"))
train.head()

Unnamed: 0,comment,toxic
0,Вы читали закон о банкротстве? Что бы объявить...,0.0
1,а у сяоми это постоянно,0.0
2,"Хотел занять? Тогда да,проще позвонить , чем у...",1.0
3,"заработки были без работы даже небольшие, но б...",0.0
4,Езжу каждый день. За этот год почти 30 тыс.\n,0.0


In [11]:
def fit_catboost(
    X_train,
    X_test,
    y_train,
    y_test,
    catboost_params=None,
    verbose=100
):
    """Fit a CatBoost classifier on text data and evaluate it on a second set.

    Both feature sets are wrapped in ``Pool`` objects that declare a single
    text feature named ``"text"``, so the inputs are presumably expected to
    hold exactly one column of raw text (the notebook passes
    ``train[["comment"]]``).

    Parameters
    ----------
    X_train, X_test
        Features used for fitting and for evaluation during training.
    y_train, y_test
        Labels matching ``X_train`` and ``X_test``.
    catboost_params : mapping, optional
        Overrides merged on top of the default training parameters below.
        ``None`` (the default) means "no overrides".
    verbose
        Forwarded to ``CatBoostClassifier.fit``.

    Returns
    -------
    The fitted ``CatBoostClassifier``.
    """
    learn_pool = Pool(
        X_train,
        y_train,
        text_features=["text"],
        feature_names=["text"]
    )
    test_pool = Pool(
        X_test,
        y_test,
        text_features=["text"],
        feature_names=["text"]
    )

    params = {
        'iterations': 500,
        'learning_rate': 0.05,
        'eval_metric': 'F1',
        # Trains on GPU by default; pass {'task_type': 'CPU'} via
        # catboost_params on a machine without one.
        'task_type': 'GPU',
        # The returned model is cut back to the iteration with the best
        # eval_metric on the evaluation pool.
        'use_best_model': True
    }
    # A None sentinel replaces the former mutable `{}` default, so no dict is
    # shared between calls and an explicit None is accepted.
    if catboost_params is not None:
        params.update(catboost_params)

    model = CatBoostClassifier(**params)
    model.fit(learn_pool, eval_set=test_pool, verbose=verbose)
    return model



In [12]:
# Carve a 30% validation part out of the training split (stratified by label,
# fixed seed) and fit the classifier on the raw comment text.
X_train, X_val, y_train, y_val = train_test_split(
    train[["comment"]],
    train["toxic"],
    stratify=train.toxic,
    test_size=0.3,
    random_state=42,
)
# NOTE(review): in the recorded run the best validation F1 was reached at
# iteration 1, so use_best_model shrank the model to its first 2 iterations;
# worth checking whether F1 is the right metric for model selection here.
cat_boost_model = fit_catboost(
    X_train=X_train,
    X_test=X_val,
    y_train=y_train,
    y_test=y_val,
)

0:	learn: 0.5280431	test: 0.5305136	best: 0.5305136 (0)	total: 31.1ms	remaining: 15.5s
100:	learn: 0.7228871	test: 0.7451670	best: 0.7477329 (1)	total: 2.38s	remaining: 9.38s
200:	learn: 0.7294821	test: 0.7457181	best: 0.7477329 (1)	total: 4.63s	remaining: 6.89s
300:	learn: 0.7341062	test: 0.7460317	best: 0.7477329 (1)	total: 6.99s	remaining: 4.62s
400:	learn: 0.7412117	test: 0.7449250	best: 0.7477329 (1)	total: 9.37s	remaining: 2.31s
499:	learn: 0.7460169	test: 0.7446996	best: 0.7477329 (1)	total: 11.6s	remaining: 0us
bestTest = 0.7477328937
bestIteration = 1
Shrink model to first 2 iterations.


In [13]:
# Persist the fitted model to the file "hack_model" (relative path, no
# extension); no format argument is passed, so the library default is used.
cat_boost_model.save_model("hack_model")