### Считывание кластеризованных данных

In [1]:
import os
import pandas as pd

os.chdir('../../../')

In [2]:
df = pd.read_csv('data/lmsys-chat-1m/processed/kmeans_clusters.csv')
df.head()

Unnamed: 0,user_id,neutral_count,positive_count,negative_count,total_reformulations,total_words,total_spelling_errors,total_questions,cluster
0,0,0.0,1.0,0.0,0.0,11.0,0.0,0.0,1
1,1,0.333333,0.5,0.166667,0.5,25.0,0.5,0.666667,1
2,2,0.0,1.0,0.0,0.0,66.0,0.0,0.0,1
3,3,0.5,0.5,0.0,0.0,14.5,0.0,0.0,0
4,4,1.0,0.0,0.0,0.0,18.0,3.0,0.0,0


### Обучение классификатора

In [3]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import randint, uniform
import time

X = df.drop(['user_id', 'cluster'], axis=1)
y = df['cluster']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

param_dist_lgb = {
    'n_estimators': randint(100, 500),
    'max_depth': [-1] + list(range(3, 15)),
    'learning_rate': uniform(0.01, 0.2),
    'num_leaves': randint(20, 150),
    'min_child_samples': randint(5, 50),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

model_lgb = LGBMClassifier(random_state=42)

search_lgb = RandomizedSearchCV(
    model_lgb,
    param_distributions=param_dist_lgb,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

start = time.time()
search_lgb.fit(X_train, y_train)
end = time.time()

print(f"LightGBM — лучшая модель: {search_lgb.best_params_}")
print(f"Время подбора: {round(end - start, 2)} сек")

y_pred_lgb = search_lgb.predict(X_test)
print("Качество (LightGBM):")
print(classification_report(y_test, y_pred_lgb))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1073
[LightGBM] [Info] Number of data points in the train set: 103731, number of used features: 7
[LightGBM] [Info] Start training from score -0.818609
[LightGBM] [Info] Start training from score -1.112797
[LightGBM] [Info] Start training from score -4.382518
[LightGBM] [Info] Start training from score -2.530619
[LightGBM] [Info] Start training from score -2.046397
[LightGBM] [Info] Start training from score -4.709010
LightGBM — лучшая модель: {'colsample_bytree': np.float64(0.9369139098379994), 'learning_rate': np.float64(0.09995082667395314), 'max_depth': 11, 'min_child_samples': 40, 'n_estimators': 369, 'num_leaves': 114, 'subsample': np.float64(0.9687496940092467)}
⏱Врем

### Проверка без дубликатов

In [4]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import randint, uniform
import time

X = df.drop(['user_id', 'cluster'], axis=1)
y = df['cluster']
X = X.drop_duplicates()
y = y.loc[X.index]  # пересинхронизация y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

param_dist_lgb = {
    'n_estimators': randint(100, 500),
    'max_depth': [-1] + list(range(3, 15)),
    'learning_rate': uniform(0.01, 0.2),
    'num_leaves': randint(20, 150),
    'min_child_samples': randint(5, 50),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

model_lgb = LGBMClassifier(random_state=42)

search_lgb = RandomizedSearchCV(
    model_lgb,
    param_distributions=param_dist_lgb,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

start = time.time()
search_lgb.fit(X_train, y_train)
end = time.time()

print(f"LightGBM — лучшая модель: {search_lgb.best_params_}")
print(f"Время подбора: {round(end - start, 2)} сек")

y_pred_lgb = search_lgb.predict(X_test)
print("Качество (LightGBM):")
print(classification_report(y_test, y_pred_lgb))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1152
[LightGBM] [Info] Number of data points in the train set: 31591, number of used features: 7
[LightGBM] [Info] Start training from score -0.936386
[LightGBM] [Info] Start training from score -1.406212
[LightGBM] [Info] Start training from score -3.445897
[LightGBM] [Info] Start training from score -1.874512
[LightGBM] [Info] Start training from score -1.888641
[LightGBM] [Info] Start training from score -3.637998
LightGBM — лучшая модель: {'colsample_bytree': np.float64(0.9400154311159197), 'learning_rate': np.float64(0.09989013482764068), 'max_depth': -1, 'min_child_samples': 27, 'n_estimators': 161, 'num_leaves': 56, 'subsample': np.float64(0.8918424713352255)}
⏱Время 