In [None]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [58]:
data = pd.read_csv('datasets/dataset_diseases.csv', sep = ';')

In [59]:
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [60]:
data = data.fillna('none')

In [61]:
model = CatBoostClassifier()

In [67]:
param_grid = {
    'iterations': [100, 200, 1000],  
    'depth': [4, 6, 8],              
    'learning_rate': [0.01, 0.1, 0.2],  
}

In [68]:
X = data.loc[:, 'Symptom_1' : 'Symptom_4']
y = data['Disease']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
random_search = RandomizedSearchCV(model, param_distributions=param_grid,  scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

In [71]:
random_search.fit(X_train, y_train, cat_features=['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4'])

Fitting 3 folds for each of 1 candidates, totalling 3 fits




0:	learn: 3.4098013	total: 464ms	remaining: 7m 43s
0:	learn: 3.4027699	total: 464ms	remaining: 7m 43s
0:	learn: 3.5004745	total: 467ms	remaining: 7m 46s
1:	learn: 3.2062336	total: 877ms	remaining: 7m 17s
1:	learn: 3.1353884	total: 877ms	remaining: 7m 17s
1:	learn: 3.1747818	total: 877ms	remaining: 7m 17s
2:	learn: 3.0254013	total: 1.27s	remaining: 7m 2s
2:	learn: 2.9428655	total: 1.27s	remaining: 7m 2s
2:	learn: 2.9836494	total: 1.27s	remaining: 7m 2s
3:	learn: 2.8336738	total: 1.67s	remaining: 6m 56s
3:	learn: 2.8417845	total: 1.67s	remaining: 6m 56s
3:	learn: 2.8194529	total: 1.73s	remaining: 7m 10s
4:	learn: 2.6836036	total: 2.13s	remaining: 7m 2s
4:	learn: 2.6183064	total: 2.13s	remaining: 7m 3s
4:	learn: 2.5560184	total: 2.15s	remaining: 7m 6s
5:	learn: 2.5561867	total: 2.51s	remaining: 6m 55s
5:	learn: 2.4558729	total: 2.59s	remaining: 7m 9s
5:	learn: 2.5029977	total: 2.6s	remaining: 7m 11s
6:	learn: 2.4967421	total: 2.91s	remaining: 6m 53s
6:	learn: 2.2662854	total: 3.04s	remain

In [72]:
best_params = random_search.best_params_
print(f'Лучшие параметры: {best_params}')


best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Лучшие параметры: {'learning_rate': 0.1, 'iterations': 1000, 'depth': 4}
Accuracy: 0.9949186991869918


In [74]:
best_model.save_model('model/model.cbm')