In [6]:
import random

import pandas as pd
from sklearn.model_selection import train_test_split

# Seed the stdlib RNG so that "Restart & Run All" regenerates exactly the same
# synthetic dataset (and therefore the same split and downstream metrics).
random.seed(42)

# Build the synthetic dataset as a list of dicts, one dict per row.
# NOTE: 'target' is drawn independently of every feature, so the data carries
# no learnable signal; accuracy on unseen rows is expected to hover around 1/3.
data_list = [
    {
        'salary': float(random.randint(30000, 80000)),  # stored as float on purpose
        'city': random.choice(['Bishkek', 'London', 'Moscow']),
        'age': random.randint(30, 65),
        'vacation_prefer': random.choice(['Shopping', 'Beach holiday']),
        'transport_prefer': random.choice(['auto', 'plane']),
        'target': random.choice(['Bishkek', 'London', 'Moscow']),
    }
    for _ in range(1000)
]

# Turn the list of row dicts into a DataFrame.
data = pd.DataFrame(data_list)

# One-hot encode the categorical features. drop_first=True removes the first
# level of each encoded column (k-1 dummies for k levels), so that level becomes
# the implicit baseline and has no column of its own.
data = pd.get_dummies(data, columns=['city', 'vacation_prefer', 'transport_prefer'], drop_first=True)

# Separate the features from the label.
X = data.drop('target', axis=1)
y = data['target']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# Random forest with a fixed seed, fitted on the training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the fitted model on the very rows it was trained on.
train_predictions = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_report = classification_report(y_train, train_predictions)
print("Accuracy on training set:", train_accuracy)
print("\nClassification Report on training set:\n", train_report)

# 5-fold cross-validation, restricted to the training split.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
mean_cv_accuracy = cv_scores.mean()
print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", mean_cv_accuracy)

# Score the same fitted model on the held-out test split.
test_predictions = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions)
print("\nAccuracy on test set:", test_accuracy)
print("\nClassification Report on test set:\n", test_report)


Accuracy on training set: 1.0

Classification Report on training set:
               precision    recall  f1-score   support

     Bishkek       1.00      1.00      1.00       243
      London       1.00      1.00      1.00       270
      Moscow       1.00      1.00      1.00       287

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800


Cross-Validation Scores: [0.3375  0.34375 0.34375 0.3625  0.3125 ]
Mean CV Accuracy: 0.33999999999999997

Accuracy on test set: 0.32

Classification Report on test set:
               precision    recall  f1-score   support

     Bishkek       0.31      0.22      0.26        64
      London       0.26      0.26      0.26        68
      Moscow       0.37      0.47      0.41        68

    accuracy                           0.32       200
   macro avg       0.31      0.32      0.31       200
weighted avg       0.31      0.32      0.31       200

In [10]:
# Column layout the classifier was fitted on (i.e. after one-hot encoding).
all_columns = X_train.columns

# Draw ONE raw, un-encoded sample using the same schema as the training data.
# Sampling the categories themselves (rather than independent 0/1 flags per
# dummy column) guarantees a valid row: exactly one city, one vacation
# preference and one transport preference.
random_data = {
    'salary': [float(random.randint(30000, 80000))],
    'city': [random.choice(['Bishkek', 'London', 'Moscow'])],
    'age': [random.randint(30, 65)],
    'vacation_prefer': [random.choice(['Shopping', 'Beach holiday'])],
    'transport_prefer': [random.choice(['auto', 'plane'])],
}

# Encode WITHOUT drop_first: with a single row every column has only one level,
# and drop_first would remove it. Aligning to the training columns afterwards
# discards dummies for baseline levels (which have no column in X_train) and
# fills every dummy that did not appear in this row with 0.
random_df = (
    pd.get_dummies(
        pd.DataFrame(random_data),
        columns=['city', 'vacation_prefer', 'transport_prefer'],
    )
    .reindex(columns=all_columns, fill_value=0)
    .astype(float)  # uniform numeric dtype across real and filled-in columns
)

# Predict the destination for the sampled row.
prediction = clf.predict(random_df)
print("\nPredicted vacation destination:", prediction[0])


Predicted vacation destination: London


In [11]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive grid search with 5-fold cross-validation on the training split.
# refit=True (the default, spelled out here) retrains the best configuration on
# the whole of X_train once the search finishes.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    refit=True,
)
grid_search.fit(X_train, y_train)

# Best hyper-parameters found by the search.
print("\nBest Parameters:", grid_search.best_params_)

# best_estimator_ is already fitted on the full training split thanks to
# refit=True, so calling .fit() on it again would only repeat the same work.
best_clf = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
best_test_predictions = best_clf.predict(X_test)
print("\nAccuracy on test set with best parameters:", accuracy_score(y_test, best_test_predictions))
print("\nClassification Report on test set with best parameters:\n", classification_report(y_test, best_test_predictions))



Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

Accuracy on test set with best parameters: 0.335

Classification Report on test set with best parameters:
               precision    recall  f1-score   support

     Bishkek       0.36      0.22      0.27        64
      London       0.24      0.25      0.24        68
      Moscow       0.40      0.53      0.46        68

    accuracy                           0.34       200
   macro avg       0.33      0.33      0.32       200
weighted avg       0.33      0.34      0.33       200

