In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [9]:
# Load the exam results and clean them in a single pass:
#   1. normalise column names to lower case with underscores instead of spaces,
#   2. drop the student_id column,
#   3. replace missing values with zeros.
student_results = (
    pd.read_csv('jamb_exam_results.csv')
    .rename(columns=lambda column_name: column_name.lower().replace(' ', '_'))
    .drop(columns=['student_id'])
    .fillna(0)
)

# Separate the target (jamb_score) from the remaining feature columns
target = student_results['jamb_score']
features = student_results.drop(columns=['jamb_score'])


In [10]:
# Hold out 40% of the rows, then cut that hold-out in half:
# the result is a 60/20/20 train/validation/test split.
train_features, temp_features, train_target, temp_target = train_test_split(
    features, target, test_size=0.4, random_state=42
)
val_features, test_features, val_target, test_target = train_test_split(
    temp_features, temp_target, test_size=0.5, random_state=42
)

# Vectorize the row dictionaries with DictVectorizer.
# The vectorizer is fitted on the training rows only and then reused
# unchanged for the validation and test rows.
converter = DictVectorizer(sparse=True)
train_records = train_features.to_dict(orient='records')
train_features_converted = converter.fit_transform(train_records)
val_features_converted, test_features_converted = (
    converter.transform(frame.to_dict(orient='records'))
    for frame in (val_features, test_features)
)

# Fit a depth-1 decision tree (a single split)
decision_tree = DecisionTreeRegressor(max_depth=1, random_state=42).fit(
    train_features_converted, train_target
)

# Node 0 is the root, so its feature index identifies the first split
root_feature_index = decision_tree.tree_.feature[0]
first_split_attribute = converter.feature_names_[root_feature_index]
print(f"Feature used for the first split: {first_split_attribute}")

Feature used for the first split: study_hours_per_week


In [11]:
# Fit a baseline random forest with 10 trees on the training split
forest = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1).fit(
    train_features_converted, train_target
)

# Score it on the validation split: RMSE is the square root of the MSE
val_predictions = forest.predict(val_features_converted)
val_mse = mean_squared_error(val_target, val_predictions)
val_rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {val_rmse}")

Validation RMSE: 42.173933892868


In [12]:
def evaluate_forest(n_estimators, max_depth=None):
    """Train a random forest on the training split and return its validation RMSE.

    Reads the notebook-level ``train_features_converted``, ``train_target``,
    ``val_features_converted`` and ``val_target``. The fitted model and its
    predictions stay local to this function, so repeated calls do not
    overwrite ``forest`` / ``val_predictions`` produced by earlier cells.

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth passed to RandomForestRegressor;
            ``None`` is the estimator's own default.

    Returns:
        Root mean squared error of the predictions on the validation split.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators, max_depth=max_depth, random_state=42, n_jobs=-1
    )
    model.fit(train_features_converted, train_target)
    predictions = model.predict(val_features_converted)
    return np.sqrt(mean_squared_error(val_target, predictions))


# Tree counts shared by both searches: 10, 20, ..., 200
tree_counts = range(10, 201, 10)

# Search over the number of trees (max_depth left at its default)
rmse_data = {num_trees: evaluate_forest(num_trees) for num_trees in tree_counts}

# Report the validation RMSE for every tree count
for num_trees, rmse_score in rmse_data.items():
    print(f"n_estimators: {num_trees}, RMSE: {rmse_score:.3f}")


# Search over max_depth: each depth is scored by its mean validation RMSE
# across all tree counts
depth_options = [10, 15, 20, 25]
depth_results = {
    max_depth: np.mean([evaluate_forest(num_trees, max_depth) for num_trees in tree_counts])
    for max_depth in depth_options
}

# Pick the max_depth with the lowest mean RMSE
optimal_depth = min(depth_results, key=depth_results.get)
print(f"Best max_depth: {optimal_depth}")

n_estimators: 10, RMSE: 42.174
n_estimators: 20, RMSE: 41.528
n_estimators: 30, RMSE: 41.551
n_estimators: 40, RMSE: 41.509
n_estimators: 50, RMSE: 41.309
n_estimators: 60, RMSE: 41.225
n_estimators: 70, RMSE: 41.132
n_estimators: 80, RMSE: 41.091
n_estimators: 90, RMSE: 41.010
n_estimators: 100, RMSE: 41.087
n_estimators: 110, RMSE: 41.094
n_estimators: 120, RMSE: 41.047
n_estimators: 130, RMSE: 40.974
n_estimators: 140, RMSE: 40.981
n_estimators: 150, RMSE: 40.961
n_estimators: 160, RMSE: 40.957
n_estimators: 170, RMSE: 40.933
n_estimators: 180, RMSE: 40.927
n_estimators: 190, RMSE: 40.911
n_estimators: 200, RMSE: 40.910
Best max_depth: 10


In [15]:
# Train the final random forest model.
# NOTE(review): n_estimators=10 and max_depth=20 are fixed here rather than
# taken from the search results (the search reported max_depth=10 as best) —
# confirm this is intentional.
best_forest = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=42, n_jobs=-1)
best_forest.fit(train_features_converted, train_target)

# Determine feature importance: report the feature with the highest score
attribute_importances = best_forest.feature_importances_
top_attribute = converter.feature_names_[np.argmax(attribute_importances)]
print(f"Most important feature: {top_attribute}")


Most important feature: study_hours_per_week


In [14]:
# Wrap the training and validation splits in DMatrix objects for XGBoost
boost_train = xgb.DMatrix(train_features_converted, label=train_target)
boost_val = xgb.DMatrix(val_features_converted, label=val_target)

# Watchlist: datasets whose metric is reported after every boosting round
evaluation = [(boost_train, 'train'), (boost_val, 'eval')]

# XGBoost model parameters (starting with eta = 0.3)
boost_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=42,
    verbosity=1,
)


def train_booster(params):
    """Train a booster on ``boost_train`` for up to 100 rounds with early_stopping_rounds=10."""
    return xgb.train(
        params,
        boost_train,
        num_boost_round=100,
        evals=evaluation,
        early_stopping_rounds=10,
    )


# First run: eta = 0.3
boost_model_03 = train_booster(boost_params)

# Second run: same parameters, eta lowered to 0.1
boost_params.update(eta=0.1)
boost_model_01 = train_booster(boost_params)

# Report the best score recorded for each run
print(f"Best validation RMSE with eta=0.3: {boost_model_03.best_score}")
print(f"Best validation RMSE with eta=0.1: {boost_model_01.best_score}")

[0]	train-rmse:42.71579	eval-rmse:45.17452
[1]	train-rmse:39.82920	eval-rmse:43.33307
[2]	train-rmse:37.76334	eval-rmse:42.21338
[3]	train-rmse:36.28364	eval-rmse:41.85801
[4]	train-rmse:35.07326	eval-rmse:41.33436
[5]	train-rmse:34.19555	eval-rmse:41.08693
[6]	train-rmse:33.44294	eval-rmse:40.91599
[7]	train-rmse:32.67600	eval-rmse:40.88081
[8]	train-rmse:32.01974	eval-rmse:40.97827
[9]	train-rmse:31.53993	eval-rmse:40.88101
[10]	train-rmse:31.06525	eval-rmse:40.93403
[11]	train-rmse:30.73302	eval-rmse:40.99632
[12]	train-rmse:30.37063	eval-rmse:40.99476
[13]	train-rmse:29.88920	eval-rmse:41.01079
[14]	train-rmse:29.51041	eval-rmse:40.97999
[15]	train-rmse:29.18156	eval-rmse:41.04403
[16]	train-rmse:29.07860	eval-rmse:41.05910
[0]	train-rmse:45.50472	eval-rmse:47.29604
[1]	train-rmse:44.14512	eval-rmse:46.35598
[2]	train-rmse:42.95150	eval-rmse:45.45687
[3]	train-rmse:41.89408	eval-rmse:44.71852
[4]	train-rmse:40.93352	eval-rmse:44.07986
[5]	train-rmse:40.08378	eval-rmse:43.54894
[6]	