<a href="https://colab.research.google.com/github/igorsvetlov88/TIPiS/blob/main/student_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Load the exam results from a CSV file located in the working directory.
DATA_PATH = 'jamb_exam_results.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows; as the last expression it becomes the cell output.
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [None]:
# Normalize column names: lower-case, with spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Drop the identifier column. `df` is reassigned here, so on a second run of
# this cell the column is already gone; errors='ignore' keeps the cell
# re-runnable instead of raising KeyError.
df = df.drop(columns='student_id', errors='ignore')

# Replace missing values with 0 in every column.
df = df.fillna(0)

In [None]:
# Separate the target column from the feature columns.
TARGET = 'jamb_score'
y = df[TARGET]
X = df.drop(columns=TARGET)

# 60% / 20% / 20% train / validation / test split: hold out 20% for test,
# then a quarter of the remaining 80% (20% of the total) for validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, random_state=1, test_size=0.25
)

# Convert each frame into a list of per-row dicts for DictVectorizer.
train_dict, val_dict, test_dict = (
    frame.to_dict(orient='records') for frame in (X_train, X_val, X_test)
)

# Fit the vectorizer on the training rows only, then reuse it for the other splits.
dv = DictVectorizer(sparse=True)
X_train_encoded = dv.fit_transform(train_dict)
X_val_encoded, X_test_encoded = (
    dv.transform(rows) for rows in (val_dict, test_dict)
)


In [None]:
# A depth-1 tree makes a single split; inspect which feature that split uses.
dt = DecisionTreeRegressor(random_state=1, max_depth=1)
dt.fit(X_train_encoded, y_train)

# Look up the name of the column referenced by the first node of the tree.
feature_names = dv.get_feature_names_out()
root_index = dt.tree_.feature[0]
root_feature = feature_names[root_index]

print("Вопрос 1: Какой признак используется для разбиения данных?")
print(f"Ответ: {root_feature}")


Вопрос 1: Какой признак используется для разбиения данных?
Ответ: study_hours_per_week


In [None]:
# Baseline forest: 10 trees, fixed seed, scored on the validation split.
rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=10)
rf.fit(X_train_encoded, y_train)

y_pred = rf.predict(X_val_encoded)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)  # RMSE is the square root of the mean squared error

print("Вопрос 2: Какое значение RMSE у этой модели на валидационных данных?")
print(f"Ответ: {rmse:.2f}")

Вопрос 2: Какое значение RMSE у этой модели на валидационных данных?
Ответ: 42.14


In [None]:
# Validation RMSE for forests of 10, 20, ..., 200 trees.
n_estimators_list = list(range(10, 201, 10))
rmse_scores = []

for n_est in n_estimators_list:
    rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=n_est)
    rf.fit(X_train_encoded, y_train)
    y_pred = rf.predict(X_val_encoded)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

# Walk the scores in order and remember the last n_estimators that beat the
# best RMSE seen so far by more than the threshold.
improvement_threshold = 0.001  # minimum decrease that counts as an improvement
best_rmse = float('inf')
best_n_estimators = 0

for n_est, score in zip(n_estimators_list, rmse_scores):
    if score < best_rmse - improvement_threshold:
        best_rmse, best_n_estimators = score, n_est

print("Вопрос 3: После какого значения n_estimators RMSE перестает улучшаться?")
print(f"Ответ: {best_n_estimators}")

Вопрос 3: После какого значения n_estimators RMSE перестает улучшаться?
Ответ: 90


In [None]:
# For each candidate depth, average the validation RMSE over forests of
# 10, 20, ..., 200 trees.
max_depth_list = [10, 15, 20, 25]
depth_rmse_scores = []

for depth in max_depth_list:
    per_forest_rmse = []
    for n_est in range(10, 201, 10):
        rf = RandomForestRegressor(
            max_depth=depth,
            n_estimators=n_est,
            n_jobs=-1,
            random_state=1,
        )
        rf.fit(X_train_encoded, y_train)
        y_pred = rf.predict(X_val_encoded)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        per_forest_rmse.append(rmse)

    avg_rmse = np.mean(per_forest_rmse)
    depth_rmse_scores.append(avg_rmse)
    print(f"max_depth={depth}, средний RMSE: {avg_rmse:.4f}")

# The depth with the smallest mean RMSE is reported as the best one.
best_depth_idx = np.argmin(depth_rmse_scores)
best_depth = max_depth_list[best_depth_idx]

print("\nВопрос 4: Какое значение max_depth оказалось лучшим по среднему RMSE?")
print(f"Ответ: {best_depth}")


max_depth=10, средний RMSE: 40.3925
max_depth=15, средний RMSE: 40.7353
max_depth=20, средний RMSE: 40.7397
max_depth=25, средний RMSE: 40.7879

Вопрос 4: Какое значение max_depth оказалось лучшим по среднему RMSE?
Ответ: 10


In [None]:
# Forest used for the feature-importance question.
rf_final = RandomForestRegressor(
    max_depth=20,
    n_estimators=10,
    n_jobs=-1,
    random_state=1,
)
rf_final.fit(X_train_encoded, y_train)

# Map every encoded column name to its importance in the fitted forest.
feature_importances = rf_final.feature_importances_
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Only these candidates are compared against each other.
target_features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
most_important_feature = None
max_importance = -1

for feature in target_features:
    # First encoded column whose name contains the candidate (None when absent).
    matched_column = next((name for name in feature_names if feature in name), None)
    if matched_column is None:
        continue
    importance = feature_importance_dict[matched_column]
    if importance > max_importance:
        max_importance, most_important_feature = importance, feature

print("Вопрос 5: Какой признак оказался самым важным?")
print(f"Ответ: {most_important_feature}")

Вопрос 5: Какой признак оказался самым важным?
Ответ: study_hours_per_week


In [None]:
# `rmse` must not be used for Question 2 here: the n_estimators and max_depth
# sweeps reassign it, so by this point it holds the score of the last model
# fitted in those loops rather than the Question 2 model.
# rmse_scores[0] is the validation RMSE of the n_estimators=10, random_state=1
# forest from the n_estimators sweep, i.e. the same configuration as Question 2.
question2_rmse = rmse_scores[0]

print("Ответы на вопросы:")
print(f"Вопрос 1: {root_feature}")
print(f"Вопрос 2: {question2_rmse:.2f}")
print(f"Вопрос 3: {best_n_estimators}")
print(f"Вопрос 4: {best_depth}")
print(f"Вопрос 5: {most_important_feature}")

Ответы на вопросы:
Вопрос 1: study_hours_per_week
Вопрос 2: 40.60
Вопрос 3: 90
Вопрос 4: 10
Вопрос 5: study_hours_per_week
