### Подготовка

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('dataset_school_grades.csv', index_col=0)
df.head()

Unnamed: 0_level_0,Student,Period,Gender,Class,Subject,Mark,Is_new_sub,Average_grade,Perform_trend,Missed_Classes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,1,М,8,Алг.,3,1,3.89,0,0
2,1,2,М,8,Алг.,3,0,3.89,0,6
3,1,3,М,8,Алг.,3,0,3.89,0,12
4,1,4,М,8,Алг.,3,0,3.96,0,9
5,2,1,М,8,Алг.,3,1,3.39,0,2


**Описание колонок:**

1. **Student** - номер ученика
2. **Period** - четверти/полугодия
3. **Gender** - М/Ж
4. **Class** - 8/9/10/11
5. **Subject** - Предметы
6. **Mark** - Итоговая оценка 
7. **Is_new_sub** - новый ли это предмет
8. **Average_grade** - средняя оценка по всем предметам за все периоды до текущего
9. **Тренд** - как изменилась оценка по сравнению с предыдущим периодом
10. **Пропуски** - количество пропусков за указанный период по текцщей дисциплине.

Наша целевая переменная - Mark.

In [6]:
numeric_columns = df.drop('Mark', axis=1).loc[:, df.dtypes != object].columns

In [7]:
df[numeric_columns].corr()

Unnamed: 0,Student,Period,Class,Is_new_sub,Average_grade,Perform_trend,Missed_Classes
Student,1.0,-0.407315,0.742536,0.124402,0.59113,-0.021599,-0.078311
Period,-0.407315,1.0,-0.482312,-0.457463,-0.270753,0.038606,0.233667
Class,0.742536,-0.482312,1.0,-0.133997,0.591202,-0.040633,-0.105271
Is_new_sub,0.124402,-0.457463,-0.133997,1.0,0.001558,0.000545,-0.187734
Average_grade,0.59113,-0.270753,0.591202,0.001558,1.0,-0.011328,-0.08231
Perform_trend,-0.021599,0.038606,-0.040633,0.000545,-0.011328,1.0,0.019464
Missed_Classes,-0.078311,0.233667,-0.105271,-0.187734,-0.08231,0.019464,1.0


In [8]:
categorical_columns = df.loc[:, df.dtypes == object].columns

In [9]:
upd_df = df.copy()

In [10]:
upd_df = upd_df.drop('Student', axis=1)

In [11]:
upd_df = pd.get_dummies(df, columns=['Gender', 'Subject'], prefix='subj').astype(int)

In [12]:
upd_df.head()

Unnamed: 0_level_0,Student,Period,Class,Mark,Is_new_sub,Average_grade,Perform_trend,Missed_Classes,subj_Ж,subj_М,...,subj_ОБиЗР,subj_Обществ.,subj_Практ. по обществ.,subj_Рус.яз.,subj_Смысл. чтен.,subj_Совр. литерат,subj_Труд,subj_Физ-ра,subj_Физика,subj_Химия
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,8,3,1,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,2,8,3,0,3,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,3,8,3,0,3,0,12,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,4,8,3,0,3,0,9,0,1,...,0,0,0,0,0,0,0,0,0,0
5,2,1,8,3,1,3,0,2,0,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
X = upd_df.drop('Mark', axis=1)
Y = upd_df['Mark']

### Linear

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)), 
    ('scaler', StandardScaler()),
    ('ridge', RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5)) 
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

In [16]:
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report,
    mean_absolute_error, mean_squared_error, r2_score,
    mean_absolute_percentage_error
)

y_pred = pipe.predict(X_test)

y_pred_rounded = np.clip(np.round(y_pred), 2, 5).astype(int)
y_test_rounded = y_test.astype(int)

In [17]:
print(f"\nДля Linear Model")
print(f"\nРегрессионные метрики:")
print(f"MAE:   {mean_absolute_error(y_test, y_pred):.3f}")
print(f"MSE:   {mean_squared_error(y_test, y_pred):.3f}")
print(f"RMSE:  {mean_squared_error(y_test, y_pred):.3f}")
print(f"MAPE:  {mean_absolute_percentage_error(y_test, y_pred):.3f}")
print(f"R²:    {r2_score(y_test, y_pred):.3f}")

print("\nКлассификационные метрики (по округлённым оценкам):")
print(f"Accuracy:            {accuracy_score(y_test_rounded, y_pred_rounded):.3f}")
print(f"F1-score (micro):    {f1_score(y_test_rounded, y_pred_rounded, average='micro'):.3f}")
print(f"F1-score (macro):    {f1_score(y_test_rounded, y_pred_rounded, average='macro'):.3f}")
print(f"F1-score (weighted): {f1_score(y_test_rounded, y_pred_rounded, average='weighted'):.3f}")
print(f"Precision (macro):   {precision_score(y_test_rounded, y_pred_rounded, average='macro'):.3f}")
print(f"Recall (macro):      {recall_score(y_test_rounded, y_pred_rounded, average='macro'):.3f}")


Для Linear Model

Регрессионные метрики:
MAE:   0.327
MSE:   0.185
RMSE:  0.185
MAPE:  0.085
R²:    0.684

Классификационные метрики (по округлённым оценкам):
Accuracy:            0.768
F1-score (micro):    0.768
F1-score (macro):    0.705
F1-score (weighted): 0.769
Precision (macro):   0.717
Recall (macro):      0.698


In [18]:
import joblib

joblib.dump(pipe, 'model_linear.pkl')

['model_linear.pkl']

### Cluster (RandomForestClassifier, LogisticRegression, LinearSVC)

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

X_train, X_test, y_train, y_test = train_test_split(X, Y, 
                                                    test_size=0.25,
                                                    random_state=1)

scaler = StandardScaler()
k_means = KMeans(n_clusters=3, tol=0.0005, random_state=1)

k_means.fit(scaler.fit_transform(X_train))


  File "C:\Users\go130\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\go130\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\go130\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\go130\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [21]:
scaler.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

rf = RandomForestClassifier(random_state=1)
lr = LogisticRegression(random_state=1)
svm = LinearSVC(random_state=1)

rf.fit(X_train_norm, y_train)
lr.fit(X_train_norm, y_train)
svm.fit(X_train_norm, y_train)

In [23]:
X_train_norm = np.c_[X_train_norm, k_means.predict(X_train_norm)]
X_test_norm = np.c_[X_test_norm, k_means.predict(X_test_norm)]

In [24]:
rf.fit(X_train_norm, y_train)
lr.fit(X_train_norm, y_train)
svm.fit(X_train_norm, y_train)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

models = {
    "Random Forest": rf,
    "Logistic Regression": lr,
    "SVM": svm
}

for name, model in models.items():
    y_pred = model.predict(X_test_norm)
    
    print(f"\n{name}:")
    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='macro'):.3f}")
    print(f"Recall:    {recall_score(y_test, y_pred, average='macro'):.3f}")
    print(f"F1 Score:  {f1_score(y_test, y_pred, average='macro'):.3f}")


Random Forest:
Accuracy:  0.765
Precision: 0.585
Recall:    0.574
F1 Score:  0.578

Logistic Regression:
Accuracy:  0.754
Precision: 0.576
Recall:    0.567
F1 Score:  0.570

SVM:
Accuracy:  0.720
Precision: 0.547
Recall:    0.546
F1 Score:  0.546


### KNeighborsClassifier 

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, Y.values,
                                                    random_state=0,
                                                    test_size=0.2)

In [28]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [30]:
print("KNeighborsClassifier:\n")

print(f'KNN Accuracy:        {accuracy_score(y_test, y_pred):.3f}')
print(f'KNN Precision (weighted): {precision_score(y_test, y_pred, average="weighted"):.3f}')
print(f'KNN Recall (weighted):    {recall_score(y_test, y_pred, average="weighted"):.3f}')
print(f'KNN F1 (weighted):         {f1_score(y_test, y_pred, average="weighted"):.3f}')

KNeighborsClassifier:

KNN Accuracy:        0.694
KNN Precision (weighted): 0.692
KNN Recall (weighted):    0.694
KNN F1 (weighted):         0.693


### Product Quantization.

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, Y.values,
                                                    random_state=0,
                                                    test_size=0.2)

In [33]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
def product_quantization(X_train, X_test, m_blocks=5, n_clusters=100):
    dist_table = np.zeros([X_test.shape[0], n_clusters, m_blocks])
    X_train_clusters = np.zeros([X_train.shape[0], m_blocks])

    for i in range(m_blocks):
        block_size = X_test.shape[1] // m_blocks
        X_train_block = X_train[:, i * block_size: (i + 1) * block_size].copy()
        X_test_block = X_test[:, i * block_size: (i + 1) * block_size].copy()

        kmeans = KMeans(n_clusters=n_clusters).fit(X_train_block)
        dist_table[:, :, i] = kmeans.transform(X_test_block)

        X_train_clusters[:, i] = kmeans.predict(X_train_block)

    return X_train_clusters, dist_table


X_train_clusters, dist_table = product_quantization(X_train, X_test,
                                                    m_blocks=5,
                                                    n_clusters=100)

In [35]:
sq_dist_table = dist_table ** 2
m_blocks = 5

dist = np.zeros(shape=(sq_dist_table.shape[0], X_train.shape[0]))
X_train_clusters = X_train_clusters.astype(int)

for i in range(m_blocks):
    dist += sq_dist_table[:, X_train_clusters[:, i], i]

In [36]:
y_pred = y_train[np.argmin(dist, axis=1)]

In [37]:
from sklearn.metrics import accuracy_score, f1_score

print("Product Quantization:\n")
print(f"Accuracy:    {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 (macro):  {f1_score(y_test, y_pred, average='macro'):.3f}")
print(f"F1 (weighted): {f1_score(y_test, y_pred, average='weighted'):.3f}")


Product Quantization:

Accuracy:    0.683
F1 (macro):  0.517
F1 (weighted): 0.682


### Fully Connected NN

In [39]:
# !pip install tensorflow

In [40]:
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers
from sklearn.utils import class_weight

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

num_classes = 4
y_train_cat = to_categorical(y_train - 2, num_classes=num_classes)
y_test_cat = to_categorical(y_test - 2, num_classes=num_classes)

class_weights_raw = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = {i: w for i, w in enumerate(class_weights_raw)}

def build_model(hp):
    model = Sequential()
    model.add(Dense(
        units=hp.Int('units_1', min_value=64, max_value=512, step=64),
        activation='relu',
        kernel_regularizer=regularizers.l2(hp.Choice('l2_1', [0.0001, 0.001, 0.01])),
        input_shape=(X_train.shape[1],)
    ))
    model.add(Dropout(hp.Float('dropout_1', 0.2, 0.6, step=0.1)))
    
    for i in range(hp.Int("num_layers", 1, 3)):
        model.add(Dense(
            units=hp.Int(f'units_{i+2}', min_value=32, max_value=256, step=32),
            activation='relu',
            kernel_regularizer=regularizers.l2(hp.Choice(f'l2_{i+2}', [0.0001, 0.001, 0.01]))
        ))
        model.add(Dropout(hp.Float(f'dropout_{i+2}', 0.2, 0.5, step=0.1)))

    model.add(Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer=Adam(learning_rate=hp.Choice('lr', [1e-2, 1e-3, 5e-4, 1e-4])),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=1,
    overwrite=True,
    directory='tuner_dir',
    project_name='student_score_model'
)

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

tuner.search(
    X_train, y_train_cat,
    epochs=100,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=2
)

best_model = tuner.get_best_models(num_models=1)[0]

loss, acc = best_model.evaluate(X_test, y_test_cat)
print(f"\nЛучшая модель — точность: {acc:.3f}")


Trial 20 Complete [00h 00m 03s]
val_accuracy: 0.7222914099693298

Best val_accuracy So Far: 0.7646326422691345
Total elapsed time: 00h 02m 37s
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 908us/step - accuracy: 0.7800 - loss: 0.6105

✅ Лучшая модель — точность: 0.763


In [41]:
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test_cat, axis=1)

print("Fully Connected NN:\n")
print(f"Accuracy:             {accuracy_score(y_true, y_pred):.3f}")
print(f"F1-score (macro):     {f1_score(y_true, y_pred, average='macro'):.3f}")
print(f"F1-score (weighted):  {f1_score(y_true, y_pred, average='weighted'):.3f}")
print(f"F1-score (micro):     {f1_score(y_true, y_pred, average='micro'):.3f}")
print(f"Precision (macro):    {precision_score(y_true, y_pred, average='macro'):.3f}")
print(f"Recall (macro):       {recall_score(y_true, y_pred, average='macro'):.3f}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Fully Connected NN:

Accuracy:             0.763
F1-score (macro):     0.640
F1-score (weighted):  0.763
F1-score (micro):     0.763
Precision (macro):    0.617
Recall (macro):       0.708
