## [Dataset](https://www.kaggle.com/datasets/aljarah/xAPI-Edu-Data/data)

In [1]:
import pandas as pd

data = pd.read_csv("..\\data\\xAPI-Edu-Data.csv")
data

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,F,Jordan,Jordan,MiddleSchool,G-08,A,Chemistry,S,Father,5,4,5,8,No,Bad,Above-7,L
476,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,F,Father,50,77,14,28,No,Bad,Under-7,M
477,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,S,Father,55,74,25,29,No,Bad,Under-7,M
478,F,Jordan,Jordan,MiddleSchool,G-08,A,History,F,Father,30,17,14,57,No,Bad,Above-7,L


In [2]:
data.columns

Index(['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 'GradeID',
       'SectionID', 'Topic', 'Semester', 'Relation', 'raisedhands',
       'VisITedResources', 'AnnouncementsView', 'Discussion',
       'ParentAnsweringSurvey', 'ParentschoolSatisfaction',
       'StudentAbsenceDays', 'Class'],
      dtype='object')

In [3]:
# Визначення категоріальних і числових ознак
categorical_cols = [
    "gender", "NationalITy", "PlaceofBirth",
    "StageID", "GradeID", "SectionID", "Topic",
    "Semester", "Relation", "ParentAnsweringSurvey",
    "ParentschoolSatisfaction", "StudentAbsenceDays"
]
numerical_cols = ["raisedhands", "VisITedResources", "AnnouncementsView", "Discussion"]

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Кодування категоріальних змінних
ohe = OneHotEncoder(drop="first", sparse_output=False)
encoded_cats = ohe.fit_transform(data[categorical_cols])
encoded_cat_df = pd.DataFrame(encoded_cats, columns=ohe.get_feature_names_out(categorical_cols))

In [5]:
from sklearn.preprocessing import StandardScaler

# Нормалізація числових ознак
scaler = StandardScaler()
scaled_nums = scaler.fit_transform(data[numerical_cols])
scaled_num_df = pd.DataFrame(scaled_nums, columns=numerical_cols)

In [6]:
# Збирання фінального датасету
X = pd.concat([encoded_cat_df, scaled_num_df], axis=1)
y = data["Class"].map({"L": 0, "M": 1, "H": 2})

In [7]:
from sklearn.model_selection import train_test_split

# Розділення на навчальний і тестовий набори
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.linear_model import Ridge, Lasso

# Створення та навчання моделей
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.01)

ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)

# Прогнозування
y_pred_ridge = ridge.predict(X_test).round().astype(int)
y_pred_lasso = lasso.predict(X_test).round().astype(int)

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score

# Оцінка точності
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)

print("Ridge Accuracy:", accuracy_score(y_test, y_pred_ridge))
print("Lasso Accuracy:", accuracy_score(y_test, y_pred_lasso))
print("\nПорівняння метрик:")
print(f"Ridge MSE: {ridge_mse}, Ridge MAE: {ridge_mae}")
print(f"Lasso MSE: {lasso_mse}, Lasso MAE: {lasso_mae}")

Ridge Accuracy: 0.7916666666666666
Lasso Accuracy: 0.8125

Порівняння метрик:
Ridge MSE: 0.20833333333333334, Ridge MAE: 0.20833333333333334
Lasso MSE: 0.1875, Lasso MAE: 0.1875


In [10]:
# Аналіз коефіцієнтів
feature_names = encoded_cat_df.columns.tolist() + numerical_cols
ridge_coef = pd.Series(ridge.coef_, index=feature_names)
lasso_coef = pd.Series(lasso.coef_, index=feature_names)

print("\nНайважливіші фактори за Lasso:")
print(lasso_coef[lasso_coef != 0].sort_values(ascending=False))

print("\nНайважливіші фактори за Ridge:")
print(ridge_coef.sort_values(ascending=False))


Найважливіші фактори за Lasso:
StudentAbsenceDays_Under-7       0.531424
ParentAnsweringSurvey_Yes        0.191785
VisITedResources                 0.180981
Relation_Mum                     0.163361
raisedhands                      0.127536
Discussion                       0.034684
AnnouncementsView                0.030417
PlaceofBirth_SaudiArabia         0.019226
ParentschoolSatisfaction_Good    0.018027
NationalITy_Palestine           -0.009882
gender_M                        -0.102074
GradeID_G-08                    -0.113692
dtype: float64

Найважливіші фактори за Ridge:
StudentAbsenceDays_Under-7       0.563822
NationalITy_SaudiArabia          0.369476
PlaceofBirth_SaudiArabia         0.349739
PlaceofBirth_Tunis               0.276353
NationalITy_Iran                 0.204791
PlaceofBirth_Iran                0.204791
ParentAnsweringSurvey_Yes        0.198527
Topic_Math                       0.195579
Relation_Mum                     0.188663
VisITedResources                 0.1766