Используйте набор данных datasets/famcs_students.csv \
binary_cols = [ss, interest, weekend_study, bad_sleep, glasses, anime, study_form, literature]

Исползуйте целевую переменную $y=$ binary_cols[N % 8], где $N$ - номер в списке группы. Остальные переменные используйте в качестве признаков $X$ (предикторов). Вы можете отобрать наиболее информативные (по вашему экспертному мнению) признаки, но не менее 5.

Необходимо построить несколько моделей бинарной классификации $y$ по признакам $X$.

Разделите случайно исходные данные на 3 выборки:
* тренировочная (70%) - для определения параметров модели
* валидацонная (15%) - для подбора гиперпараметров модели
* тестовую (15%) - итоговая оценка качества


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv('../datasets/famcs_students.csv')
data = data.rename(columns={'cource': 'course', 'retake': 'retake_del'})
data['retake'] = data['retake_del'].map({'0': 0, '1': 1, '2+': 2}).astype(float)
data = data.drop(['retake_del'], axis=1)
data.head(-30)

Unnamed: 0,course,group,stream,sex,age,ss,interest,os,weekend_study,bad_sleep,...,social,sport,miss,study_form,foot_size,eye_color,score,hostel,literature,retake
0,4,11,Прикладная информатика,М,20.0,Нет,Математика,MacOS,Да,Да,...,Экстраверт,"Редко, легкая физкультура",3.0,Бюджет,48.00000,Карие,9.20,"Нет, я из Минска",Да,0.0
1,4,11,Прикладная информатика,Ж,20.0,Нет,Программирование,MacOS,Нет,Нет,...,Экстраверт,"Да, я спортсмен",5.0,Бюджет,39.00000,Зеленые,8.80,"Нет, я из Минска",Да,0.0
2,4,11,Прикладная информатика,Ж,19.0,Нет,Программирование,MacOS,Да,Нет,...,Экстраверт,Вообще нет,10.0,Бюджет,41.00000,Карие,8.80,"Нет, я из Минска",Да,0.0
3,4,11,Прикладная информатика,Ж,20.0,Нет,Математика,MacOS,Да,Да,...,Экстраверт,"Да, я спортсмен",3.0,Бюджет,36.00000,Карие,8.80,"Нет, я из Минска",Да,0.0
4,4,11,Прикладная информатика,М,20.0,Нет,Математика,Windows,Да,Нет,...,Интроверт,Вообще нет,1.0,Бюджет,46.00000,Зеленые,9.00,"Нет, я из Минска",Да,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,4,3,Информатика,Ж,21.0,Нет,Программирование,Windows,Да,Да,...,Интроверт,"Редко, легкая физкультура",5.0,Бюджет,39.00000,Голубые,9.00,"Нет, я из регионов",Да,0.0
93,4,3,Информатика,М,20.0,Нет,Математика,Linux,Нет,Да,...,Экстраверт,"Да, я спортсмен",3.0,Платная,42.00000,Карие,9.42,"Нет, я из Минска",Да,0.0
94,4,3,Информатика,Ж,21.0,Нет,Математика,Windows,Да,Да,...,Экстраверт,Вообще нет,5.0,Бюджет,39.00000,Голубые,8.00,"Нет, я из регионов",Да,0.0
95,4,3,Информатика,М,20.0,Нет,Программирование,MacOS,Да,Да,...,Интроверт,"Редко, легкая физкультура",0.0,Платная,43.00000,Карие,8.00,"Нет, я из Минска",Нет,0.0


In [5]:
print(data.dtypes)

course               int64
group                int64
stream              object
sex                 object
age                float64
ss                  object
interest            object
os                  object
weekend_study       object
bad_sleep           object
glasses             object
work_experience     object
ai                  object
height             float64
anime               object
social              object
sport               object
miss               float64
study_form          object
foot_size          float64
eye_color           object
score              float64
hostel              object
literature          object
retake             float64
dtype: object


In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

class DataPreprocessing:
    def __init__(self, target_column):
        self.categorical_features = []
        self.numerical_features = []
        self.binary_features = []
        self.target_column = target_column
        self.y = None
        self.X = None
    
    def __auto_detect_features(self, df: pd.DataFrame):
        for col in df.columns:
            if col == self.target_column:
                continue
            
            if df[col].nunique() == 2:
                self.binary_features.append(col)
            elif df[col].dtype == object:
                self.categorical_features.append(col)
            else:
                self.numerical_features.append(col)

    def fit(self, df: pd.DataFrame):
        self.__auto_detect_features(df)

        X = df.drop([self.target_column], axis=1)
        y = df[self.target_column]

        transformers = []

        if self.categorical_features:
            categorical_transformer = OneHotEncoder(
                drop='first',
                sparse_output=False,
                handle_unknown='ignore'
            )
            transformers.append(('cat', categorical_transformer, self.categorical_features))
        
        if self.binary_features:
            binary_transformer = OneHotEncoder(
                drop='if_binary',
                sparse_output=False,
                handle_unknown='ignore'
            )
            transformers.append(('bin', binary_transformer, self.binary_features))

        if self.numerical_features:
            numerical_transformer = StandardScaler()
            transformers.append(('num', numerical_transformer, self.numerical_features))
        
        self.preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

        self.preprocessor.fit(X)
        return self
    
    def transform(self, df: pd.DataFrame):
        X = df.drop([self.target_column], axis=1)

        transformed_data = self.preprocessor.transform(X)
        y = df[self.target_column].apply(lambda x: 1 if x == 'Да' else 0)

        result_df = pd.DataFrame(
            transformed_data,
            columns = self.preprocessor.get_feature_names_out(),
            index = df.index
        )

        result_df[self.target_column] = y

        return result_df

    def fit_transform(self, df):
        return self.fit(df).transform(df)

In [7]:
binary_cols = ['ss', 'interest', 'weekend_study', 'bad_sleep', 'glasses', 'anime', 'study_form', 'literature']
N = 3 % 8
var_col = binary_cols[3]
print(var_col)

bad_sleep


In [8]:
from sklearn.model_selection import train_test_split

data_train_val, data_test= train_test_split(data, test_size=0.15, random_state=4)

In [9]:
data_train_val.columns

Index(['course', 'group', 'stream', 'sex', 'age', 'ss', 'interest', 'os',
       'weekend_study', 'bad_sleep', 'glasses', 'work_experience', 'ai',
       'height', 'anime', 'social', 'sport', 'miss', 'study_form', 'foot_size',
       'eye_color', 'score', 'hostel', 'literature', 'retake'],
      dtype='object')

In [10]:
preprocessor = DataPreprocessing(target_column=var_col)

data_train_val_p = preprocessor.fit_transform(data_train_val)
data_test_p = preprocessor.transform(data_test)
data_train_val_p.head()

Unnamed: 0,cat__stream_Прикладная информатика,cat__stream_Прикладная математика,cat__os_MacOS,cat__os_Windows,cat__work_experience_Меньше года,cat__work_experience_Не работаю,cat__work_experience_более 2 лет,cat__ai_Claude,cat__ai_Copilot,cat__ai_Cursor,...,bin__study_form_Платная,bin__literature_Нет,num__group,num__age,num__height,num__miss,num__foot_size,num__score,num__retake,bad_sleep
7,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.187259,-0.334413,0.519002,0.272319,0.275464,-0.173564,-0.37789,1
61,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.383866,-0.334413,-1.354928,-0.487335,-0.881823,0.348836,-0.37789,1
113,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.057206,-0.334413,-1.354928,-0.704379,-1.171144,1.393636,-0.37789,1
10,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.187259,-0.334413,0.519002,0.272319,0.275464,-0.173564,-0.37789,1
102,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.608313,-0.334413,0.014482,-0.053247,0.275464,0.544736,-0.37789,1


In [11]:
def data_to_X_y(*args):
    return [(data.drop([var_col], axis=1).values, data[var_col].values) for data in args]

data_train_p, data_val_p = train_test_split(data_train_val_p, test_size=(0.15 / 0.70), random_state=4)

(X_train_p, y_train_p),\
(X_val_p, y_val_p),\
(X_test_p, y_test_p) = data_to_X_y(data_train_p, data_val_p, data_test_p)

print(X_train_p)
print(y_train_p)

[[ 0.          0.          0.         ...  0.2754636   0.08763626
  -0.37788971]
 [ 0.          0.          0.         ... -0.88182288  1.00183623
  -0.37788971]
 [ 0.          0.          0.         ... -1.1711445   0.54473624
  -0.37788971]
 ...
 [ 0.          0.          0.         ...  0.2754636   0.47943624
  -0.37788971]
 [ 0.          0.          0.         ...  0.2754636   0.34883625
  -0.37788971]
 [ 0.          0.          1.         ... -2.29223106 -4.87516357
   3.87834176]]
[1 1 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0
 1 1 1 1 0 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 0
 1 1 1 1 1 0 1 1 1 1]


## 0. Тривиальный классификатор
Всегда выдает наиболее частый класс

In [73]:
class TrivClassifier:
    def fit(self, X, y):
        vals, counts = np.unique(y, return_counts=True)
        self.pred = vals[np.argmax(counts)]

    def predict(self, X_test):
        return np.array([self.pred for x in range(len(X_test))])

In [74]:
model_0 = TrivClassifier()
model_0.fit(X_train_p, y_train_p)

## 1. Наивный байесовский классификатор
Алгоритм реализуйте сами!

P(1|features) = P(features|1) * P(1) / P(features)

* P(1|features) -- posterior prob 
* P(1) -- prior prob
* P(features|1) -- likelihood
* P(features) -- evidence
\
\
P(1) -- class prob \
P(0) -- not class prob


In [77]:
import numpy as np

class NaiveBayesClassifier: 
    def __init__(self):
        self.isTrainded = False
        
    def fit(self, X: np.ndarray, y: np.ndarray):
        classes, cls_counts = np.unique(y, return_counts=True)
        n_classes = len(classes) 
        self.log_priors = np.log(cls_counts / len(y))

        X_bin, X_num = self.bin_num_arr_from_X(X)
        
        # X_num
        self.X_cls_mean = np.array([np.mean(X_num[y == c], axis=0) for c in range(n_classes)])
        self.X_stds = np.array([np.std(X_num[y == c], axis=0) for c in range(n_classes)])
        
        # X_bin 
        self.X_prob = np.array([np.sum(X_bin[y == c], axis=0) / np.sum(y == c) for c in range(n_classes)])
        self.isTrainded = True

    
    def bin_num_arr_from_X(self, X: np.ndarray):
        if (not self.isTrainded):
            n_samples, n_features = X.shape
            X_bin = []  
            X_num = []
            self.binary_features_indices = []
            self.numeric_features_indices = []
            
            for feature in range(n_features):
                unique_vals = np.unique(X[:, feature])
                if len(unique_vals) == 2:
                    X_bin.append(X[:, feature])
                    self.binary_features_indices.append(feature)
                else:
                    X_num.append(X[:, feature])
                    self.numeric_features_indices.append(feature)
            X_bin = np.column_stack(X_bin) if X_bin else np.array([])
            X_num = np.column_stack(X_num) if X_num else np.array([])
            return X_bin, X_num
        X_bin = X[:, self.binary_features_indices] if self.binary_features_indices else np.array([])
        X_num = X[:, self.numeric_features_indices] if self.numeric_features_indices else np.array([])
        return X_bin, X_num

    def log_pdf(self, x, mean, std):
        std_safe = np.maximum(std, 1e-8)
        return -0.5 * ((x - mean) / std_safe) ** 2 - np.log(np.sqrt(2 * np.pi) * std_safe)
    
    def log_probs(self, x, prob):
        prob_safe = np.clip(prob, 1e-8, 1 - 1e-8)
        return np.where(x == 1, np.log(prob_safe), np.log(1 - prob_safe))
    
    def predict(self, X_test):
        n_samples = X_test.shape[0]
        n_classes = len(self.log_priors)

        log_posteriors = np.tile(self.log_priors, (n_samples, 1))
        
        X_test_bin, X_test_num = self.bin_num_arr_from_X(X_test)

        # if X_test_num.size > 0:
        #     for i in range(n_samples):
        #         for c in range(n_classes):
        #             log_likelihood_num = np.sum([self.log_pdf(X_test_num[i, j], self.X_cls_mean[c, j], self.X_stds[c, j]) 
        #                                        for j in range(X_test_num.shape[1])])
        #             log_posteriors[i, c] += log_likelihood_num
        

        if X_test_bin.size > 0:
            for i in range(n_samples):
                for c in range(n_classes):
                    log_likelihood_bin = np.sum(self.log_probs(X_test_bin[i], self.X_prob[c]))
                    log_posteriors[i, c] += log_likelihood_bin
        return np.argmax(log_posteriors, axis=1)

In [79]:
model_1 = NaiveBayesClassifier()
model_1.fit(X_train_p, y_train_p)

In [52]:
pr = model_1.predict(X_test_p)

[[-17.08193611 -15.73033113]
 [-16.44519592 -17.20752672]
 [-16.26618661 -30.45822163]
 [-22.26128093 -35.74698794]
 [-14.01461373 -12.61213304]
 [-21.24788038 -19.77072843]
 [-16.39985183 -13.97099404]
 [-33.30539227 -31.58769336]
 [-16.61420263 -13.03649329]
 [-19.86768548 -21.60988955]
 [-20.22730302 -20.51589244]
 [-19.79915038 -17.72328752]
 [-16.0395995  -14.84604769]
 [-15.18200907 -15.00984577]
 [-16.01710375 -14.67513097]
 [-12.53222083 -12.50494315]
 [-23.32650435 -21.91160249]
 [-15.57766055 -14.97433638]
 [-20.34106659 -22.47956018]
 [-19.39100899 -16.64540971]]


In [53]:
print(pr)
print(y_test_p)

[1 0 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1]
[1 0 1 0 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 1]


## 2. Метод k-ближайших соседей
Алгоритм реализуйте сами! Подберите оптимальный гиперпараметр 
k.

In [60]:
class KNN:
    def __init__(self, k_neighbors=5):
        self.k_neighbors = k_neighbors

    def fit(self, X, y):
        self.X = X
        self.y = y
    
    def predict(self, X_test):
        return np.array([self._make_prediction(x) for x in X_test])
    
    def _make_prediction(self, x_test_i):
        distances = self._euclidean_distances(x_test_i)
        k_nearest_indexes = np.argsort(distances)[:self.k_neighbors]
        targets = self.y[k_nearest_indexes]
        return np.bincount(targets).argmax()

    def _euclidean_distances(self, x_test_i):
        return np.sqrt(np.sum((self.X - x_test_i) ** 2, axis=1))

In [61]:
best_k = 1
best_accuracy = 0
for k in range(1, 100):
    model = KNN(k)
    model.fit(X_train_p, y_train_p)
    def accuracy(pred, y_val):
        return np.sum(pred == y_val) / len(pred)
    pred = model.predict(X_val_p)
    acc = accuracy(pred, y_val_p)
    if acc > best_accuracy:
        best_accuracy = accuracy(pred, y_val_p)
        best_k = k
print(best_k)

5


In [62]:
model_2 = KNN(5)
model_2.fit(X_train_p, y_train_p)

## 3. Логистическая регрессия
Алгоритм реализуйте сами! Изобразите график зависимости метрик precision, recall от порога классификаци, а итоговый порог классификации выберите таким, при котором достигается максимум метрики f1. Используйте валидационную выборку.

In [82]:
class LogisticRegressionGrad:
    def __init__(self, lr=0.01, n_iters=1000, epsilon=1e-6, threshold=0.5):
        self.lr = lr
        self.n_iters = n_iters
        self.bias = None
        self.weights = None
        self.epsilon = epsilon
        self.threshold = threshold

    def sigmoid(self, z):
        z = np.clip(z, -500, 500)  
        return 1 / (1 + np.exp(-z))

    def binary_cross_entropy(self, y_true, y_pred):
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.bias = 0
        self.weights = np.zeros(n_features)

        for i in range(self.n_iters):
            linear_model = X @ self.weights + self.bias
            y_pred = self.sigmoid(linear_model)

            loss = self.binary_cross_entropy(y, y_pred)

            dw = (1 / n_samples) * X.T @ (y_pred - y)
            db = (1 / n_samples) * np.sum(y_pred - y)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db
            
            if np.linalg.norm(dw) < self.epsilon and abs(db) < self.epsilon:
                print(f'Converged at iteration {i}, Loss: {loss:.6f}')
                break

    def predict_proba(self, X_test):
        linear_model = X_test @ self.weights + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X_test):
        linear_model = X_test @ self.weights + self.bias
        y_pred = self.sigmoid(linear_model)
        return (y_pred > self.threshold).astype(int)

In [83]:
best_threshhold = 0.5
best_accuracy = 0
for threshhold_10 in range(1, 10):
    threshhold = threshhold_10 / 10
    model = LogisticRegressionGrad(threshold=threshhold)
    model.fit(X_train_p, y_train_p)
    def accuracy(pred, y_val):
        return np.sum(pred == y_val) / len(pred)
    pred = model.predict(X_val_p)
    acc = accuracy(pred, y_val_p)
    if acc > best_accuracy:
        best_accuracy = accuracy(pred, y_val_p)
        best_threshhold = threshhold

In [84]:
print(best_threshhold)

0.6


In [92]:
model_3 = LogisticRegressionGrad(threshold=0.6)
model_3.fit(X_train_p, y_train_p)

## 4. Оценка качества
Для каждой построенной модели, по отложенной тестовой выборке оцените качество моделей:
accuracy\
precision\
recall\
roc-auc\
постройте матрицу ошибок

In [93]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
import pandas as pd

def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None
    
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred)
    }
    
    if y_proba is not None:
        metrics['ROC-AUC'] = roc_auc_score(y_test, y_proba)
    else:
        metrics['ROC-AUC'] = None

    cm = confusion_matrix(y_test, y_pred)
    
    return metrics, cm

results = []
models = [model_0, model_1, model_2, model_3]

for i, model in enumerate(models):
    metrics, cm = evaluate_model(model, X_test_p, y_test_p, f'model_{i}')
    results.append(metrics)
    print(f"\n--- Model {i} Confusion Matrix ---")
    print(cm)

results_df = pd.DataFrame(results)
print("\n" + "="*50)
print("Summary Results:")
print(results_df.round(4))


--- Model 0 Confusion Matrix ---
[[ 0  6]
 [ 0 14]]

--- Model 1 Confusion Matrix ---
[[ 2  4]
 [ 4 10]]

--- Model 2 Confusion Matrix ---
[[ 0  6]
 [ 2 12]]

--- Model 3 Confusion Matrix ---
[[ 2  4]
 [ 2 12]]

Summary Results:
     Model  Accuracy  Precision  Recall  ROC-AUC
0  model_0       0.7     0.7000  1.0000      NaN
1  model_1       0.6     0.7143  0.7143      NaN
2  model_2       0.6     0.6667  0.8571      NaN
3  model_3       0.7     0.7500  0.8571   0.6548
