Используйте набор данных datasets/famcs_students.csv \
binary_cols = [ss, interest, weekend_study, bad_sleep, glasses, anime, study_form, literature]

Используйте целевую переменную $y=$ binary_cols[N % 8], где $N$ - номер в списке группы. Остальные переменные используйте в качестве признаков $X$ (предикторов). Вы можете отобрать наиболее информативные (по вашему экспертному мнению) признаки, но не менее 5.

Необходимо построить несколько моделей бинарной классификации $y$ по признакам $X$.

Разделите случайно исходные данные на 3 выборки:
* тренировочная (70%) - для определения параметров модели
* валидационная (15%) - для подбора гиперпараметров модели
* тестовая (15%) - итоговая оценка качества


In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [58]:
# Load the survey and normalise the misspelled 'cource' header.
data = pd.read_csv('../datasets/famcs_students.csv').rename(columns={'cource': 'course'})

# Re-encode the textual retake counts as floats. pop() removes the raw column
# first, so the numeric 'retake' column is appended as the last column.
retake_codes = {'0': 0, '1': 1, '2+': 2}
data['retake'] = data.pop('retake').map(retake_codes).astype(float)

# Preview every row except the last 30.
data.head(-30)

Unnamed: 0,course,group,stream,sex,age,ss,interest,os,weekend_study,bad_sleep,...,social,sport,miss,study_form,foot_size,eye_color,score,hostel,literature,retake
0,4,11,Прикладная информатика,М,20.0,Нет,Математика,MacOS,Да,Да,...,Экстраверт,"Редко, легкая физкультура",3.0,Бюджет,48.00000,Карие,9.20,"Нет, я из Минска",Да,0.0
1,4,11,Прикладная информатика,Ж,20.0,Нет,Программирование,MacOS,Нет,Нет,...,Экстраверт,"Да, я спортсмен",5.0,Бюджет,39.00000,Зеленые,8.80,"Нет, я из Минска",Да,0.0
2,4,11,Прикладная информатика,Ж,19.0,Нет,Программирование,MacOS,Да,Нет,...,Экстраверт,Вообще нет,10.0,Бюджет,41.00000,Карие,8.80,"Нет, я из Минска",Да,0.0
3,4,11,Прикладная информатика,Ж,20.0,Нет,Математика,MacOS,Да,Да,...,Экстраверт,"Да, я спортсмен",3.0,Бюджет,36.00000,Карие,8.80,"Нет, я из Минска",Да,0.0
4,4,11,Прикладная информатика,М,20.0,Нет,Математика,Windows,Да,Нет,...,Интроверт,Вообще нет,1.0,Бюджет,46.00000,Зеленые,9.00,"Нет, я из Минска",Да,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,4,3,Информатика,Ж,21.0,Нет,Программирование,Windows,Да,Да,...,Интроверт,"Редко, легкая физкультура",5.0,Бюджет,39.00000,Голубые,9.00,"Нет, я из регионов",Да,0.0
93,4,3,Информатика,М,20.0,Нет,Математика,Linux,Нет,Да,...,Экстраверт,"Да, я спортсмен",3.0,Платная,42.00000,Карие,9.42,"Нет, я из Минска",Да,0.0
94,4,3,Информатика,Ж,21.0,Нет,Математика,Windows,Да,Да,...,Экстраверт,Вообще нет,5.0,Бюджет,39.00000,Голубые,8.00,"Нет, я из регионов",Да,0.0
95,4,3,Информатика,М,20.0,Нет,Программирование,MacOS,Да,Да,...,Интроверт,"Редко, легкая физкультура",0.0,Платная,43.00000,Карие,8.00,"Нет, я из Минска",Нет,0.0


In [59]:
# Print the dtype of every column of the loaded frame.
print(data.dtypes)

course               int64
group                int64
stream              object
sex                 object
age                float64
ss                  object
interest            object
os                  object
weekend_study       object
bad_sleep           object
glasses             object
work_experience     object
ai                  object
height             float64
anime               object
social              object
sport               object
miss               float64
study_form          object
foot_size          float64
eye_color           object
score              float64
hostel              object
literature          object
retake             float64
dtype: object


In [69]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

class DataPreprocessing:
    """Turn a raw DataFrame into numeric features plus a 0/1 target column.

    During ``fit`` every non-target column is assigned to one of three groups:

    * binary      -- exactly two distinct non-null values; one-hot encoded
      with ``drop='if_binary'`` so a two-level column yields one indicator;
    * categorical -- any other non-numeric column; one-hot encoded;
    * numerical   -- the remaining columns; scaled with ``StandardScaler``.

    Parameters
    ----------
    target_column : str
        Name of the column holding the class label.
    positive_label : object, default 'Да'
        Target value encoded as 1; every other value (including missing
        values) is encoded as 0.
    """

    def __init__(self, target_column, positive_label='Да'):
        self.categorical_features = []
        self.numerical_features = []
        self.binary_features = []
        self.target_column = target_column
        self.positive_label = positive_label
        # Set by fit(); None means "not fitted yet".
        self.preprocessor = None
        # Kept for backward compatibility; this class never populates them.
        self.y = None
        self.X = None

    def __auto_detect_features(self, df: pd.DataFrame) -> None:
        """Sort the non-target columns of *df* into the three feature lists."""
        # Start from empty lists so that a second fit() does not append the
        # same column names again.
        self.categorical_features = []
        self.numerical_features = []
        self.binary_features = []

        for col in df.columns:
            if col == self.target_column:
                continue

            column = df[col]
            if column.nunique() == 2:
                self.binary_features.append(col)
            elif not pd.api.types.is_numeric_dtype(column):
                # Covers object columns as well as string/category dtypes,
                # none of which can be fed to StandardScaler.
                self.categorical_features.append(col)
            else:
                self.numerical_features.append(col)

    def fit(self, df: pd.DataFrame):
        """Detect the feature groups in *df* and fit the column transformer.

        *df* must contain ``target_column`` (``KeyError`` otherwise).
        Returns ``self`` so calls can be chained.
        """
        import warnings

        self.__auto_detect_features(df)

        if not (df[self.target_column] == self.positive_label).any():
            # Without this hint the encoded target would silently be all 0.
            warnings.warn(
                f"positive_label {self.positive_label!r} does not occur in "
                f"column {self.target_column!r}; the encoded target will be "
                "all zeros.",
                stacklevel=2,
            )

        X = df.drop(columns=[self.target_column])

        transformers = []

        if self.categorical_features:
            categorical_transformer = OneHotEncoder(
                sparse_output=False,
                handle_unknown='ignore'
            )
            transformers.append(('cat', categorical_transformer, self.categorical_features))

        if self.binary_features:
            binary_transformer = OneHotEncoder(
                drop='if_binary',
                sparse_output=False,
                handle_unknown='ignore'
            )
            transformers.append(('bin', binary_transformer, self.binary_features))

        if self.numerical_features:
            numerical_transformer = StandardScaler()
            transformers.append(('num', numerical_transformer, self.numerical_features))

        self.preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')
        self.preprocessor.fit(X)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode *df* with the fitted transformer.

        The result keeps the index of *df*. When *df* contains
        ``target_column`` it is appended as the last column, encoded as
        1 for ``positive_label`` and 0 otherwise; frames without the target
        (e.g. unlabeled data) yield features only.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.preprocessor is None:
            raise RuntimeError('DataPreprocessing must be fitted before transform() is called.')

        has_target = self.target_column in df.columns
        X = df.drop(columns=[self.target_column]) if has_target else df

        result_df = pd.DataFrame(
            self.preprocessor.transform(X),
            columns=self.preprocessor.get_feature_names_out(),
            index=df.index,
        )

        if has_target:
            is_positive = df[self.target_column] == self.positive_label
            result_df[self.target_column] = is_positive.astype('int64')

        return result_df

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit on *df* and return its encoded form."""
        return self.fit(df).transform(df)

In [70]:
# Candidate binary target columns, in the order given by the assignment.
binary_cols = ['ss', 'interest', 'weekend_study', 'bad_sleep', 'glasses', 'anime', 'study_form', 'literature']
# Position in the group list; the assignment picks binary_cols[N % 8].
N = 3
# Derive the index from N instead of hard-coding it a second time.
var_col = binary_cols[N % len(binary_cols)]
print(var_col)

bad_sleep


In [71]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 30% of the rows, then halve that hold-out into
# validation and test parts. Both stages share one seed.
SPLIT_SEED = 4
data_train, temp = train_test_split(data, test_size=0.3, random_state=SPLIT_SEED)
data_val, data_test = train_test_split(temp, test_size=0.5, random_state=SPLIT_SEED)

In [72]:
# Display the column names of the training split.
data_train.columns

Index(['course', 'group', 'stream', 'sex', 'age', 'ss', 'interest', 'os',
       'weekend_study', 'bad_sleep', 'glasses', 'work_experience', 'ai',
       'height', 'anime', 'social', 'sport', 'miss', 'study_form', 'foot_size',
       'eye_color', 'score', 'hostel', 'literature', 'retake'],
      dtype='object')

In [73]:
# Fit the encoders on the training split only, then encode that same split.
# The resulting frame also carries the 0/1-encoded target as its last column.
preprocessor = DataPreprocessing(target_column=var_col)
preprocessor.fit(data_train)
X_train = preprocessor.transform(data_train)
X_train.head()

Unnamed: 0,cat__stream_Информатика,cat__stream_Прикладная информатика,cat__stream_Прикладная математика,cat__os_Linux,cat__os_MacOS,cat__os_Windows,cat__work_experience_1-2 года,cat__work_experience_Меньше года,cat__work_experience_Не работаю,cat__work_experience_более 2 лет,...,bin__study_form_Платная,bin__literature_Нет,num__group,num__age,num__height,num__miss,num__foot_size,num__score,num__retake,bad_sleep
68,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,-0.310597,0.288254,-0.447583,1.106986,-0.088391,-1.834774,1.75662,0
117,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,-0.999653,0.288254,-0.791578,-0.400205,-1.235431,-0.263784,-0.39036,1
14,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.297199,-0.330437,-0.103589,-0.299726,-0.375151,1.181527,-0.39036,1
76,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.526884,-0.330437,-0.65398,-0.601164,0.771889,0.993009,-0.39036,0
86,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,-0.540282,0.288254,0.446802,-0.701643,0.198369,0.364613,-0.39036,1


## 0. Тривиальный классификатор
Всегда выдает наиболее частый класс

In [None]:
# NOTE(review): y_train is not assigned in any of the visible cells; confirm it is defined before this cell is run.
y_train