Импорт нужных библиотек

In [395]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score,precision_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
import opendatasets as od 

In [396]:
od.download("https://www.kaggle.com/competitions/playground-series-s4e2") 


Skipping, found downloaded files in ".\playground-series-s4e2" (use force=True to force download)


In [397]:
pd.set_option("display.max_columns", None)

In [398]:
df_train = pd.read_csv('playground-series-s4e2/train.csv') 
df_test = pd.read_csv('playground-series-s4e2/test.csv')

In [399]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [400]:
df_train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [401]:
# Расшифровки:
# FAVC - Frequent consumption of high caloric food (Частое употребление высококалорийной пищи). Тип данных: object (вероятно, категориальный признак). Возможные значения: «yes»/«no» или аналогичные категории
# FCVC - Frequency of consumption of vegetables (Частота употребления овощей). Тип данных: int64 (вероятно, количественный признак). Возможные значения: количество раз в день/неделю или аналогичные метрики
# NCP - Number of main meals (Количество основных приемов пищи). Тип данных: int64 (вероятно, количественный признак). Возможные значения: целое число, обозначающее количество основных приемов пищи в день
# CAEC - Consumption of food between meals (Употребление пищи между приемами пищи). Тип данных: object (вероятно, категориальный признак). Возможные значения: «yes»/«no» или аналогичные категории
# CH2O - Consumption of water daily (Употребление воды в день). Тип данных: float64 (вероятно, количественный признак). Возможные значения: количество литров/стаканов воды, употребляемой в день
# SCC - Calories consumption monitoring (Мониторинг потребления калорий). Тип данных: object (вероятно, категориальный признак). Возможные значения: «yes»/«no» или аналогичные категории
# FAF - Physical activity frequency (Частота физической активности). Тип данных: int64 (вероятно, количественный признак). Возможные значения: количество раз в день/неделю или аналогичные метрики
# TUE - Time using technology devices (Время, проводимое за использованием технологических устройств). Тип данных: float64 (вероятно, количественный признак). Возможные значения: количество часов/минут в день, проводимых за использованием устройств
# CALC - Consumption of alcohol (Употребление алкоголя). Тип данных: object (вероятно, категориальный признак). Возможные значения: «yes»/«no» или аналогичные категории

In [402]:
df_train = df_train.rename({'family_history_with_overweight':'FamilyOverweightHistory'},axis='columns')
df_test = df_test.rename({'family_history_with_overweight':'FamilyOverweightHistory'},axis='columns')


In [403]:
df_train.isnull().sum().sort_values(ascending=False)
df_test.isnull().sum().sort_values(ascending=False)

id                         0
Gender                     0
Age                        0
Height                     0
Weight                     0
FamilyOverweightHistory    0
FAVC                       0
FCVC                       0
NCP                        0
CAEC                       0
SMOKE                      0
CH2O                       0
SCC                        0
FAF                        0
TUE                        0
CALC                       0
MTRANS                     0
dtype: int64

In [404]:
df_test.duplicated().sum()

np.int64(0)

In [405]:
# Кодируем значения в df_train
lab_encoder = LabelEncoder()
for i in df_train.columns:
    if df_train[i].dtype == 'object':
        df_train[i] = lab_encoder.fit_transform(df_train[i])
        label_mapping = dict(zip(lab_encoder.classes_, lab_encoder.transform(lab_encoder.classes_)))
        print(label_mapping)

{'Female': np.int64(0), 'Male': np.int64(1)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'Always': np.int64(0), 'Frequently': np.int64(1), 'Sometimes': np.int64(2), 'no': np.int64(3)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'Frequently': np.int64(0), 'Sometimes': np.int64(1), 'no': np.int64(2)}
{'Automobile': np.int64(0), 'Bike': np.int64(1), 'Motorbike': np.int64(2), 'Public_Transportation': np.int64(3), 'Walking': np.int64(4)}
{'Insufficient_Weight': np.int64(0), 'Normal_Weight': np.int64(1), 'Obesity_Type_I': np.int64(2), 'Obesity_Type_II': np.int64(3), 'Obesity_Type_III': np.int64(4), 'Overweight_Level_I': np.int64(5), 'Overweight_Level_II': np.int64(6)}


In [406]:
# Кодируем значения в df_test
lab_encoder = LabelEncoder()
for i in df_test.columns:
    if df_test[i].dtype == 'object':
        df_test[i] = lab_encoder.fit_transform(df_test[i])
        label_mapping = dict(zip(lab_encoder.classes_, lab_encoder.transform(lab_encoder.classes_)))
        print(label_mapping)

{'Female': np.int64(0), 'Male': np.int64(1)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'Always': np.int64(0), 'Frequently': np.int64(1), 'Sometimes': np.int64(2), 'no': np.int64(3)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'no': np.int64(0), 'yes': np.int64(1)}
{'Always': np.int64(0), 'Frequently': np.int64(1), 'Sometimes': np.int64(2), 'no': np.int64(3)}
{'Automobile': np.int64(0), 'Bike': np.int64(1), 'Motorbike': np.int64(2), 'Public_Transportation': np.int64(3), 'Walking': np.int64(4)}


In [407]:
df_train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,1,24.443011,1.699998,81.66995,1,1,2.0,2.983297,2,0,2.763573,0,0.0,0.976473,1,3,6
1,1,0,18.0,1.56,57.0,1,1,2.0,3.0,1,0,2.0,0,1.0,1.0,2,0,1
2,2,0,18.0,1.71146,50.165754,1,1,1.880534,1.411685,2,0,1.910378,0,0.866045,1.673584,2,3,0
3,3,0,20.952737,1.71073,131.274851,1,1,3.0,3.0,2,0,1.674061,0,1.467863,0.780199,1,3,4
4,4,1,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,2,0,1.979848,0,1.967973,0.931721,1,3,6


In [408]:
df_test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,1,26.899886,1.848294,120.644178,1,1,2.938616,3.0,2,0,2.825629,0,0.8554,0.0,2,3
1,20759,0,21.0,1.6,66.0,1,1,2.0,1.0,2,0,3.0,0,1.0,0.0,2,3
2,20760,0,26.0,1.643355,111.600553,1,1,3.0,3.0,2,0,2.621877,0,0.0,0.250502,2,3
3,20761,1,20.979254,1.553127,103.669116,1,1,2.0,2.977909,2,0,2.786417,0,0.094851,0.0,2,3
4,20762,0,26.0,1.627396,104.835346,1,1,3.0,3.0,2,0,2.653531,0,0.0,0.741069,2,3


In [409]:
df_train['Age']=df_train['Age'].astype(int)
df_train['Height']=df_train['Height'].astype(float).round(2)
df_train['Weight']=df_train['Weight'].astype(float).round(2)
df_train['FCVC']=df_train['FCVC'].astype(int)
df_train['NCP']=df_train['NCP'].astype(int)
df_train['CH2O']=df_train['CH2O'].astype(float).round(2)
df_train['FAF']=df_train['FAF'].astype(int)
df_train['TUE']=df_train['TUE'].astype(float).round(2)

In [410]:
df_test['Age']=df_train['Age'].astype(int)
df_test['Height']=df_test['Height'].astype(float).round(2)
df_test['Weight']=df_test['Weight'].astype(float).round(2)
df_test['FCVC']=df_test['FCVC'].astype(int)
df_test['NCP']=df_test['NCP'].astype(int)
df_test['CH2O']=df_test['CH2O'].astype(float).round(2)
df_test['FAF']=df_test['FAF'].astype(int)
df_test['TUE']=df_test['TUE'].astype(float).round(2)

In [411]:
df_train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,1,24,1.7,81.67,1,1,2,2,2,0,2.76,0,0,0.98,1,3,6
1,1,0,18,1.56,57.0,1,1,2,3,1,0,2.0,0,1,1.0,2,0,1
2,2,0,18,1.71,50.17,1,1,1,1,2,0,1.91,0,0,1.67,2,3,0
3,3,0,20,1.71,131.27,1,1,3,3,2,0,1.67,0,1,0.78,1,3,4
4,4,1,31,1.91,93.8,1,1,2,1,2,0,1.98,0,1,0.93,1,3,6


In [412]:
df_test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,1,24,1.85,120.64,1,1,2,3,2,0,2.83,0,0,0.0,2,3
1,20759,0,18,1.6,66.0,1,1,2,1,2,0,3.0,0,1,0.0,2,3
2,20760,0,18,1.64,111.6,1,1,3,3,2,0,2.62,0,0,0.25,2,3
3,20761,1,20,1.55,103.67,1,1,2,2,2,0,2.79,0,0,0.0,2,3
4,20762,0,31,1.63,104.84,1,1,3,3,2,0,2.65,0,0,0.74,2,3


1. Посмотрели данные, проверили пропуски
2. Скорректировали имя колонки
3. Закодировали категориальные данные

In [413]:
df_train['BMI'] = (df_train['Weight'] / pow(df_train['Height'], 2)).round(2)

def categorize(value):
    if value <= 16:
        return 'Very low'
    elif 16 < value <= 18.5:
        return 'Low'
    elif 18.5 < value <= 25:
        return 'Normal'
    elif 25 < value <= 30:
        return 'Above normal'
    elif 30 < value <= 35:
        return '1st class'
    elif 35 < value <= 40:
        return '2nd class'
    else:
        return '3rd class'

df_train['BMICategory'] = df_train['BMI'].map(categorize)

df_test['BMI'] = (df_test['Weight'] / pow(df_test['Height'], 2)).round(2)

def categorize(value):
    if value <= 16:
        return 'Very low'
    elif 16 < value <= 18.5:
        return 'Low'
    elif 18.5 < value <= 25:
        return 'Normal'
    elif 25 < value <= 30:
        return 'Above normal'
    elif 30 < value <= 35:
        return '1st class'
    elif 35 < value <= 40:
        return '2nd class'
    else:
        return '3rd class'

df_test['BMICategory'] = df_test['BMI'].map(categorize)
    


In [414]:
df_train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI,BMICategory
0,0,1,24,1.7,81.67,1,1,2,2,2,0,2.76,0,0,0.98,1,3,6,28.26,Above normal
1,1,0,18,1.56,57.0,1,1,2,3,1,0,2.0,0,1,1.0,2,0,1,23.42,Normal
2,2,0,18,1.71,50.17,1,1,1,1,2,0,1.91,0,0,1.67,2,3,0,17.16,Low
3,3,0,20,1.71,131.27,1,1,3,3,2,0,1.67,0,1,0.78,1,3,4,44.89,3rd class
4,4,1,31,1.91,93.8,1,1,2,1,2,0,1.98,0,1,0.93,1,3,6,25.71,Above normal


In [415]:
df_test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI,BMICategory
0,20758,1,24,1.85,120.64,1,1,2,3,2,0,2.83,0,0,0.0,2,3,35.25,2nd class
1,20759,0,18,1.6,66.0,1,1,2,1,2,0,3.0,0,1,0.0,2,3,25.78,Above normal
2,20760,0,18,1.64,111.6,1,1,3,3,2,0,2.62,0,0,0.25,2,3,41.49,3rd class
3,20761,1,20,1.55,103.67,1,1,2,2,2,0,2.79,0,0,0.0,2,3,43.15,3rd class
4,20762,0,31,1.63,104.84,1,1,3,3,2,0,2.65,0,0,0.74,2,3,39.46,2nd class


In [416]:
# Кодируем значения
lab_encoder = LabelEncoder()
df_train["BMICategory"] = lab_encoder.fit_transform(df_train["BMICategory"])

# Выведем соответствие исходных значений и их кодированных меток
print("Соответствие исходных значений и кодов BMICategory:")
label_mapping = dict(zip(lab_encoder.classes_, lab_encoder.transform(lab_encoder.classes_)))
print(label_mapping)

Соответствие исходных значений и кодов BMICategory:
{'1st class': np.int64(0), '2nd class': np.int64(1), '3rd class': np.int64(2), 'Above normal': np.int64(3), 'Low': np.int64(4), 'Normal': np.int64(5), 'Very low': np.int64(6)}


In [417]:
# Кодируем значения
lab_encoder = LabelEncoder()
df_test["BMICategory"] = lab_encoder.fit_transform(df_test["BMICategory"])

# Выведем соответствие исходных значений и их кодированных меток
print("Соответствие исходных значений и кодов BMICategory:")
label_mapping = dict(zip(lab_encoder.classes_, lab_encoder.transform(lab_encoder.classes_)))
print(label_mapping)

Соответствие исходных значений и кодов BMICategory:
{'1st class': np.int64(0), '2nd class': np.int64(1), '3rd class': np.int64(2), 'Above normal': np.int64(3), 'Low': np.int64(4), 'Normal': np.int64(5), 'Very low': np.int64(6)}


In [418]:
df_train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI,BMICategory
0,0,1,24,1.7,81.67,1,1,2,2,2,0,2.76,0,0,0.98,1,3,6,28.26,3
1,1,0,18,1.56,57.0,1,1,2,3,1,0,2.0,0,1,1.0,2,0,1,23.42,5
2,2,0,18,1.71,50.17,1,1,1,1,2,0,1.91,0,0,1.67,2,3,0,17.16,4
3,3,0,20,1.71,131.27,1,1,3,3,2,0,1.67,0,1,0.78,1,3,4,44.89,2
4,4,1,31,1.91,93.8,1,1,2,1,2,0,1.98,0,1,0.93,1,3,6,25.71,3


In [419]:
# Создаем бины, которые содержат показатели категорий, границы групп. 
bins = [0, 12, 18, 35, 60, float('inf')] 
# Названия возрастных групп
labels = ['Kids', 'Teens', 'Young adults', 'Middle age', 'Elderly'] 
# Функция cut делит данные на интервалы 
df_train['Age_Category'] = pd.cut(df_train['Age'], bins=bins, labels=labels, right=False)

df_train.head() 

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI,BMICategory,Age_Category
0,0,1,24,1.7,81.67,1,1,2,2,2,0,2.76,0,0,0.98,1,3,6,28.26,3,Young adults
1,1,0,18,1.56,57.0,1,1,2,3,1,0,2.0,0,1,1.0,2,0,1,23.42,5,Young adults
2,2,0,18,1.71,50.17,1,1,1,1,2,0,1.91,0,0,1.67,2,3,0,17.16,4,Young adults
3,3,0,20,1.71,131.27,1,1,3,3,2,0,1.67,0,1,0.78,1,3,4,44.89,2,Young adults
4,4,1,31,1.91,93.8,1,1,2,1,2,0,1.98,0,1,0.93,1,3,6,25.71,3,Young adults


In [420]:
# Создаем бины, которые содержат показатели категорий, границы групп. 
bins = [0, 12, 18, 35, 60, float('inf')] 
# Названия возрастных групп
labels = ['Kids', 'Teens', 'Young adults', 'Middle age', 'Elderly'] 
# Функция cut делит данные на интервалы 
df_test['Age_Category'] = pd.cut(df_test['Age'], bins=bins, labels=labels, right=False)

df_test.head() 

Unnamed: 0,id,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI,BMICategory,Age_Category
0,20758,1,24,1.85,120.64,1,1,2,3,2,0,2.83,0,0,0.0,2,3,35.25,1,Young adults
1,20759,0,18,1.6,66.0,1,1,2,1,2,0,3.0,0,1,0.0,2,3,25.78,3,Young adults
2,20760,0,18,1.64,111.6,1,1,3,3,2,0,2.62,0,0,0.25,2,3,41.49,2,Young adults
3,20761,1,20,1.55,103.67,1,1,2,2,2,0,2.79,0,0,0.0,2,3,43.15,2,Young adults
4,20762,0,31,1.63,104.84,1,1,3,3,2,0,2.65,0,0,0.74,2,3,39.46,1,Young adults


In [421]:
# Кодируем значения
lab_encoder = LabelEncoder()
df_train["Age_Category"] = lab_encoder.fit_transform(df_train["Age_Category"])

# Выведем соответствие исходных значений и их кодированных меток
print("Соответствие исходных значений и кодов BMICategory:")
label_mapping = dict(zip(lab_encoder.classes_, lab_encoder.transform(lab_encoder.classes_)))
print(label_mapping)

Соответствие исходных значений и кодов BMICategory:
{'Elderly': np.int64(0), 'Middle age': np.int64(1), 'Teens': np.int64(2), 'Young adults': np.int64(3)}


In [422]:
# Кодируем значения
lab_encoder = LabelEncoder()
df_test["Age_Category"] = lab_encoder.fit_transform(df_test["Age_Category"])

# Выведем соответствие исходных значений и их кодированных меток
print("Соответствие исходных значений и кодов BMICategory:")
label_mapping = dict(zip(lab_encoder.classes_, lab_encoder.transform(lab_encoder.classes_)))
print(label_mapping)

Соответствие исходных значений и кодов BMICategory:
{'Elderly': np.int64(0), 'Middle age': np.int64(1), 'Teens': np.int64(2), 'Young adults': np.int64(3)}


In [423]:
df_train.to_csv("submission_train.csv", index=False)
df_test.to_csv("submission_test.csv", index=False)

In [None]:
# Делим данные на выборки
X_train = df_train.drop(columns=['NObeyesdad'])
X_test = df_test.drop(columns=['NObeyesdad'])
y_train = df_train['NObeyesdad'] 
y_test = df_test['NObeyesdad']

In [425]:
# Делаем значения более приятными для визуального восприятия после стандартизации 
np.set_printoptions(suppress=True)

In [426]:
# Нормализуем данные # Гипотеза 1 - для нашей задачи min-max scaler будет работать лучше 
scaler = MinMaxScaler()
X_train_normalized = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
# К test применяем только трансформ, чтобы избежать утечки данных
X_test_normalized = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)

In [427]:
# Исследуем дисперсию, чтобы задать treshhold
variances = np.var(X_train_normalized, axis=0)
print("Дисперсии признаков:\n", variances)

Дисперсии признаков:
 id                         0.083341
Gender                     0.249996
Age                        0.014353
Height                     0.027264
Weight                     0.043788
FamilyOverweightHistory    0.147833
FAVC                       0.078237
FCVC                       0.086543
NCP                        0.063886
CAEC                       0.022403
SMOKE                      0.011663
CH2O                       0.092567
SCC                        0.032000
FAF                        0.076026
TUE                        0.090637
CALC                       0.056093
MTRANS                     0.082470
BMI                        0.038936
BMICategory                0.079233
Age_Category               0.026647
dtype: float64


In [428]:
X_train_normalized.shape

(20758, 20)

In [429]:
# Метод 1 
selector = VarianceThreshold(threshold=0.02)

X_selected_train = selector.fit_transform(X_train_normalized)
X_selected_test = selector.transform(X_test_normalized)

# Используем get_support чотбы сохранить названия колонок 
selected_features = X_train_normalized.columns[selector.get_support()]

X_selected_train_df = pd.DataFrame(X_selected_train, columns=selected_features)
X_selected_test_df = pd.DataFrame(X_selected_test, columns=selected_features)

X_selected_train_df.head()


Unnamed: 0,id,Gender,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI,BMICategory,Age_Category
0,0.0,1.0,0.471698,0.33849,1.0,1.0,0.5,0.333333,0.666667,0.88,0.0,0.0,0.49,0.5,0.75,0.364002,0.5,1.0
1,4.8e-05,0.0,0.207547,0.142789,1.0,1.0,0.5,0.666667,0.333333,0.5,0.0,0.333333,0.5,1.0,0.0,0.249527,0.833333,1.0
2,9.6e-05,0.0,0.490566,0.088609,1.0,1.0,0.0,0.0,0.666667,0.455,0.0,0.0,0.835,1.0,0.75,0.101466,0.666667,1.0
3,0.000145,0.0,0.490566,0.731953,1.0,1.0,1.0,0.666667,0.666667,0.335,0.0,0.333333,0.39,0.5,0.75,0.757332,0.333333,1.0
4,0.000193,1.0,0.867925,0.434714,1.0,1.0,0.5,0.0,0.666667,0.49,0.0,0.333333,0.465,0.5,0.75,0.30369,0.5,1.0


In [430]:
# Удалено 2 признака, проверям насколько применим данный метод к нашей модели. 
model = LogisticRegression(max_iter=500)  # Увеличиваем max_iter для лучшей сходимости
model.fit(X_selected_train_df, y_train)
y_pred = model.predict(X_selected_test_df)

# Оцениваем модель 
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # average='weighted' для многоклассовой классификации
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Recall: {recall:.2f}')
print(f'Precision: {precision:.2f}')


NameError: name 'y_test' is not defined

In [None]:
# Метод 2  ANOVA - попробуем выбрать 15 лучших признаков 
selector = SelectKBest(f_classif, k=15)

X_anova_train = selector.fit_transform(X_train_normalized, y_train)
X_anova_test = selector.transform(X_test_normalized)  
# Используем get_support чотбы сохранить названия колонок 
selected_features = X_train_normalized.columns[selector.get_support()]

X_anova_train_df = pd.DataFrame(X_anova_train, columns=selected_features)
X_anova_test_df = pd.DataFrame(X_anova_test, columns=selected_features)

X_anova_train_df.head()

Unnamed: 0,Gender,Age,Height,Weight,FamilyOverweightHistory,FAVC,FCVC,NCP,CAEC,CH2O,FAF,CALC,MTRANS,BMI,BMICategory
0,1.0,0.212766,0.471698,0.33849,1.0,1.0,0.5,0.333333,0.666667,0.88,0.0,0.5,0.75,0.364002,0.5
1,0.0,0.085106,0.207547,0.142789,1.0,1.0,0.5,0.666667,0.333333,0.5,0.333333,1.0,0.0,0.249527,0.833333
2,0.0,0.085106,0.490566,0.088609,1.0,1.0,0.0,0.0,0.666667,0.455,0.0,1.0,0.75,0.101466,0.666667
3,0.0,0.12766,0.490566,0.731953,1.0,1.0,1.0,0.666667,0.666667,0.335,0.333333,0.5,0.75,0.757332,0.333333
4,1.0,0.361702,0.867925,0.434714,1.0,1.0,0.5,0.0,0.666667,0.49,0.333333,0.5,0.75,0.30369,0.5


In [None]:
# Удалено 5 признаков, проверям насколько применим данный метод к нашей модели. 
model = LogisticRegression(max_iter=500)  # Увеличиваем max_iter для лучшей сходимости
model.fit(X_anova_train_df, y_train)
y_pred = model.predict(X_anova_test_df)

# Оцениваем модель 
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # average='weighted' для многоклассовой классификации
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Recall: {recall:.2f}')
print(f'Precision: {precision:.2f}')

Accuracy: 0.86
F1 Score: 0.85
Recall: 0.86
Precision: 0.85


In [None]:
# Метод 3 - отбор признаков при помощи Логистической регрессии
pipe = make_pipeline(SelectFromModel(estimator=LogisticRegression(max_iter=500)), LogisticRegression(max_iter=500))

pipe.fit(X_train_normalized, y_train)
y_pred = pipe.predict(X_test_normalized)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # average='weighted' для многоклассовой классификации
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Recall: {recall:.2f}')
print(f'Precision: {precision:.2f}')

Accuracy: 0.82
F1 Score: 0.82
Recall: 0.82
Precision: 0.82
