In [1]:
# Required library imports
import pandas as pd  # data handling
import numpy as np   # numerical computation
from sklearn.base import clone  # unfitted copies of estimators
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor  # random forests
from sklearn.svm import SVC, LinearSVR  # support vector machines
from sklearn.linear_model import LogisticRegression, Ridge  # linear models
from sklearn.model_selection import train_test_split, cross_val_score  # data splitting and cross-validation
from sklearn.metrics import accuracy_score, mean_squared_error  # evaluation metrics
from sklearn.pipeline import make_pipeline  # chain preprocessing and estimator
from sklearn.preprocessing import StandardScaler  # feature standardisation

In [2]:
# Load both data sets. The files carry an .xls extension but are parsed
# as CSV text by read_csv.
train, test = (pd.read_csv(file_name) for file_name in ('train.xls', 'test.xls'))

train, test

(            id     Sex  Age  Height  Weight  Duration  Heart_Rate  Body_Temp  \
 0            0    male   36   189.0    82.0      26.0       101.0       41.0   
 1            1  female   64   163.0    60.0       8.0        85.0       39.7   
 2            2  female   51   161.0    64.0       7.0        84.0       39.8   
 3            3    male   20   192.0    90.0      25.0       105.0       40.7   
 4            4  female   38   166.0    61.0      25.0       102.0       40.6   
 ...        ...     ...  ...     ...     ...       ...         ...        ...   
 749995  749995    male   28   193.0    97.0      30.0       114.0       40.9   
 749996  749996  female   64   165.0    63.0      18.0        92.0       40.5   
 749997  749997    male   60   162.0    67.0      29.0       113.0       40.9   
 749998  749998    male   45   182.0    91.0      17.0       102.0       40.3   
 749999  749999  female   39   171.0    65.0      19.0        97.0       40.6   
 
         Calories  
 0    

In [4]:
# Target setup
# Independent variables: body measurements + workout characteristics
# Dependent variable: calories burned

target = 'Calories'
features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
# Frequency of each distinct target value.
train[target].value_counts()

Calories
7.0      8350
13.0     7261
11.0     6982
12.0     6951
17.0     6946
         ... 
280.0      41
300.0      32
289.0      32
273.0      29
314.0      26
Name: count, Length: 277, dtype: int64

In [5]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [6]:
# Column dtypes, non-null counts and memory footprint of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          250000 non-null  int64  
 1   Sex         250000 non-null  object 
 2   Age         250000 non-null  int64  
 3   Height      250000 non-null  float64
 4   Weight      250000 non-null  float64
 5   Duration    250000 non-null  float64
 6   Heart_Rate  250000 non-null  float64
 7   Body_Temp   250000 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 15.3+ MB


In [12]:
# Preprocessing: encode Sex as an integer (male -> 0, female -> 1).
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-encoded column would replace the whole
# column with NaN. The dtype guard makes the cell safe to execute repeatedly.
# Only `train` is encoded here.
if not pd.api.types.is_numeric_dtype(train['Sex']):
    train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
train['Sex']

0         0
1         1
2         1
3         0
4         1
         ..
749995    0
749996    1
749997    0
749998    0
749999    1
Name: Sex, Length: 750000, dtype: int64

In [13]:
# Split the training frame into the feature matrix and the target column.
x, y = train[features], train[target]

In [18]:
# Hold out part of the labelled data (train_test_split's default share).
# stratify=y makes the splitter treat each distinct target value as a stratum.
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((562500, 6), (187500, 6), (562500,), (187500,))

In [20]:
# Missing-value count per feature column of the training split.
# (The variable name was misspelled as `x_trian`, which raises NameError.)
x_train.isnull().sum()


Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
dtype: int64

In [21]:
# Missing-value count per feature column of the held-out split.
x_test.isnull().sum()


Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
dtype: int64

In [22]:
# Number of missing target values in the training split.
y_train.isnull().sum()


np.int64(0)

In [23]:
# Number of missing target values in the held-out split.
y_test.isnull().sum()

np.int64(0)

In [24]:
# age_mean = round(x_train['Age'].mean(),0)
# height_mean = round(x_train['Height'].mean(),0)
# weight_mean = round(x_train['Weight'].mean(),0)


In [25]:
# 사용할 모델 정의
models = {
    'RandomForest': RandomForestClassifier(random_state=42),  # 랜덤 포레스트 분류기
    'SVM': SVC(random_state=42, probability=True),  # 서포트 벡터 머신
    'LogisticRegression': LogisticRegression(random_state=42)  # 로지스틱 회귀
}

# 각 모델별 하이퍼파라미터 후보군 정의
param_grid = {
    'RandomForest': [
        {'n_estimators': 100, 'max_depth': None},  # 트리 100개, 깊이 제한 없음
        {'n_estimators': 200, 'max_depth': 5}      # 트리 200개, 최대 깊이 5
    ],
    'SVM': [
        {'C': 1.0, 'kernel': 'rbf'},    # RBF 커널, C=1.0
        {'C': 0.5, 'kernel': 'linear'}  # 선형 커널, C=0.5
    ],
    'LogisticRegression': [
        {'C': 1.0, 'max_iter': 1000},  # 기본 설정
        {'C': 0.1, 'max_iter': 1000}   # 더 강한 정규화
    ]
}

In [26]:
# 객체 검증을 통한 최적 모델 선택
best_score = 0 # 최고 성능 점수
best_model_name = None # 최고 성능 모델 이름
best_model = None # 최고 성능 모델 객체

In [27]:
# 각 모델과 하이퍼파라미터 조합에 대해 교차검증 수행
for model_name, model in models.items():
    print(f"\n--- Testing {model_name} ---")
    for params in param_grid[model_name]:
        model.set_params(**params)  # 하이퍼파라미터 설정
        cv_scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')  # 5-fold 교차검증
        mean_cv = np.mean(cv_scores)  # 평균 교차검증 점수
        print(f"Params: {params}, CV Accuracy: {mean_cv:.4f}")
        
        # 최고 성능 모델 업데이트
        if mean_cv > best_score:
            best_score = mean_cv
            best_model_name = model_name
            best_model = model.set_params(**params)

# 최종 선택된 모델로 테스트셋 평가
best_model.fit(x_train, y_train)  # 최적 모델 학습
y_pred = best_model.predict(x_test)  # 테스트셋 예측
test_acc = accuracy_score(y_test, y_pred)  # 테스트셋 정확도 계산

# 최종 결과 출력
print(f"\nBest Model: {best_model_name}")
print(f"Best CV Score: {best_score:.4f}")
print(f"Test Set Accuracy: {test_acc:.4f}")


--- Testing RandomForest ---


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_forest.py", line 487, in fit
    trees = Parallel(
    ...<2 lines>...
        prefer="threads",
    )(
        delayed(_parallel_build_trees)(
    ...<12 lines>...
        for i, t in enumerate(trees)
    )
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
                                                ~~~~^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_forest.py", line 189, in _parallel_build_trees
    tree._fit(
    ~~~~~~~~~^
        X,
        ^^
    ...<3 lines>...
        missing_values_in_feature_mask=missing_values_in_feature_mask,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\tree\_classes.py", line 472, in _fit
    builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "_tree.pyx", line 153, in sklearn.tree._tree.DepthFirstTreeBuilder.build
  File "_tree.pyx", line 268, in sklearn.tree._tree.DepthFirstTreeBuilder.build
  File "_tree.pyx", line 923, in sklearn.tree._tree.Tree._add_node
  File "_tree.pyx", line 892, in sklearn.tree._tree.Tree._resize_c
  File "_utils.pyx", line 29, in sklearn.tree._utils.safe_realloc
MemoryError: could not allocate 1161822208 bytes

--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_forest.py", line 487, in fit
    trees = Parallel(
    ...<2 lines>...
        prefer="threads",
    )(
        delayed(_parallel_build_trees)(
    ...<12 lines>...
        for i, t in enumerate(trees)
    )
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
                                                ~~~~^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_forest.py", line 189, in _parallel_build_trees
    tree._fit(
    ~~~~~~~~~^
        X,
        ^^
    ...<3 lines>...
        missing_values_in_feature_mask=missing_values_in_feature_mask,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\tree\_classes.py", line 472, in _fit
    builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "_tree.pyx", line 153, in sklearn.tree._tree.DepthFirstTreeBuilder.build
  File "_tree.pyx", line 268, in sklearn.tree._tree.DepthFirstTreeBuilder.build
  File "_tree.pyx", line 923, in sklearn.tree._tree.Tree._add_node
  File "_tree.pyx", line 892, in sklearn.tree._tree.Tree._resize_c
  File "_utils.pyx", line 29, in sklearn.tree._utils.safe_realloc
MemoryError: could not allocate 580911104 bytes
