# Overfitting


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load datase and Feature & Target setting 

train = pd.read_csv('train.csv')

X = train.drop(columns=["Outcome", "ID"])
y = train["Outcome"]

In [3]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False) 

In [4]:
display(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(521, 8)

(131, 8)

(521,)

(131,)

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

overfitting_model = DecisionTreeClassifier() 
overfitting_model.fit(X_train, y_train)

In [12]:
# train data accuracy
y_pred_valid = overfitting_model.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_valid)

In [13]:
# Validation data accuracy
y_pred_valid = overfitting_model.predict(X_valid)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)

In [14]:
print(f"⚠️ 과적합 모델 - Train 데이터 정확도: {accuracy_train:.4f}")
print(f"⚠️ 과적합 모델 - Validation 데이터 정확도: {accuracy_valid:.4f}")
print("Train 데이터에서 정확도가 높은데 Validation 데이터에서 크게 떨어진다면 과적합이 의심됩니다!")

⚠️ 과적합 모델 - Train 데이터 정확도: 1.0000
⚠️ 과적합 모델 - Validation 데이터 정확도: 0.6183
Train 데이터에서 정확도가 높은데 Validation 데이터에서 크게 떨어진다면 과적합이 의심됩니다!


- 모델이 훈련 데이터에서 100% 정확도를 보인다는 것은, 훈련 데이터의 모든 샘플을 완벽하게 맞췄다는 뜻
- 즉, 모델이 훈련 데이터에 지나치게 적응(암기)한 상태 일 확률이 높음
- 따라서 새로운 데이터에는 제대로 예측을 할 수 없을 수 있다. 

## 의사결정나무(decision tree) model fitting and evaluation

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

overfit_model = DecisionTreeClassifier()
overfit_model.fit(X_train, y_train)



# 최종 모델 구축 및 제출 모든 과정 종합

In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

sample_submission = pd.read_csv('sample_submission.csv')

print("Train data shape:", train.shape)
print("Test data shape:", test.shape)
print("Sample submission shape:", sample_submission.shape)
print("Train Head:\n", train.head(5))
print("Test Head:\n", test.head(5))
print("Sample Submission Head:\n", sample_submission.head(5))

Train data shape: (652, 10)
Test data shape: (116, 9)
Sample submission shape: (116, 2)
Train Head:
           ID  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  \
0  TRAIN_000            4      103             60             33      192   
1  TRAIN_001           10      133             68              0        0   
2  TRAIN_002            4      112             78             40        0   
3  TRAIN_003            1      119             88             41      170   
4  TRAIN_004            1      114             66             36      200   

    BMI  DiabetesPedigreeFunction  Age  Outcome  
0  24.0                     0.966   33        0  
1  27.0                     0.245   36        0  
2  39.4                     0.236   38        0  
3  45.3                     0.507   26        0  
4  38.1                     0.289   21        0  
Test Head:
          ID  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  \
0  TEST_000            5      112             66

In [6]:
# 혈압 (BloodPressure) 범주 화 및 숫자로 변환

def categorize_blood_pressure(bp):
    if bp < 80:
        return 'Low'
    elif 80 <= bp < 120:
        return 'Normal'
    elif 120 <= bp < 140:
        return 'Elevated'
    else:
        return 'High'

In [None]:
train["BP_Category_apply"] = train["BloodPressure"].apply(categorize_blood_pressure)
test["BP_Category_apply"] = test["BloodPressure"].apply(categorize_blood_pressure)

TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
bp_mapping = { "Low": 0, "Normal": 1, "Elevated": 2, "High": 3 }
train["BP_Category_apply"] = train["BP_Category_apply"].replace(bp_mapping)
test["BP_Category_apply"] = test["BP_Category_apply"].replace(bp_mapping)

  train["BP_Category_apply"] = train["BP_Category_apply"].replace(bp_mapping)
  test["BP_Category_apply"] = test["BP_Category_apply"].replace(bp_mapping)


In [8]:
train = train.drop(columns=["BloodPressure"])
test = test.drop(columns=["BloodPressure"])

In [9]:
X_train = train.drop(columns=["ID", "Outcome"])
y_train = train["Outcome"]
X_test = test.drop(columns=["ID"])

In [10]:
print("X_train shape:", X_train.shape)
X_train.head(5)

X_train shape: (652, 8)


Unnamed: 0,Pregnancies,Glucose,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,BP_Category_apply
0,4,103,33,192,24.0,0.966,33,Low
1,10,133,0,0,27.0,0.245,36,Low
2,4,112,40,0,39.4,0.236,38,Low
3,1,119,41,170,45.3,0.507,26,Normal
4,1,114,36,200,38.1,0.289,21,Low
