In [None]:
# Root of the raw-content URLs for the course repository (main branch).
git_url = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert_v2/refs/heads/main"
# Folder holding the Part 1 / Chapter 3 files; keeps its trailing slash.
part1_ch3 = f"{git_url}/part1/ch3/"
# Full URL of the chapter's CSV, derived from the folder URL instead of repeated.
data_url = f"{part1_ch3}type1_data1.csv"

## 작업 2유형: 머신러닝 - 이진분류

### 가짜 데이터셋 생성1: make_classification

In [2]:
from sklearn.datasets import make_classification
import pandas as pd

# Generate a synthetic binary-classification dataset; random_state pins the
# draw so that re-running the cell reproduces the same arrays.
X, y = make_classification(
    n_samples=1000,    # number of samples
    n_features=5,      # total number of features
    n_informative=3,   # informative features
    n_redundant=1,     # redundant features
    n_classes=2,       # binary classification
    random_state=42,
)

# Convert to a DataFrame: columns feature1..feature5, with the labels appended
# as a trailing 'target' column.
df = pd.DataFrame(
    X, columns=[f'feature{i}' for i in range(1, X.shape[1] + 1)]
).assign(target=y)

print(df.head())

   feature1  feature2  feature3  feature4  feature5  target
0 -0.038769 -0.649239 -0.224746 -1.346275  0.126879       0
1  1.005284 -1.373239  1.157346  0.126493  1.422799       0
2 -0.742455 -0.573257  1.688442 -2.588237  0.762562       0
3 -1.587158  1.758582 -0.930664  0.764614  2.415399       1
4  0.195806 -0.058897 -0.549360  0.777375  1.147261       1


### 가짜 데이터셋 생성2: 판다스를 통해 데이터프레임 복제 및 가공

In [4]:
import numpy as np

# Seed frame: eight hand-written rows, one tuple per row.
df = pd.DataFrame(
    [
        (25, 30000, 0, 0),
        (45, 80000, 1, 1),
        (35, 50000, 1, 0),
        (50, 120000, 1, 1),
        (23, 25000, 0, 1),
        (36, 52000, 1, 0),
        (52, 110000, 1, 0),
        (48, 105000, 1, 1),
    ],
    columns=['age', 'income', 'married', 'buy'],
)

# Stack 200 copies of the seed frame under a fresh 0..N-1 index.
clone_df = pd.concat([df for _ in range(200)], ignore_index=True)

# Add integer noise. The seed is set immediately before the draws and the
# keyword arguments are evaluated left to right, so 'age' noise is drawn
# before 'income' noise and the result is reproducible.
# randint's upper bound is exclusive: age shifts by -3..3, income by -5000..4999.
np.random.seed(42)
clone_df = clone_df.assign(
    age=clone_df['age'] + np.random.randint(-3, 4, size=len(clone_df)),
    income=clone_df['income'] + np.random.randint(-5000, 5000, size=len(clone_df)),
)

print(clone_df.shape, clone_df.head(), sep="\n")

(1600, 4)
   age  income  married  buy
0   28   25804        0    0
1   45   76279        1    1
2   36   52083        1    0
3   53  120986        1    1
4   22   25619        0    1


### EDA

In [5]:
# Rebind df to the replicated, noisy frame. This is an alias, not a copy:
# df and clone_df refer to the same object from here on.
df = clone_df
# Print the column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   age      1600 non-null   int64
 1   income   1600 non-null   int64
 2   married  1600 non-null   int64
 3   buy      1600 non-null   int64
dtypes: int64(4)
memory usage: 50.1 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,age,income,married,buy
count,1600.0,1600.0,1600.0,1600.0
mean,39.2525,71553.169375,0.75,0.5
std,10.686387,35029.641619,0.433148,0.500156
min,20.0,20001.0,0.0,0.0
25%,31.0,42483.75,0.75,0.0
50%,40.5,65971.0,1.0,0.5
75%,49.0,107607.25,1.0,1.0
max,55.0,124911.0,1.0,1.0


In [7]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
age,0
income,0
married,0
buy,0


In [9]:
# Frequency of each value in the 'buy' column, to check how balanced it is.
df['buy'].value_counts()

Unnamed: 0_level_0,count
buy,Unnamed: 1_level_1
0,800
1,800


In [10]:
from sklearn.model_selection import train_test_split

# NOTE(review): X and y are the arrays produced by make_classification earlier
# in the notebook, not the `df` frame explored in the EDA cells.

# Step 1: hold out 20% of the samples as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: take 25% of the remaining 80% for validation, which gives a
# 60/20/20 train/validation/test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

# Report the shape of every split, one per line.
print(
    *(part.shape for part in (X_train, X_val, X_test, y_train, y_val, y_test)),
    sep="\n",
)

(600, 5)
(200, 5)
(200, 5)
(600,)
(200,)
(200,)


In [11]:
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Class probabilities for the validation set: one row per sample, one column
# per class, in the order given by rf.classes_.
pred = rf.predict_proba(X_val)

print(rf.classes_)
pred[:10]

[0 1]


array([[0.83, 0.17],
       [0.77, 0.23],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.86, 0.14],
       [0.98, 0.02],
       [0.99, 0.01],
       [0.  , 1.  ],
       [0.99, 0.01]])

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Retrain with the same seed and predict hard class labels for validation.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
pred = rf.predict(X_val)  # binary class predictions

# The four label-based metrics share the (y_true, y_pred) signature, so they
# are computed in one pass, in the order they are unpacked.
accuracy, precision, recall, f1 = (
    scorer(y_val, pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC AUC needs scores rather than labels: use the positive-class probability.
roc_auc = roc_auc_score(y_val, rf.predict_proba(X_val)[:, 1])

metrics_names = ['정확도', '정밀도', '재현율', 'F1_Score', 'ROC_AUC']
metrics_list = [accuracy, precision, recall, f1, roc_auc]

# Print each metric rounded to three decimal places.
for name, metric in dict(zip(metrics_names, metrics_list)).items():
    print(f"{name}: {round(metric, 3)}")

정확도: 0.93
정밀도: 0.915
재현율: 0.935
F1_Score: 0.925
ROC_AUC: 0.977


In [19]:
# Final check: score the already-fitted model on the held-out test set.
pred = rf.predict(X_test)

# The four label-based metrics share the (y_true, y_pred) signature, so they
# are computed in one pass, in the order they are unpacked.
accuracy, precision, recall, f1 = (
    scorer(y_test, pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC AUC needs scores rather than labels: use the positive-class probability.
roc_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

metrics_names = ['정확도', '정밀도', '재현율', 'F1_Score', 'ROC_AUC']
metrics_list = [accuracy, precision, recall, f1, roc_auc]

# Print each metric rounded to three decimal places.
for name, metric in dict(zip(metrics_names, metrics_list)).items():
    print(f"{name}: {round(metric, 3)}")

정확도: 0.965
정밀도: 0.991
재현율: 0.947
F1_Score: 0.968
ROC_AUC: 0.988
