In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# 첫 번째 데이터셋 생성 (기본 버전)
X1, y1 = make_classification(
    n_samples=1000,
    n_features=5,
    n_redundant=0,
    n_informative=3,
    random_state=1,
    n_clusters_per_class=1,
    class_sep=0.5
)

# 특성에 의미 있는 이름 부여
features = ['age', 'income', 'education_years', 'work_experience', 'credit_score']
df1 = pd.DataFrame(X1, columns=features)
df1['target'] = y1

# 데이터 정규화 및 현실적인 값으로 변환
df1['age'] = (df1['age'] * 20 + 30).round()  # 20-60세
df1['income'] = (df1['income'] * 30000 + 50000).round()  # 20000-80000
df1['education_years'] = (df1['education_years'] * 8 + 12).round()  # 4-20년
df1['work_experience'] = (df1['work_experience'] * 15 + 10).round()  # 0-25년
df1['credit_score'] = (df1['credit_score'] * 300 + 500).round()  # 200-800

# 두 번째 데이터셋 생성 (개선된 버전 - 더 명확한 클래스 구분)
X2, y2 = make_classification(
    n_samples=1000,
    n_features=5,
    n_redundant=0,
    n_informative=3,
    random_state=2,
    n_clusters_per_class=1,
    class_sep=1.0  # 클래스 간 구분을 더 명확하게
)

df2 = pd.DataFrame(X2, columns=features)
df2['target'] = y2

# 두 번째 데이터셋도 같은 방식으로 변환
df2['age'] = (df2['age'] * 20 + 30).round()
df2['income'] = (df2['income'] * 30000 + 50000).round()
df2['education_years'] = (df2['education_years'] * 8 + 12).round()
df2['work_experience'] = (df2['work_experience'] * 15 + 10).round()
df2['credit_score'] = (df2['credit_score'] * 300 + 500).round()

# CSV 파일로 저장
df1.to_csv('./datas/dataset_1.csv', index=False)
df2.to_csv('./datas/dataset_2.csv', index=False)

# 데이터셋 정보 출력
print("Dataset 1 shape:", df1.shape)
print("\nDataset 1 head:")
print(df1.head())
print("\nDataset 1 description:")
print(df1.describe())

print("\n" + "="*50 + "\n")

print("Dataset 2 shape:", df2.shape)
print("\nDataset 2 head:")
print(df2.head())
print("\nDataset 2 description:")
print(df2.describe())

Dataset 1 shape: (1000, 6)

Dataset 1 head:
    age   income  education_years  work_experience  credit_score  target
0  37.0  76222.0             16.0              8.0         314.0       0
1  62.0   6438.0              7.0             13.0         414.0       1
2  -3.0  53024.0             16.0             20.0         426.0       1
3  39.0  23695.0             -0.0              3.0          47.0       1
4  38.0  -3427.0             15.0             18.0        1082.0       0

Dataset 1 description:
              age         income  education_years  work_experience  \
count  1000.00000    1000.000000        1000.0000      1000.000000   
mean     30.58400   50482.329000          11.9020        17.473000   
std      20.27891   31460.735692           8.3453        15.018749   
min     -53.00000  -46769.000000         -15.0000       -35.000000   
25%      23.00000   28649.250000           6.0000         9.000000   
50%      37.00000   50374.500000          12.0000        17.000000   
75% 