### 1. 데이터셋 생성
- 데이터 로드
- 분할(train-test-split)

#### 데이터 로드

In [1]:
# Load the data

import pandas as pd

# The path is relative to the current working directory.
data = pd.read_csv('test/iris.csv')
# Preview the first rows; as the last expression it becomes the cell output.
data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [2]:
# Data assumption: no missing values.
# info() prints per-column non-null counts and dtypes, so the assumption can be checked here.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  150 non-null    float64
 1   sepal width   150 non-null    float64
 2   petal length  150 non-null    float64
 3   petal width   150 non-null    float64
 4   target        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [3]:
# Data assumption: categorical variables are already preprocessed.
# The only remaining step is turning the species strings into integer class ids.

# Class distribution before encoding.
print(data['target'].value_counts())

# Both non-setosa species share label 1 (binary-classification test).
# Alternative three-class mapping: 'Iris-virginica' : 2
label_encoding = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 1,
}
encoded_target = data['target'].map(label_encoding)
data['target'] = encoded_target

# Class distribution after encoding.
print(data['target'].value_counts())

target
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64
target
1    100
0     50
Name: count, dtype: int64


#### 데이터 분할

##### (1) X/y

In [4]:
# Data split (1): separate the label column from the feature columns.

y = data['target']
X = data.drop(columns='target')

# Display both shapes as a quick sanity check.
X.shape, y.shape

((150, 4), (150,))

##### (2) train/test

In [5]:
# 데이터 분할 (2) train, test 분할

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Optional for classification models: add stratify=y to the options below.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Shapes of the four parts, followed by the mean label of the train and test parts.
X_train.shape, X_test.shape, y_train.shape, y_test.shape, y_train.mean(), y_test.mean()

((120, 4), (30, 4), (120,), (30,), 0.6666666666666666, 0.6666666666666666)

### 2. 데이터 전처리
- 스케일링
- 리샘플링

#### 스케일링

In [6]:
""" TODO : 스케일링 대상 정의 """

# Feature sets handed to the scaling step.
X_TRAIN = X_train
X_TEST = X_test

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Registry of available scalers, keyed by a short name.
scalers = dict(
    std=StandardScaler(),
    mm=MinMaxScaler(),
    rb=RobustScaler(),
)

""" TODO - 스케일링 방법 선택 """
# method_scaler = 'std'
method_scaler = 'mm'

scaler = scalers[method_scaler]

# Fit on the training features only, then apply the same fitted scaler to
# both the training and the test features.
scaler.fit(X_TRAIN)
X_train_scaled = scaler.transform(X_TRAIN)
X_test_scaled = scaler.transform(X_TEST)

# Summary statistics of the scaled training features, with column names restored.
scaled_train_frame = pd.DataFrame(X_train_scaled, columns=X_TRAIN.columns)
scaled_train_frame.describe()

Unnamed: 0,sepal length,sepal width,petal length,petal width
count,120.0,120.0,120.0,120.0
mean,0.443873,0.440625,0.478509,0.451042
std,0.242296,0.185999,0.307237,0.313934
min,0.0,0.0,0.0,0.0
25%,0.235294,0.333333,0.087719,0.083333
50%,0.426471,0.416667,0.570175,0.5
75%,0.617647,0.552083,0.719298,0.708333
max,1.0,1.0,1.0,1.0


#### 리샘플링

In [8]:
""" TODO : 리샘플링 대상 정의 """

# Features and labels handed to the resampling step.
X_TRAIN = X_train
y_TRAIN = y_train

In [9]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Registry of available resamplers, keyed by a short name; seeds are fixed
# so repeated runs give the same resampled set.
samplers = {
    'rus' : RandomUnderSampler(random_state=42),
    'ros' : RandomOverSampler(random_state=42),
    'smote' : SMOTE(random_state=42)
}

""" TODO - 리샘플링 방법 선택 """
method_sampler = 'smote'
# Interactive alternative (disabled): keep prompting until a valid key is entered.
# while True:
#     method_sampler = input('리샘플링 방법 입력(rus/ros/smote) - 올바르게 입력할 때까지 실행됨')
#     if method_sampler in samplers.keys():
#         break

sampler = samplers[method_sampler]

X_train_sampled, y_train_sampled = sampler.fit_resample(X_TRAIN, y_TRAIN)

# Class counts before vs. after resampling. The "before" side reads y_TRAIN,
# i.e. the labels that were actually passed to fit_resample, so the comparison
# stays correct when the resampling target is changed.
y_TRAIN.value_counts(), y_train_sampled.value_counts()

(target
 1    80
 0    40
 Name: count, dtype: int64,
 target
 0    80
 1    80
 Name: count, dtype: int64)

### 3. 모델링(Modeling)
- 분류모델

- 회귀모델


In [10]:
# [TODO] Define the data used for training and prediction.

# Training side.
X_TRAIN = X_train
y_TRAIN = y_train

# Evaluation side.
X_TEST = X_test
y_TEST = y_test

#### 분류모델

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers keyed by a short name; seeds are fixed for repeatability.
models_clf = dict(
    lr=LogisticRegression(random_state=42),
    dt=DecisionTreeClassifier(random_state=42),
    rf=RandomForestClassifier(random_state=42),
)

#### 회귀모델

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors keyed by a short name; the tree-based ones get a fixed seed.
models_rg = dict(
    lr=LinearRegression(),
    dt=DecisionTreeRegressor(random_state=42),
    rf=RandomForestRegressor(random_state=42),
)

#### 모델 학습 및 예측

In [13]:
""" [TODO] - 모델링 방법 설정 """
method_type = 'clf' # clf, rg
method_model = 'rf' # lr, dt, rf, ...

# Create the model: pick the registry matching the task type, then the
# estimator registered under the chosen key.
if method_type == 'clf':
    model = models_clf[method_model]
elif method_type == 'rg':
    model = models_rg[method_model]
else:
    # Raising a plain string is itself a TypeError in Python 3, which would
    # hide this message; raise a real exception instead.
    raise ValueError("clf or rg 중에 택1")

# Train on the data selected in the data-definition cell (X_TRAIN / y_TRAIN),
# so switching that cell to scaled or resampled data takes effect here.
model.fit(X_TRAIN, y_TRAIN)

# Predict on the matching evaluation features.
y_pred = model.predict(X_TEST)

### 4. 성능평가(Evaluation)
- 분류모델 : Accuracy, Precision, Recall, F1-score
- 회귀모델 : MAE, MSE, RMSE, R2_score

#### 분류모델 : Accuracy, Precision, Recall, F1-score

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score against y_TEST, the labels paired with the X_TEST used for prediction,
# rather than the raw y_test split.
# The default average='binary' only accepts two-class targets; fall back to
# macro averaging when more than two labels are present (e.g. the three-class
# mapping of the iris target).
average = 'macro' if len(set(y_TEST) | set(y_pred)) > 2 else 'binary'

accuracy = accuracy_score(y_TEST, y_pred)
precision = precision_score(y_TEST, y_pred, average=average)
recall = recall_score(y_TEST, y_pred, average=average)
f1 = f1_score(y_TEST, y_pred, average=average)

accuracy, precision, recall, f1

(1.0, 1.0, 1.0, 1.0)

#### 회귀모델 : MAE, MSE, RMSE, R2_score


In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score against y_TEST, the targets paired with the X_TEST used for prediction,
# rather than the raw y_test split.
mae = mean_absolute_error(y_TEST, y_pred)
mse = mean_squared_error(y_TEST, y_pred)
# RMSE is the square root of the MSE.
rmse = mse ** (1/2)
r2 = r2_score(y_TEST, y_pred)

print(mae, mse, rmse, r2)

0.0 0.0 0.0 1.0
