## Scikit-Learn 라이브러리
- 머신러닝 라이브러리



In [None]:
%pip install scikit-learn==1.0.2

In [None]:
import sklearn
# Print the version string of the imported scikit-learn package.
print(sklearn.__version__)

In [3]:
# Classification - 꽃 분류
from sklearn.datasets import load_iris
import pandas as pd

In [None]:
# Load the iris dataset object, then show its `data` attribute as the cell's
# last expression.
iris = load_iris()
iris.data

In [6]:
# train_test_split() divides the samples into a training part and a test part.
from sklearn.model_selection import train_test_split

# Unpacked in the order the call returns them:
#   X_train / X_test -> training / test feature rows
#   y_train / y_test -> training / test labels
# test_size=0.2 holds out 20% of the rows; random_state is pinned to 123.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=123,
)

y_test

array([1, 2, 2, 1, 0, 2, 1, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 0, 2,
       0, 2, 0, 0, 0, 2, 2, 0])

In [9]:
# Import the algorithm that will be trained
from sklearn.tree import DecisionTreeClassifier # classification algorithm

# Build the (still unfitted) classifier with random_state pinned to 123.
dt_clf = DecisionTreeClassifier(random_state=123)
dt_clf

DecisionTreeClassifier(random_state=123)

In [10]:
# Train on the training split (X_train, y_train) -- the "practice exam"
dt_clf.fit(X_train, y_train) # fit() - runs the training

DecisionTreeClassifier(random_state=123)

In [11]:
# Now take the real exam: the test split (X_test, y_test) -- the "entrance exam"
# predict() -> ask the fitted model for its predictions

predict = dt_clf.predict(X_test)
predict

array([1, 2, 2, 1, 0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 0, 2,
       0, 2, 0, 0, 0, 2, 2, 0])

In [None]:
# 5. Grade the predictions
# accuracy_score()

from sklearn.metrics import accuracy_score
accuracy_score(y_test, predict) # true labels, predicted labels

In [None]:
# Put the test features next to the true label ('answer') and the model's
# output ('predict') so each row can be compared by eye.
df = pd.DataFrame(X_test, columns=iris.feature_names).assign(
    answer=y_test,
    predict=predict,
)
df

In [None]:
# Fit on the complete dataset and then predict those very same rows.
# NOTE: the accuracy shown here is measured on the data the model was fitted
# on, so it says nothing about how the model does on unseen samples.
iris = load_iris()
dt_clf = DecisionTreeClassifier(random_state=123).fit(iris.data, iris.target)
pred = dt_clf.predict(iris.data)
accuracy_score(iris.target, pred)

## 교차검증
- (1) K Fold 검증
- (2) Stratified 검증
- (3) cross_val_score

In [None]:
# (1) KFold

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load iris and keep its `data` / `target` attributes under the names that the
# K-Fold loop in this cell passes to train_and_evaluate().
iris = load_iris()
train_data = iris.data
train_target = iris.target

# 모델 학습 및 평가 함수 정의
def train_and_evaluate(train_index, test_index, model, traind_data, train_target):
    """Fit ``model`` on one fold's training rows and score it on the held-out rows.

    Args:
        train_index: Index selection for the rows used to fit the model.
        test_index: Index selection for the rows used to evaluate it.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place by this call.
        traind_data: Indexable feature container (the spelling is part of the
            signature and is kept as-is).
        train_target: Indexable label container, indexed with the same
            selections as ``traind_data``.

    Returns:
        A tuple ``(accuracy, n_train_rows, n_test_rows, test_index)`` where
        ``accuracy`` is ``accuracy_score`` of the held-out labels against the
        predictions, the two counts come from ``.shape[0]`` of the selected
        feature rows, and ``test_index`` is handed back unchanged.
    """
    # Slice out both partitions before touching the model.
    fit_features = traind_data[train_index]
    held_out_features = traind_data[test_index]
    fit_labels = train_target[train_index]
    held_out_labels = train_target[test_index]

    model.fit(fit_features, fit_labels)
    fold_accuracy = accuracy_score(held_out_labels, model.predict(held_out_features))

    return (
        fold_accuracy,
        fit_features.shape[0],
        held_out_features.shape[0],
        test_index,
    )

# K-Fold cross-validation: 5 folds. No shuffle argument is passed, so fold
# membership follows KFold's default behaviour.
kfold = KFold(n_splits=5)
cv_accuracy = []
# random_state is pinned (same seed as the other cross-validation cells in this
# file) so the per-fold accuracies are reproducible from run to run.
dt_clf = DecisionTreeClassifier(random_state=156)

for fold, (train_index, test_index) in enumerate(kfold.split(train_data), 1):
    # The returned index array goes to its own name so the loop variable
    # `test_index` is not overwritten mid-iteration.
    accuracy, train_size, test_size, test_idx = train_and_evaluate(
        train_index, test_index, dt_clf, train_data, train_target)
    cv_accuracy.append(accuracy)
    print(f"Fold {fold}: Accuracy = {accuracy:.4f}, Train Size = {train_size}, Test Size = {test_size}")

# Mean of the per-fold accuracies collected above.
print("최종 정확도 평균 값:",  np.mean(cv_accuracy))

In [None]:
# (2) Stratified

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the iris dataset
iris = load_iris()
features = iris.data
label = iris.target

# Classifier with a pinned seed
dt_clf = DecisionTreeClassifier(random_state=156)

# Stratified K-Fold with 3 folds; accuracies are collected per fold
skfold = StratifiedKFold(n_splits=3)
cv_accuracy = []

# Run the cross-validation. Unlike KFold.split(), the labels are passed to
# split() here as well.
fold_splits = skfold.split(features, label)
for fold, (train_index, test_index) in enumerate(fold_splits, start=1):
    accuracy, train_size, test_size, test_idx = train_and_evaluate(
        train_index, test_index, dt_clf, features, label
    )
    cv_accuracy.append(accuracy)

    # Per-fold report
    print(f"{fold}번째 트레이닝")
    print(f"검증 정확도: {accuracy}, 학습 데이터 크기: {train_size}, 검증 데이터 크기: {test_size}")
    print(f"검증 데이터 셋 인덱스 값: {test_idx}")
    print("===" * 10)

# Mean accuracy over all folds
print("\n최종 정확도 평균 값:", np.mean(cv_accuracy))

# 최종 정확도 평균 값: 0.9666666666666666

In [None]:
# (3) cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
import numpy as np

# Load iris and pull out its `data` and `target` attributes.
iris_data = load_iris()
data, label = iris_data.data, iris_data.target

# Classifier with random_state pinned to 156.
dt_clf = DecisionTreeClassifier(random_state=156)

# One call does the splitting, fitting and scoring: cv=5, metric = accuracy.
scores = cross_val_score(dt_clf, data, label, cv=5, scoring="accuracy")

# Per-fold scores and their mean, both rounded to 5 decimals.
print("교차 검증 별 정확도: ", np.round(scores, 5))
mean_score = np.mean(scores)
print("평균 검증 정확도: ", np.round(mean_score, 5))

# 교차 검증 별 정확도:  [0.96667 0.96667 0.9     0.96667 1.     ]
# 평균 검증 정확도:  0.96

## 데이터 인코딩

- Label Encoding
- OneHot Encoding
- get_dummies

In [24]:
from sklearn.preprocessing import LabelEncoder

# District names to encode; some names appear more than once.
items = [
    '강남구', '서초구', '송파구', '노원구',
    '마포구', '마포구', '용산구', '용산구',
]

# fit() returns the encoder itself, so construction and fitting are chained;
# transform() then maps every name to its integer code.
encoder = LabelEncoder().fit(items)
labels = encoder.transform(items)
labels

array([0, 3, 4, 1, 2, 2, 5, 5])

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# reshape(-1, 1) turns the flat list of district names into a single-column
# 2-D array before it is handed to the encoder.
items = np.array(
    ['강남구', '서초구', '송파구', '노원구', '마포구', '마포구', '용산구', '용산구']
).reshape(-1, 1)

# fit() returns the encoder, so it is chained onto the constructor.
oh_encoder = OneHotEncoder().fit(items)
oh_labels = oh_encoder.transform(items)

# Print the encoded values (converted with .toarray()) and then their shape,
# each under its own heading line.
print("원-핫 인코딩 데이터", oh_labels.toarray(), sep="\n")
print("원-핫 인코딩 데이터 차원", oh_labels.shape, sep="\n")

In [None]:
# pd.get_dummies()
import pandas as pd
# One district name per row in a single 'items' column; repeats are kept.
district_names = [
    '강남구', '서초구', '송파구', '노원구',
    '마포구', '마포구', '용산구', '용산구',
]
df = pd.DataFrame({'items': district_names})

# get_dummies is a pandas module-level function (not a DataFrame method);
# the encoded frame is the cell's displayed result.
pd.get_dummies(df)

In [28]:
import pandas as pd

# Two categorical columns, five rows.
data = dict(
    Fruit=['Apple', 'Banana', 'Cherry', 'Apple', 'Cherry'],
    Color=['Red', 'Yellow', 'Red', 'Green', 'Red'],
)
df = pd.DataFrame(data)

# Encode both columns; each new column is named "<prefix>_<category>", with
# the source column name used as the prefix.
categorical_cols = ['Fruit', 'Color']
df_encoded = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)
df_encoded