In [44]:
import seaborn as sns
# Load the iris dataset through seaborn, then separate the target column
# ("species") from the remaining columns, which serve as predictors.
iris = sns.load_dataset("iris")
y = iris["species"]
X = iris.drop(columns="species")

In [45]:
from sklearn.preprocessing import LabelEncoder
# Encode the species labels as integer codes.
# The labels are read from `iris["species"]` rather than from `y`: this cell
# rebinds `y` to the encoded array, so reading `y.values` would fail with
# AttributeError if the cell were executed a second time. Reading from the
# source column keeps the cell idempotent and keeps `le` fitted on the
# original labels on every run.
le = LabelEncoder()
y = le.fit_transform(iris["species"].values)

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for testing; `stratify=y` splits in a stratified
# fashion using the encoded labels, and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Fit the scaler on the training split only, then apply that same fitted
# scaler to the test split (transform only, no refit).
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

## fit_transform
- fit()과 transform()을 한 번에 수행하는 메소드
- 테스트 데이터에는 fit_transform()을 적용해서는 안 된다. 학습 데이터로 fit한 scaler에 transform()만 적용해야 한다.
- 테스트 데이터에 fit_transform()을 적용하면 scaler 객체가 학습 데이터로 fit 했던 기준(평균, 표준편차)을 모두 무시하고 테스트 데이터를 기반으로 기준을 다시 계산하기 때문이다.

In [47]:
# Fit LDA on the standardized iris training data.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# fit() returns the estimator itself, so construction and fitting are chained.
lda = LinearDiscriminantAnalysis(store_covariance=True).fit(X_train_std, y_train)

# Predicted labels for both splits, used by the evaluation that follows.
y_train_pred = lda.predict(X_train_std)
y_test_pred = lda.predict(X_test_std)

In [48]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Report, one per line: training accuracy, test accuracy, and the test-set
# confusion matrix.
print(
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
    confusion_matrix(y_test, y_test_pred),
    sep="\n",
)

0.9809523809523809
1.0
[[15  0  0]
 [ 0 15  0]
 [ 0  0 15]]


In [49]:
# Dump the fitted LDA attributes, each preceded by a label line.
# NOTE(review): the "codf" label is presumably a typo for "coef"; it is kept
# unchanged here so the printed text still matches the recorded output.
print("means", lda.means_, sep="\n")
print("xbar", lda.xbar_, sep="\n")
print("covariance", lda.covariance_, sep="\n")
print("intercept", lda.intercept_, sep="\n")
print("codf", lda.coef_, sep="\n")

means
[[-1.00472855  0.84898398 -1.30082725 -1.25109721]
 [ 0.01720426 -0.67386718  0.27036385  0.1648152 ]
 [ 0.98752429 -0.1751168   1.0304634   1.08628201]]
xbar
[ 5.92349793e-16 -2.28056152e-16 -9.30822715e-17 -2.01002550e-17]
covariance
[[0.33834011 0.23780165 0.10444926 0.05261614]
 [0.23780165 0.59815444 0.07233365 0.09008138]
 [0.10444926 0.07233365 0.05763234 0.03301815]
 [0.05261614 0.09008138 0.03301815 0.07586104]]
intercept
[-27.75948332  -3.00559413 -16.39539191]
codf
[[  4.68593828   4.94077322 -29.598802   -12.25506085]
 [ -1.80135024  -1.69409546   9.1529667    1.38778412]
 [ -2.88458804  -3.24667776  20.44583531  10.86727672]]


### xbar
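- lda.xbar_ 속성은 클래스별 평균이 아니라 훈련 데이터의 전체 평균(overall mean)을 저장하는 배열로, 길이는 특성 수(n_features)와 같다. solver="svd"(기본값)일 때만 존재한다.
- 각 클래스에 대한 특성의 평균값들은 lda.means_ (shape: n_classes × n_features)에 저장된다.
- 여기서는 표준화한 학습 데이터로 적합했기 때문에 xbar_ 값이 모두 0에 가깝게 나온다.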

In [50]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Fit QDA on the same standardized training data and report its accuracy.
# NOTE(review): the QDA model is bound to the name `lda`, replacing the LDA
# model fitted earlier, and the prediction arrays are overwritten as well.
# The names are kept because other cells may rely on them; consider `qda`.
lda = QuadraticDiscriminantAnalysis(store_covariance=True).fit(X_train_std, y_train)
y_train_pred = lda.predict(X_train_std)
y_test_pred = lda.predict(X_test_std)

# Training accuracy first, then test accuracy, one per line.
print(
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
    sep="\n",
)

0.9809523809523809
0.9777777777777777
