In [1]:
# seaborn provides the load_dataset helper used to download the iris data.
import seaborn as sns

# Download the iris dataset and bind it to the name `iris`.
iris = sns.load_dataset('iris')

# Label y: the 'species' column.
y = iris['species']

# Feature matrix X: every column except 'species'.
X = iris.drop(columns='species')

In [2]:
# LabelEncoder is the scikit-learn class that maps labels to integer codes.
from sklearn.preprocessing import LabelEncoder

# Learn the distinct species names, then replace each string label in the
# 'species' column with its integer class code.
classle = LabelEncoder()
classle.fit(iris['species'].values)
y = classle.transform(iris['species'].values)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the observations as the test set. Stratifying on y keeps the
# class proportions in both splits; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
    test_size=0.3,
)

In [4]:
# Fit a linear discriminant analysis (LDA) model and predict labels for both
# splits; the accuracy of these predictions is printed in the next cell.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# store_covariance=True so that the fitted model exposes covariance_,
# which is inspected further down. fit() returns the estimator itself.
cld = LinearDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)

# Predicted class labels for the test and train splits.
y_test_pred = cld.predict(X_test)
y_train_pred = cld.predict(X_train)

In [5]:
from sklearn.metrics import accuracy_score

# First line printed: accuracy on the train data.
# Second line printed: accuracy on the test data.
print(
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
    sep='\n',
)

0.9714285714285714
0.9777777777777777


### LDA의 각 판별식을 구하기 위한 추정치
#### 특성변수 sepal_length, sepal_width, petal_length, petal_width에 대한 다음 속성들을 확인함

- 각 클래스별 평균추정치 : means_
- 전체 평균추정치 : xbar_
- 공분산행렬 추정치 : covariance_
- 각 클래스별 판별식의 bias : intercept_
- 계수추정치 : coef_

In [6]:
# Per-class mean estimates of the features (one row per class, one column per feature).
print(cld.means_)

[[5.04285714 3.43428571 1.48857143 0.26571429]
 [5.89714286 2.76       4.22857143 1.3       ]
 [6.51714286 2.92857143 5.49142857 2.00285714]]


In [8]:
# Overall mean estimate of each feature across the training data.
print(cld.xbar_)

[5.81904762 3.04095238 3.73619048 1.18952381]


In [9]:
# Covariance matrix estimate of the features (available because the model
# was created with store_covariance=True).
print(cld.covariance_)

[[0.24214422 0.09216599 0.13204898 0.03123537]
 [0.09216599 0.11137415 0.05049796 0.03388844]
 [0.13204898 0.05049796 0.15023129 0.03387755]
 [0.03123537 0.03388844 0.03387755 0.04198639]]


In [10]:
# Intercept (bias) term of each class's discriminant function.
print(cld.intercept_)

[ -7.59067635  -5.09049202 -43.13606185]


In [11]:
# Coefficient estimates of the discriminant functions (one row per class,
# one column per feature).
print(cld.coef_)

[[  3.76019999  15.13062424 -17.99406267 -21.86481204]
 [ -0.91185753  -4.90401617   4.9041086    3.23562397]
 [ -2.84834246 -10.22660807  13.08995407  18.62918807]]


In [12]:
# Classification result on the test data, shown as a confusion matrix
# (rows: true class, columns: predicted class).
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true=y_test, y_pred=y_test_pred))

[[15  0  0]
 [ 0 14  1]
 [ 0  0 15]]


In [13]:
# Estimated class-membership probabilities for the first five test observations.
# Each output row lists the probabilities in the order setosa, versicolor, virginica.
print(cld.predict_proba(X_test.iloc[:5]))

# Most probable class code per row in the recorded output: [1, 2, 1, 1, 2]

[[1.37493315e-32 5.86839618e-01 4.13160382e-01]
 [4.87880054e-42 4.08368248e-04 9.99591632e-01]
 [1.49908347e-19 9.99998532e-01 1.46841688e-06]
 [2.45317549e-24 9.97417042e-01 2.58295753e-03]
 [1.76506456e-50 1.80298359e-07 9.99999820e-01]]


### 2차판별분석 - QuadraticDiscriminantAnalysis

In [15]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

# Fit quadratic discriminant analysis (QDA) on the training split;
# fit() returns the estimator itself.
cqd = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# NOTE: these names are rebound here, so from this point on they hold the
# QDA predictions rather than the earlier LDA ones.
y_test_pred = cqd.predict(X_test)
y_train_pred = cqd.predict(X_train)

# First line printed: accuracy on the train data.
# Second line printed: accuracy on the test data.
print(
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
    sep='\n',
)

0.9809523809523809
0.9777777777777777


In [16]:
# Confusion matrix of the QDA predictions on the test data
# (rows: true class, columns: predicted class).
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_true=y_test, y_pred=y_test_pred))

[[15  0  0]
 [ 0 14  1]
 [ 0  0 15]]
