## 목표 : 생선 분류 모델
- 데이터 : fish.csv
- 피쳐 : 5개 Weight, Length, Diagonal, Height, Width
- 타겟 : 1개 Species
- 방법 : 지도학습 + 다중분류

### (1) 모듈로딩 및 데이터 준비 <hr>

In [2]:
import pandas as pd
import numpy as np

In [3]:
# 데이터 준비
data_file = '../data/fish.csv'
fishDF = pd.read_csv(data_file)
fishDF

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.5200,4.0200
1,Bream,290.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.7300,4.4555
4,Bream,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...
154,Smelt,12.2,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,14.3,15.2,2.8728,2.0672


### (2) 학습 위한 데이터 준비 <hr>

#### (2-1) 피쳐와 타겟 분리

In [4]:
targetDF = fishDF.Species
featureDF = fishDF.drop('Species', axis=1)
featureDF

Unnamed: 0,Weight,Length,Diagonal,Height,Width
0,242.0,25.4,30.0,11.5200,4.0200
1,290.0,26.3,31.2,12.4800,4.3056
2,340.0,26.5,31.1,12.3778,4.6961
3,363.0,29.0,33.5,12.7300,4.4555
4,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...
154,12.2,12.2,13.4,2.0904,1.3936
155,13.4,12.4,13.5,2.4300,1.2690
156,12.2,13.0,13.8,2.2770,1.2558
157,19.7,14.3,15.2,2.8728,2.0672


In [5]:
print(f'featureDF.shape: {featureDF.shape}, targetDF.shape: {targetDF.shape}')

featureDF.shape: (159, 5), targetDF.shape: (159,)


In [6]:
# 타겟 즉 클래스 수 확인
targetDF.nunique()

7

In [7]:
# 타겟 클래스 별 데이터 수 확인
targetDF.value_counts() / targetDF.shape[0]
# 데이터가 불균등함

Species
Perch        0.352201
Bream        0.220126
Roach        0.125786
Pike         0.106918
Smelt        0.088050
Parkki       0.069182
Whitefish    0.037736
Name: count, dtype: float64

#### (2-2) 학습용/테스트용 데이터셋 준비

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                    targetDF,
                                                    stratify=targetDF,
                                                    random_state=11)

In [10]:
print(f'Training set: {X_train.shape}, {y_train.shape} test set: {X_test.shape}, {y_test.shape}')

Training set: (119, 5), (119,) test set: (40, 5), (40,)


### (3) 학습 진행<hr>

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# 모델 인스턴스 생성 및 학습
model = LogisticRegression(max_iter=1000, solver='liblinear')
model.fit(X_train, y_train)

In [13]:
# 모델 파라미터 확인
print(f'classes_ : {model.classes_}')
print(f'feature_names_in_ : {model.feature_names_in_}')
print(f'max_iter : {model.max_iter}')
print(f'coef_ : {len(model.coef_)}개\n{model.coef_}')
print(f'intercept_ : \n{model.intercept_}')


classes_ : ['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
feature_names_in_ : ['Weight' 'Length' 'Diagonal' 'Height' 'Width']
max_iter : 1000
coef_ : 7개
[[ 1.31151754e-02 -1.64944473e+00  8.28009600e-01  1.41621596e+00
  -4.15067210e-01]
 [-2.10617657e-02  3.33701594e-01 -9.64909143e-01  2.19381184e+00
   2.66611700e-02]
 [-1.97453974e-03  2.60616873e+00 -2.66412260e+00 -7.93176709e-03
   1.91659551e+00]
 [ 1.01422059e-02  2.55168743e-01  1.51461260e-01 -1.94779290e+00
  -8.36602128e-01]
 [-9.89829706e-03 -1.72578825e+00  1.53807538e+00 -5.12880032e-01
   1.65750894e+00]
 [-7.29426634e-02  3.82049401e-01  1.62783679e-01 -1.55364795e+00
  -5.97839461e-01]
 [ 5.67536044e-03 -5.15807250e-01  2.50622960e-01 -2.45458511e-01
   8.38223029e-01]]
intercept_ : 
[-0.27362899  0.07982094 -0.34682853 -1.23222237 -1.32590576  0.41907035
 -0.35145235]


### (4) 평가 <hr>

In [14]:
print(f'[Train Score] {model.score(X_train, y_train)}\n[Test Score] {model.score(X_test, y_test)}')

[Train Score] 0.9495798319327731
[Test Score] 0.975


### (5) 모델 활용 <hr>

In [15]:
y_pre = model.predict(X_test.iloc[[0]])
y_pre, y_test[:1]

(array(['Bream'], dtype=object),
 1    Bream
 Name: Species, dtype: object)

In [16]:
model.predict_proba(X_test.iloc[[0]]).round(5)

array([[5.0433e-01, 3.1086e-01, 3.8000e-04, 0.0000e+00, 1.7295e-01,
        0.0000e+00, 1.1480e-02]])

In [17]:
# 5개 데이터에 대한 생선 분류 예측
print(model.classes_)
np.round(model.predict_proba(X_test.iloc[:5]), 3)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


array([[0.504, 0.311, 0.   , 0.   , 0.173, 0.   , 0.011],
       [0.158, 0.73 , 0.044, 0.   , 0.057, 0.   , 0.01 ],
       [0.772, 0.024, 0.001, 0.   , 0.18 , 0.   , 0.023],
       [0.001, 0.089, 0.719, 0.002, 0.155, 0.004, 0.03 ],
       [0.   , 0.021, 0.753, 0.009, 0.176, 0.009, 0.031]])

In [18]:
result = model.predict_proba(X_test.iloc[:5]).argmax(axis=1) # argmax : 결과 인덱스 뽑아줌
result

array([0, 1, 0, 2, 2])

In [19]:
data = {"Pre Y": [model.classes_[idx] for idx in result], "True Y" : y_test[:5].to_list()}

In [20]:
pd.DataFrame(data)

Unnamed: 0,Pre Y,True Y
0,Bream,Bream
1,Parkki,Parkki
2,Bream,Bream
3,Perch,Perch
4,Perch,Perch


In [21]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(targetDF)

model.classes_, encoder.inverse_transform(model.classes_)

ValueError: y contains previously unseen labels: ['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']

In [22]:
print("타겟 라벨 ", model.classes_)
labels=encoder.inverse_transform

타겟 라벨  ['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
