# 분류 - 물고기 종류

---

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#### 데이터 로드

In [None]:
# Visual Python: Data Analysis > File
# Read ./data/fish.csv into the working DataFrame `df`; the trailing bare
# `df` expression makes the notebook display the loaded table.
df = pd.read_csv(filepath_or_buffer='./data/fish.csv')
df

#### 길이의 제곱, 길이와 무게 비율 컬럼 생성

In [None]:
# Visual Python: Data Analysis > Frame
# Derived features: 'L2' is Length multiplied by itself, 'LKgRatio' is Kg
# divided by Length. Both are written back onto `df` in place.
df['L2'] = df['Length'].mul(df['Length'])
df['LKgRatio'] = df['Kg'].div(df['Length'])
df

#### 컬럼 추가(isTuna) - 참치: 1, 나머지: 0

In [None]:
# Visual Python: Data Analysis > Frame
# Binary target: 1 where Type equals 'tuna', 0 for every other value
# (including missing ones). A vectorized comparison replaces the row-by-row
# Python lambda; the explicit int64 cast keeps the 0/1 integer column.
df['isTuna'] = (df['Type'] == 'tuna').astype('int64')
df

#### 컬럼 추가(TypeNum) - 참치: 0, 연어: 1, 고등어: 2

In [None]:
# Visual Python: Data Analysis > Frame
# Multi-class target: 'tuna' -> 0, 'salmon' -> 1, any other Type -> 2.
# map() leaves unmapped values as NaN, which fillna(2) turns into the
# catch-all class before the column is cast back to integers.
df['TypeNum'] = df['Type'].map({'tuna': 0, 'salmon': 1}).fillna(2).astype('int64')
df

In [None]:
# Visual Python: Visualization > Seaborn
# Scatter plot of Kg against Length, coloured by the TypeNum column.
sns.scatterplot(x='Length', y='Kg', hue='TypeNum', data=df)
plt.show()

# 1. Logistic Regression
- X = ['Length','Depth'], y = isTuna

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['Length', 'Depth']], df['isTuna'])

In [None]:
# Visual Python: Machine Learning > Classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

In [None]:
# Visual Python: Machine Learning > Fit/Predict
model.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
pred = model.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
from sklearn import metrics

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion Matrix
pd.crosstab(y_test, pred, margins=True)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report
print(metrics.classification_report(y_test, pred))

In [None]:
# Visual Python: Machine Learning > Model Info
from sklearn import metrics

# roc_curve returns the false positive rate and the true positive rate for
# each decision threshold, computed from the model's decision scores.
fpr, tpr, thresholds = metrics.roc_curve(y_test, model.decision_function(X_test))
plt.plot(fpr, tpr, label='ROC Curve')
# The x axis carries fpr (1 - specificity) and the y axis carries tpr
# (sensitivity); the labels must follow the plotted data.
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
# Without a legend the label passed to plt.plot is never displayed.
plt.legend()
plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
from sklearn import metrics

# Area under the ROC curve, scored from the model's decision-function
# values on the test split.
metrics.roc_auc_score(y_true=y_test, y_score=model.decision_function(X_test))

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_permutation_importances(model, X_train, y_train, scoring=None, sort=False):
    """Build a table of permutation importances for an estimator.

    Calls scikit-learn's ``permutation_importance`` on ``(X_train, y_train)``
    and returns a DataFrame indexed by feature name with two columns:
    ``Feature_importance`` (the ``importances_mean`` values) and
    ``Percentage`` (those values multiplied by 100), rounded to two decimals.

    Feature names are the column labels when ``X_train`` is a DataFrame;
    otherwise they are generated as ``X0``, ``X1``, ... from
    ``X_train.shape[1]``. ``scoring`` is forwarded unchanged, and
    ``sort=True`` orders the rows by descending importance.
    """
    from sklearn.inspection import permutation_importance

    if isinstance(X_train, pd.DataFrame):
        labels = X_train.columns
    else:
        labels = [f'X{col}' for col in range(X_train.shape[1])]

    result = permutation_importance(model, X_train, y_train, scoring=scoring)

    table = pd.DataFrame(
        result['importances_mean'], index=labels, columns=['Feature_importance']
    )
    table['Percentage'] = 100 * table['Feature_importance']
    if sort:
        table = table.sort_values(by='Feature_importance', ascending=False)
    return table.round(2)
def vp_plot_permutation_importances(model, X_train, y_train, scoring=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of permutation-importance percentages.

    The table comes from ``vp_create_permutation_importances``. With
    ``sort=True`` the bars are ordered by ascending percentage and, when
    ``top_count`` is positive, only the ``top_count`` largest are kept.
    With ``sort=False`` every feature is drawn in table order and
    ``top_count`` is ignored. The figure is shown with ``plt.show()``.
    """
    table = vp_create_permutation_importances(model, X_train, y_train, scoring=scoring, sort=sort)

    percentages = table['Percentage']
    if sort:
        # Re-sort ascending so tail() picks the largest values.
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)
    percentages.plot(kind='barh')

    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
from sklearn.inspection import permutation_importance

# Chart the permutation importances of the current model on the training
# split, sorted by importance.
vp_plot_permutation_importances(model=model, X_train=X_train, y_train=y_train, sort=True)

---

# 2. Support Vector Machine
- X = ['Length','Depth'], y = TypeNum

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['Length', 'Depth']], df['TypeNum'])

In [None]:
# Visual Python: Machine Learning > Classifier
from sklearn.svm import SVC

model = SVC()

In [None]:
# Visual Python: Machine Learning > Fit/Predict
model.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
pred = model.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion Matrix
pd.crosstab(y_test, pred, margins=True)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report
print(metrics.classification_report(y_test, pred))

## 2.1 Support Vector Machine - PCA
- X = ['Length','Depth','Kg','L2','LKgRatio'], y = TypeNum

In [None]:
# Visual Python: Machine Learning > Dimension
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
X_pca = pca.fit_transform(df[['Length', 'Depth', 'Kg', 'L2', 'LKgRatio']])
X_pca

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_pca, df['TypeNum'])

In [None]:
# 모델 생성
model = SVC(kernel='linear')

# 모델 학습
model.fit(X_train, y_train)

# 결과 예측
pred = model.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion Matrix
pd.crosstab(y_test, pred, margins=True)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report
print(metrics.classification_report(y_test, pred))

# 3. Decision Tree
- X = ['Length','Depth','Kg','L2','LKgRatio'], y = TypeNum

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['Length', 'Depth', 'Kg', 'L2', 'LKgRatio']], df['TypeNum'])

In [None]:
# Visual Python: Machine Learning > Classifier
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

In [None]:
# Visual Python: Machine Learning > Fit/Predict
model.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
pred = model.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion Matrix
pd.crosstab(y_test, pred, margins=True)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report
print(metrics.classification_report(y_test, pred))

# 4. Random Forest
- X = ['Length','Depth','Kg','L2','LKgRatio'], y = TypeNum

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['Length', 'Depth', 'Kg', 'L2', 'LKgRatio']], df['TypeNum'])

In [None]:
# Visual Python: Machine Learning > Classifier
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
model.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
pred = model.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion Matrix
pd.crosstab(y_test, pred, margins=True)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report
print(metrics.classification_report(y_test, pred))

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_feature_importances(model, X_train=None, sort=False):
    """Tabulate ``model.feature_importances_`` as a DataFrame.

    The result is indexed by feature name and has two columns:
    ``Feature_importance`` (the raw values) and ``Percentage`` (the raw
    values multiplied by 100), both rounded to two decimals.

    Feature names are the column labels when ``X_train`` is a DataFrame;
    otherwise they are generated as ``X0``, ``X1``, ... with one name per
    importance value. ``sort=True`` orders the rows by descending importance.
    """
    importances = model.feature_importances_

    if isinstance(X_train, pd.DataFrame):
        labels = X_train.columns
    else:
        labels = [f'X{pos}' for pos in range(len(importances))]

    table = pd.DataFrame(importances, index=labels, columns=['Feature_importance'])
    table['Percentage'] = 100 * table['Feature_importance']
    if sort:
        table = table.sort_values(by='Feature_importance', ascending=False)
    return table.round(2)
def vp_plot_feature_importances(model, X_train=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of feature-importance percentages.

    The table comes from ``vp_create_feature_importances``. With
    ``sort=True`` the bars are ordered by ascending percentage and, when
    ``top_count`` is positive, only the ``top_count`` largest are kept.
    With ``sort=False`` every feature is drawn in table order and
    ``top_count`` is ignored. The figure is shown with ``plt.show()``.
    """
    table = vp_create_feature_importances(model, X_train, sort)

    percentages = table['Percentage']
    if sort:
        # Re-sort ascending so tail() picks the largest values.
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)
    percentages.plot(kind='barh')

    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
# Chart the current model's feature importances, sorted by importance,
# using the training frame's column names as labels.
vp_plot_feature_importances(model=model, X_train=X_train, sort=True)

# 5. 하이퍼 파라미터 튜닝(매개변수 최적화)

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['Length', 'Depth', 'Kg', 'L2', 'LKgRatio']], df['TypeNum'])

In [None]:
# Visual Python: Machine Learning > GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

gs = GridSearchCV(RandomForestClassifier(), {'n_estimators': [10,50,100,200,500]}, cv=5)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
gs.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Model Info
best_score = gs.best_score_
best_score

In [None]:
# Visual Python: Machine Learning > Model Info
best_estimator = gs.best_estimator_
best_estimator

In [None]:
# Visual Python: Machine Learning > Model Info
best_params = gs.best_params_
best_params

In [None]:
# Visual Python: Machine Learning > Fit/Predict
pred = gs.predict(X_test)
pred

In [None]:
# Visual Python: Machine Learning > Evaluation
# Confusion Matrix
pd.crosstab(y_test, pred, margins=True)

In [None]:
# Visual Python: Machine Learning > Evaluation
# Classification report
print(metrics.classification_report(y_test, pred))

---

In [None]:
# End of file