# Load Packages

In [1]:
# scikit-learn and scikit-image are required.
# If either package is missing, install both with the following command:
# pip install scikit-learn scikit-image


import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage.color
import skimage.io
import skimage.transform
import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.svm
import sklearn.tree
from scipy.stats import reciprocal, uniform
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

%matplotlib inline

# Load Additional Packages (if you want to use other modules in Scikit Learn)

In [2]:
# Load additional scikit-learn packages here if you need them.

# Load Data Points (Do not modify the following block)

In [None]:
# Load every training image from disk, shrink it to a fixed square size,
# convert it to grayscale, and flatten it into one feature row per image.

# Side length (in pixels) that every image is resized to.
image_size = 64
# Class names; each one is also the name of a sub-folder of ./tumor_dataset/Training.
labels = ['glioma_tumor','meningioma_tumor','no_tumor','pituitary_tumor']

images = []  # one resized grayscale image per file, in folder order
y = []       # class name of the image at the same position in `images`
for i in labels:
    folderPath = os.path.join('./tumor_dataset/Training',i)
    for j in os.listdir(folderPath):
        img = skimage.io.imread(os.path.join(folderPath,j),)
        img = skimage.transform.resize(img,(image_size,image_size))
        # NOTE(review): rgb2gray presumably needs a trailing colour-channel axis;
        # a file that is already single-channel may fail here -- confirm against the dataset.
        img = skimage.color.rgb2gray(img)
        images.append(img)
        y.append(i)
        
images = np.array(images)

# Flatten each image into a row of image_size**2 values (one feature per pixel).
X = images.reshape((-1, image_size**2))
y = np.array(y)

In [None]:
# Display the first image found for each class, walking `labels` in order.
# `y` was filled one label folder at a time, so a single forward pass over it
# meets the classes in the same order as `labels`.
j = 0  # index into `labels` of the next class that still needs a preview
for i in range(len(y)):
    # NOTE(review): `in` between two strings is a substring test, not equality;
    # it behaves like `==` here only because no label is a substring of another.
    if y[i] in labels[j]:
        plt.imshow(images[i])
        plt.title("[Index:{}] Label:{}".format(i, y[i]))
        plt.show()
        j += 1
    # Stop as soon as every class has been shown once.
    if j >= len(labels):
        break

In [None]:
# Hold out 30% of the samples as the test set; random_state=0 pins the shuffle
# so the same split is produced on every run.
# NOTE(review): no `stratify=` is passed, so class proportions may differ
# slightly between the two splits.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3, random_state=0)

# Classification with Scikit Learn Library (Programming Assignment)
### Variable Explanation (Do not change variable names)
- 'X_train' holds the feature vectors of the training dataset
- 'y_train' holds the target labels of the training dataset
- 'X_test' holds the feature vectors of the test dataset
- 'y_test' holds the target labels of the test dataset
- 'y_pred' is initialized as a zero vector; fill 'y_pred' with the predicted labels

### Find the best model and hyperparameter for tumor classification
- Find the best random seed as well and fix it to reproduce your result on other computers.

In [None]:

# Baseline random forest trained on the raw pixel features.
# random_state is pinned so that the fitted forest -- and therefore the
# feature_importances_ that the following cells rank and threshold -- is
# identical on every run and on every machine.
rfc = sklearn.ensemble.RandomForestClassifier(max_depth=100, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print('RandomForest : %f' % sklearn.metrics.accuracy_score(y_test, y_pred))

In [None]:
# Print the per-pixel importances of the baseline forest
# (the Korean text in the message means "feature importances").
print("특성 중요도 : \n{}".format(rfc.feature_importances_))

def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence of str, optional
        One tick label per feature. When omitted (the default) matplotlib's
        automatic numeric ticks are kept, which is the only readable choice
        when there are thousands of pixel features.
    """
    importances = model.feature_importances_
    # Take the feature count from the model itself instead of a global array.
    n_features = len(importances)
    plt.barh(range(n_features), importances, align='center')
    if feature_names is not None:
        plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("feature importances")
    plt.ylabel("feature")
    plt.ylim(-1, n_features)
    # Render inside the function so the figure is shown after it is drawn.
    plt.show()

plot_feature_importances(rfc)

In [None]:
# Choose the feature-importance cut-off that gives the best accuracy.
#
# The cut-off is a hyperparameter, so it is scored on a validation split carved
# out of the TRAINING data. The test set is never consulted here; using it to
# pick the threshold would leak test information into the model and inflate the
# final reported accuracy.
# NOTE(review): `rfc` itself was fitted on all of X_train, so the importances
# have seen the validation rows; the validation score is slightly optimistic,
# but the comparison between thresholds stays free of test data.

# Importances ranked from most to least important (descending order).
thresholds = np.sort(rfc.feature_importances_)[::-1]
best_accuracy = 0
best_threshold = 0

X_fit, X_val, y_fit, y_val = sklearn.model_selection.train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Candidate cut-offs are the importances ranked 1000th .. 2499th; the upper
# bound is clamped so a smaller feature count cannot raise an IndexError.
rank_start = 1000
rank_stop = min(2500, len(thresholds))

for i in range(rank_start, rank_stop):
    threshold = thresholds[i]
    # Equal neighbouring importances select exactly the same features, so the
    # repeated fit could never beat the score already recorded -- skip it.
    if i > rank_start and threshold == thresholds[i - 1]:
        continue

    # Keep only the features whose importance is at least `threshold`.
    sfm = SelectFromModel(rfc, threshold=threshold, prefit=True)
    X_fit_selected = sfm.transform(X_fit)
    X_val_selected = sfm.transform(X_val)

    # Train on the fitting part and score on the validation part.
    # n_jobs=-1 only parallelises tree building; results are unchanged for a
    # fixed random_state.
    model = sklearn.ensemble.RandomForestClassifier(random_state=42, n_jobs=-1)
    model.fit(X_fit_selected, y_fit)
    # A separate name is used so the notebook-level `y_pred` is not clobbered
    # with validation-sized predictions.
    val_pred = model.predict(X_val_selected)

    accuracy = sklearn.metrics.accuracy_score(y_val, val_pred)

    # Remember the best-scoring cut-off seen so far.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

# The reported accuracy is the validation accuracy of the winning threshold.
print(f"Best Threshold: {best_threshold:f}")
print(f"Best Accuracy: {best_accuracy:.4f}")

In [None]:
# Build the selector from the already-fitted baseline forest and the chosen
# cut-off, then reduce the training and test matrices with the same mask.
sfm = SelectFromModel(estimator=rfc, threshold=best_threshold, prefit=True)
X_train_selected, X_test_selected = (
    sfm.transform(split) for split in (X_train, X_test)
)

In [None]:
# Randomised hyperparameter search for a random forest on the selected pixels,
# followed by prediction on the test set and persistence of the winning model.

# One seeded generator drives every random draw below, so the candidate values
# -- and hence the selected model -- are the same on every run and machine.
rng = np.random.RandomState(42)

# 10 random depths in [90, 150), plus None (no depth limit).
max_depth = rng.randint(90, 150, 10)
max_depth = np.append(max_depth, None)
# 50 random forest sizes in [100, 400).
n_estimators = rng.randint(100, 400, 50)

# Keys are prefixed with the pipeline step name so they reach the classifier.
param_dist = {
    'rf_classifier__n_estimators': n_estimators,
    'rf_classifier__max_depth': max_depth,
    'rf_classifier__min_samples_split': rng.randint(2, 10, 5),
    'rf_classifier__min_samples_leaf': rng.randint(2, 10, 5),
    # 'sqrt' plus 20 log-uniformly drawn fractions of the features in [0.1, 0.5].
    'rf_classifier__max_features': ['sqrt'] + list(
        reciprocal(0.1, 0.5).rvs(20, random_state=rng)
    ),
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # standardise each feature
    ('rf_classifier', sklearn.ensemble.RandomForestClassifier(random_state=42)),  # random forest classifier
])

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42
)

random_search.fit(X_train_selected, y_train)

# Best hyperparameters found by the search.
best_params = random_search.best_params_

# With the default refit=True the search has already retrained a clone of
# `pipeline` with `best_params` on all of X_train_selected, so that estimator
# is used directly instead of rebuilding and refitting an identical pipeline.
best_pipeline = random_search.best_estimator_
y_pred = best_pipeline.predict(X_test_selected)

# Persist the fitted pipeline for later reuse.
joblib.dump(best_pipeline, 'model.pkl')

### Print accuracy (do not modify the following block)

In [None]:
# Report the fraction of test samples whose label in y_pred matches y_test,
# formatted to two decimal places.
print('Accuracy: %.2f' % sklearn.metrics.accuracy_score(y_test, y_pred))