In [1]:
import pandas as pd 
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
df = pd.read_csv('../static/data/cancer_train.csv')
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,25.73,17.46,174.2,2010.0,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,...,23.58,229.3,3234.0,0.153,0.5937,0.6451,0.2756,0.369,0.08815,0
1,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,...,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192,1
2,17.35,23.06,111.0,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,...,31.47,128.2,1218.0,0.124,0.1486,0.1211,0.08235,0.2452,0.06515,0


In [3]:
y_train = df.target.values
scaler = MinMaxScaler()
X_train = scaler.fit_transform(df.drop(columns='target', axis=1))
X_train.shape, y_train.shape

((426, 30), (426,))

In [4]:
df = pd.read_csv('../static/data/cancer_test.csv')
y_test = df.target.values
X_test = scaler.fit_transform(df.drop(columns='target', axis=1))
X_test.shape, y_test.shape

((426, 30), (426,))

### 1. Logistic Regression

In [5]:
lr_clf = LogisticRegression()
lr_clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [6]:
params = {
    'C': [1, 3, 5, 7, 10, 20]
    # 'C': [0.1, 1, 5, 10]
}

In [7]:
grid_cv = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_) # best_params_: 좋은 파라미터를 보여줌.

최고 평균 정확도: 0.9741
최적 파라미터: {'C': 5}


In [8]:
best_lr = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_lr.predict(X_test)
acc_lr = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_lr:.4f}')

평균 정확도 : 0.9765


In [9]:
joblib.dump(best_lr, '../static/model/cancer_lr.pkl')

['../static/model/cancer_lr.pkl']

### 2. Decision Tree

In [10]:
dt_clf = DecisionTreeClassifier()
dt_clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [15]:
params = {
    'max_depth': [1, 3, 6, 8, 10],
    'min_samples_split': [1, 3, 6]
}

In [16]:
grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_) # best_params_: 좋은 파라미터를 보여줌.

최고 평균 정확도: 0.9389
최적 파라미터: {'max_depth': 8, 'min_samples_split': 3}


In [17]:
best_dt = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_dt.predict(X_test)
acc_dt = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_dt:.4f}')

평균 정확도 : 0.9953


In [18]:
joblib.dump(best_dt, '../static/model/cancer_dt.pkl')

['../static/model/cancer_dt.pkl']

### 3. SVM

In [19]:
sv_clf = SVC()
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [20]:
params = {
    'C': [1.0, 1, 5, 7, 8, 10]
    # 'C': [0.1, 1, 5, 10]
}

In [21]:
grid_cv = GridSearchCV(sv_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_) # best_params_: 좋은 파라미터를 보여줌.

최고 평균 정확도: 0.9789
최적 파라미터: {'C': 7}


In [22]:
best_sv = grid_cv.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred = best_sv.predict(X_test)
acc_sv = accuracy_score(y_test, pred)
print(f'평균 정확도 : {acc_sv:.4f}')

평균 정확도 : 0.9883


In [23]:
joblib.dump(best_sv, '../static/model/cancer_sv.pkl')

['../static/model/cancer_sv.pkl']

### Test

In [34]:
index = 10
df = pd.read_csv('../static/data/cancer_test.csv')
scaled_test = scaler.fit_transform(df.iloc[:, :-1])
scaled_test.shape

(426, 30)

In [35]:
test_data = scaled_test[index, :].reshape(1,-1)
test_data

array([[0.26072704, 0.21245675, 0.24736217, 0.13887593, 0.35289338,
        0.15002147, 0.07900656, 0.11292247, 0.29524826, 0.19334457,
        0.05707247, 0.14692362, 0.05493769, 0.02405313, 0.10677499,
        0.11970138, 0.06010533, 0.19474328, 0.08411662, 0.07977702,
        0.21985059, 0.28933873, 0.19572688, 0.1046746 , 0.31535692,
        0.15233189, 0.13602236, 0.2975945 , 0.20756949, 0.138069  ]])

In [36]:
label = df.iloc[index, -1]
label

1

In [37]:
pred_lr = best_lr.predict(test_data)
pred_sv = best_sv.predict(test_data)
pred_dt = best_dt.predict(test_data)

In [38]:
label, pred_lr[0], pred_sv[0], pred_dt[0]

(1, 1, 1, 1)

In [43]:
option_dict = {'label':label, 'pred_lr':pred_lr[0], 'area_ratio': pred_sv[0], 'per_person':pred_dt[0]}

In [44]:
option_dict

{'label': 1, 'pred_lr': 1, 'area_ratio': 1, 'per_person': 1}