# 유방암 모델 만들기

In [1]:
import pandas as pd 
import joblib

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### 유방암 데이터

In [3]:
# Load the training split CSV and preview its first five rows.
train_csv = '../static/data/cancer_train.csv'
df_train = pd.read_csv(train_csv)
df_train.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,25.73,17.46,174.2,2010.0,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,...,23.58,229.3,3234.0,0.153,0.5937,0.6451,0.2756,0.369,0.08815,0
1,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,...,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192,1
2,17.35,23.06,111.0,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,...,31.47,128.2,1218.0,0.124,0.1486,0.1211,0.08235,0.2452,0.06515,0
3,11.61,16.02,75.46,408.2,0.1088,0.1168,0.07097,0.04497,0.1886,0.0632,...,19.67,81.93,475.7,0.1415,0.217,0.2302,0.1105,0.2787,0.07427,1
4,11.87,21.54,76.83,432.0,0.06613,0.1064,0.08777,0.02386,0.1349,0.06612,...,28.18,83.51,507.2,0.09457,0.3399,0.3218,0.0875,0.2305,0.09952,1


In [4]:
# Features are every column except the last; the last column is the label.
# Both are pulled out as NumPy arrays.
X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1].values
X_train.shape, y_train.shape

((426, 30), (426,))

In [5]:
# Load the test split CSV and preview its first three rows.
test_csv = '../static/data/cancer_test.csv'
df_test = pd.read_csv(test_csv)
df_test.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,20.26,23.03,132.4,1264.0,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,...,31.59,156.1,1750.0,0.119,0.3539,0.4098,0.1573,0.3689,0.08368,0
1,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,0
2,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,...,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759,1


In [6]:
# Same split as for the training frame: all-but-last columns are features,
# the `target` column is the label.
X_test, y_test = df_test.iloc[:, :-1].values, df_test['target'].values
X_test.shape, y_test.shape

((143, 30), (143,))

- 모든 데이터로 스케일링하는 경우(X): 테스트 데이터의 정보가 스케일러에 반영되므로 권장하지 않음

In [7]:
# Counter-example (the heading marks this approach "X"): fitting the scaler on
# train + test combined lets the test set's min/max leak into preprocessing.
# `scaler` is re-created from the training data alone in the following cell.
df = pd.concat([df_train, df_test], ignore_index=True)
all_features = df.iloc[:, :-1]
scaler = MinMaxScaler()
scaler.fit(all_features)

MinMaxScaler()

- 훈련 데이터로 스케일링하는 경우(O)

In [8]:
# Fit the scaler on the training features only, so no test-set statistics
# leak into preprocessing. Fit on the NumPy array `X_train` (rather than the
# DataFrame slice) because every later transform() call receives a NumPy
# array; fitting on a DataFrame makes scikit-learn >= 1.0 warn about missing
# feature names at transform time.
scaler = MinMaxScaler()
scaler.fit(X_train)

MinMaxScaler()

In [9]:
# Apply the already-fitted scaler to both splits (no refitting on test data).
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)
X_train_scaled.shape, X_test_scaled.shape

((426, 30), (143, 30))

In [10]:
# Persist the fitted scaler so the same transformation can be reloaded later.
joblib.dump(value=scaler, filename='../static/model/cancer_scaler.pkl')

['../static/model/cancer_scaler.pkl']

### 1. Logistic Regression

In [35]:
# Logistic regression with default hyper-parameters; list them to see what
# can be tuned.
lr_clf = LogisticRegression()
lr_clf.get_params(deep=True)

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Candidate values for C, the only logistic-regression parameter searched.
params = dict(C=[3, 4, 5, 6, 7])

In [37]:
# Exhaustive search over `params` with 5-fold cross-validation, scored by
# accuracy; then report the best mean CV accuracy and the winning setting.
grid_cv = GridSearchCV(
    estimator=lr_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9741
최적 파라미터: {'C': 5}


In [38]:
# Take the estimator GridSearchCV refit with the best parameters and measure
# its accuracy on the scaled test split.
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

0.986013986013986

In [39]:
# Persist the tuned logistic-regression model.
joblib.dump(value=best_lr, filename='../static/model/cancer_lr.pkl')

['../static/model/cancer_lr.pkl']

### 2. SVM

In [43]:
# Support-vector classifier with default hyper-parameters; list them to see
# what can be tuned.
sv_clf = SVC()
sv_clf.get_params(deep=True)

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [46]:
# Candidate values for C, the only SVC parameter searched.
params = dict(C=[3, 4, 5, 6])

In [49]:
# Exhaustive search over `params` with 5-fold cross-validation, scored by
# accuracy; then report the best mean CV accuracy and the winning setting.
grid_cv = GridSearchCV(
    estimator=sv_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9765
최적 파라미터: {'C': 4}


In [50]:
# Take the estimator GridSearchCV refit with the best parameters and measure
# its accuracy on the scaled test split.
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

0.986013986013986

In [51]:
# Persist the tuned SVM model.
joblib.dump(value=best_sv, filename='../static/model/cancer_sv.pkl')

['../static/model/cancer_sv.pkl']

### 3. Random Forest

In [55]:
# Random forest with default hyper-parameters; list them to see what can be
# tuned.
# NOTE(review): no random_state is passed, so the fitted forest (and the
# grid-search result that follows) may differ from run to run.
rf_clf = RandomForestClassifier()
rf_clf.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [56]:
# Search grid for the random forest: tree depth limit and the minimum number
# of samples required to split a node (4 x 3 = 12 combinations).
params = dict(
    max_depth=[4, 6, 8, 10],
    min_samples_split=[2, 3, 4],
)

In [57]:
# Exhaustive search over `params` with 5-fold cross-validation, scored by
# accuracy; then report the best mean CV accuracy and the winning setting.
grid_cv = GridSearchCV(
    estimator=rf_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9601
최적 파라미터: {'max_depth': 10, 'min_samples_split': 2}


In [58]:
# Take the estimator GridSearchCV refit with the best parameters and measure
# its accuracy on the scaled test split.
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

0.965034965034965

In [59]:
# Persist the tuned random-forest model.
joblib.dump(value=best_rf, filename='../static/model/cancer_rf.pkl')

['../static/model/cancer_rf.pkl']

### TEST
- 세 모델(Logistic Regression, SVM, Random Forest) 모두 스케일된 데이터로 학습했으므로 예측도 스케일된 데이터로 수행
- 참고: Decision Tree 계열 모델은 스케일링 없이도 학습할 수 있지만, 이 노트북에서는 스케일된 데이터로 학습함

In [60]:
# Pick one test row by position and take its feature values (every column
# except the trailing label) as a 1-D array.
index = 100
test_data = df_test.iloc[index, :-1].values
test_data

array([1.245e+01, 1.570e+01, 8.257e+01, 4.771e+02, 1.278e-01, 1.700e-01,
       1.578e-01, 8.089e-02, 2.087e-01, 7.613e-02, 3.345e-01, 8.902e-01,
       2.217e+00, 2.719e+01, 7.510e-03, 3.345e-02, 3.672e-02, 1.137e-02,
       2.165e-02, 5.082e-03, 1.547e+01, 2.375e+01, 1.034e+02, 7.416e+02,
       1.791e-01, 5.249e-01, 5.355e-01, 1.741e-01, 3.985e-01, 1.244e-01])

In [61]:
# scikit-learn estimators expect a 2-D (n_samples, n_features) input, so turn
# the single sample into one row.
test_data = test_data.reshape((1, -1))
test_data

array([[1.245e+01, 1.570e+01, 8.257e+01, 4.771e+02, 1.278e-01, 1.700e-01,
        1.578e-01, 8.089e-02, 2.087e-01, 7.613e-02, 3.345e-01, 8.902e-01,
        2.217e+00, 2.719e+01, 7.510e-03, 3.345e-02, 3.672e-02, 1.137e-02,
        2.165e-02, 5.082e-03, 1.547e+01, 2.375e+01, 1.034e+02, 7.416e+02,
        1.791e-01, 5.249e-01, 5.355e-01, 1.741e-01, 3.985e-01, 1.244e-01]])

In [62]:
# Scale the single sample with the scaler that was fitted on the training
# data, then display the result.
test_scaled = scaler.transform(test_data)
test_scaled

array([[0.21270927, 0.27977581, 0.2166679 , 0.11903621, 0.93383326,
        0.55858842, 0.38412853, 0.50714734, 0.63473424, 0.60603933,
        0.07978685, 0.13864053, 0.06496468, 0.03832722, 0.24691428,
        0.2896095 , 0.09272727, 0.2153817 , 0.21818539, 0.14465956,
        0.27691393, 0.33380763, 0.29710836, 0.15711645, 0.68594325,
        0.67171859, 0.65177702, 0.59972442, 0.57485741, 0.74076058]])

In [63]:
# Ground-truth label of the selected test row (the `target` column is the
# last column of the frame).
label = df_test['target'].iloc[index]
label

0

In [64]:
# All three estimators were fitted on MinMax-scaled features
# (X_train_scaled), so the sample must be scaled the same way before
# predicting. Feeding the raw `test_data` would evaluate the models far
# outside the [0, 1] range they were trained on and can give wrong labels.
pred_lr = best_lr.predict(test_scaled)
pred_sv = best_sv.predict(test_scaled)
pred_rf = best_rf.predict(test_scaled)

In [65]:
# Show the true label next to each model's prediction for the sample
# (order: label, logistic regression, SVM, random forest).
label, pred_lr[0], pred_sv[0], pred_rf[0]

(0, 0, 0, 0)

In [66]:
# Preview the first five rows of the test frame.
df_test.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,20.26,23.03,132.4,1264.0,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,...,31.59,156.1,1750.0,0.119,0.3539,0.4098,0.1573,0.3689,0.08368,0
1,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,0
2,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,...,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759,1
3,16.26,21.88,107.5,826.8,0.1165,0.1283,0.1799,0.07981,0.1869,0.06532,...,25.21,113.7,975.2,0.1426,0.2116,0.3344,0.1047,0.2736,0.07953,0
4,13.94,13.17,90.31,594.2,0.1248,0.09755,0.101,0.06615,0.1976,0.06457,...,15.38,94.52,653.3,0.1394,0.1364,0.1559,0.1015,0.216,0.07253,1


In [67]:
# Map each feature name to the selected test row's value for that feature
# (the trailing label column is excluded from both sides).
a = {
    name: value
    for name, value in zip(df_test.columns[:-1], df_test.iloc[index, :-1])
}

In [68]:
# Display the feature-name -> value mapping for the selected sample.
a

{'mean radius': 12.45,
 'mean texture': 15.7,
 'mean perimeter': 82.57,
 'mean area': 477.1,
 'mean smoothness': 0.1278,
 'mean compactness': 0.17,
 'mean concavity': 0.1578,
 'mean concave points': 0.08089,
 'mean symmetry': 0.2087,
 'mean fractal dimension': 0.07612999999999999,
 'radius error': 0.3345,
 'texture error': 0.8902,
 'perimeter error': 2.217,
 'area error': 27.19,
 'smoothness error': 0.007509999999999999,
 'compactness error': 0.03345,
 'concavity error': 0.036719999999999996,
 'concave points error': 0.01137,
 'symmetry error': 0.02165,
 'fractal dimension error': 0.005082,
 'worst radius': 15.47,
 'worst texture': 23.75,
 'worst perimeter': 103.4,
 'worst area': 741.6,
 'worst smoothness': 0.1791,
 'worst compactness': 0.5249,
 'worst concavity': 0.5355,
 'worst concave points': 0.1741,
 'worst symmetry': 0.3985,
 'worst fractal dimension': 0.1244}