In [65]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
import pandas as pd

### 손글씨 분류하기

In [3]:
digits = load_digits()
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [4]:
digits_data = digits.data
digits_label = digits.target
print(digits_data.shape)
print(digits_label.shape)

(1797, 64)
(1797,)


In [5]:
print(digits.target_names)

[0 1 2 3 4 5 6 7 8 9]


In [6]:
digits_df = pd.DataFrame(data=digits_data, columns=digits.feature_names)
digits_df.describe()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
count,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,...,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0
mean,0.0,0.30384,5.204786,11.835838,11.84808,5.781859,1.36227,0.129661,0.005565,1.993879,...,3.725097,0.206455,0.000556,0.279354,5.557596,12.089037,11.809126,6.764051,2.067891,0.364496
std,0.0,0.907192,4.754826,4.248842,4.287388,5.666418,3.325775,1.037383,0.094222,3.19616,...,4.919406,0.984401,0.02359,0.934302,5.103019,4.374694,4.933947,5.900623,4.090548,1.860122
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,7.0,0.0,0.0,0.0,10.0,16.0,16.0,12.0,2.0,0.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,16.0,...,16.0,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0


In [7]:
train_input, test_input, train_target, test_target = train_test_split(digits.data, digits.target, test_size=0.2)
print("train_input 갯수 :", len(train_input), "test_input 갯수 :", len(test_input))

train_input 갯수 : 1437 test_input 갯수 : 360


##### 손 글씨 모델에서는 0의 존재 때문에 정확도는 믿을수 없다. 해서, 정밀도(Precision)와 재현율(Recall)의 종합점수인 f1-score의 평균으로 평가함

In [8]:
#의사결정 나무
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_input, train_target)
test_pred = decision_tree.predict(test_input)

f1 = f1_score(test_target, test_pred, average='macro')
print("F1 점수:", f1)

F1 점수: 0.8398389447183868


In [9]:
#RandomForest
random_forest = RandomForestClassifier()
random_forest.fit(train_input, train_target)
test_pred2 = random_forest.predict(test_input)

f1 = f1_score(test_target, test_pred2, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9766621078907859


In [10]:
#svm
svm_model = svm.SVC()
svm_model.fit(train_input, train_target)
test_pred3 = svm_model.predict(test_input)

f1 = f1_score(test_target, test_pred3, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9858301670032329


In [11]:
#SGDClassfier
sgd_model = SGDClassifier()
sgd_model.fit(train_input, train_target)
test_pred4 = sgd_model.predict(test_input)

f1 = f1_score(test_target, test_pred4, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9425766834763006


In [14]:
# Logistic Regression
logistic_model = LogisticRegression(max_iter=3000) # 반복 횟수 증가 및 제한
logistic_model.fit(train_input, train_target)
test_pred5 = logistic_model.predict(test_input)

f1 = f1_score(test_target, test_pred5, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9594234990769529


### 와인 분류하기

In [15]:
from sklearn.datasets import load_wine

wine = load_wine()
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [16]:
wine_data = wine.data
wine_label = wine.target
print(wine_data.shape)
print(wine_label.shape)

(178, 13)
(178,)


In [17]:
print(wine.target_names)

['class_0' 'class_1' 'class_2']


In [18]:
digits_df = pd.DataFrame(data=wine_data, columns=wine.feature_names)
digits_df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [21]:
train_input, test_input, train_target, test_target = train_test_split(wine.data, wine.target, test_size=0.15) # 데이터 갯수에 따른 비율 조정
print("train_input 갯수 :", len(train_input), "test_input 갯수 :", len(test_input))

train_input 갯수 : 151 test_input 갯수 : 27


##### 와인분류 문제의 경우, 지정된 3가지의 와인종류 중 하나를 맞추는 문제이므로 단순하게 정확도를 기준으로 모델을 평가하였음

In [24]:
# 의사 결정 나무
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_input, train_target)
test_pred = decision_tree.predict(test_input)

accuracy = accuracy_score(test_target, test_pred)
print("정확도 점수:", accuracy)

정확도 점수: 0.8888888888888888


In [25]:
# RandomForest
random_forest = RandomForestClassifier()
random_forest.fit(train_input, train_target)
test_pred2 = random_forest.predict(test_input)

accuracy = accuracy_score(test_target, test_pred2)
print("정확도 점수:", accuracy)

정확도 점수: 0.9629629629629629


In [55]:
# svm
svm_model = svm.SVC()
svm_model.fit(train_input, train_target)
test_pred3 = svm_model.predict(test_input)

accuracy = accuracy_score(test_target, test_pred3)
print("정확도 점수:", accuracy)

정확도 점수: 0.7777777777777778


In [49]:
# SGDClassifier
sgd_model = SGDClassifier()
sgd_model.fit(train_input, train_target)
test_pred4 = sgd_model.predict(test_input)

accuracy = accuracy_score(test_target, test_pred4)
print("정확도 점수:", accuracy)

정확도 점수: 0.7037037037037037


In [58]:
# Logistic Regression
logistic_model = LogisticRegression(max_iter=3000)
logistic_model.fit(train_input, train_target)
test_pred5 = logistic_model.predict(test_input)

accuracy = accuracy_score(test_target, test_pred5)
print("정확도 점수:", accuracy)

정확도 점수: 0.9629629629629629


### 유방암 여부 진단하기

In [59]:
from sklearn.datasets import load_breast_cancer

breast = load_breast_cancer()
print(dir(breast))

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


In [60]:
breast_data = breast.data
breast_label = breast.target
print(breast_data.shape)
print(breast_label.shape)

(569, 30)
(569,)


In [61]:
print(breast.target_names) # mailgnant = 악성, beningn = 양호

['malignant' 'benign']


In [62]:
digits_df = pd.DataFrame(data=breast_data, columns=breast.feature_names)
digits_df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [63]:
train_input, test_input, train_target, test_target = train_test_split(breast.data, breast.target, test_size=0.15)
print("train_input 갯수 :", len(train_input), "test_input 갯수 :", len(test_input))

train_input 갯수 : 483 test_input 갯수 : 86


##### 유방암 여부 문제에 관해서는, 실제 양성인경우를 음성으로 구분 할시 매우 큰 위험을 초래하기 때문에 거짓음성(FN)높으면 안되므로 실제 양성인 경우에서 모델이 양성으로 정확하게 찾아낸 비율을 나타내는 재현율(Recall)을 기준으로 모델을 평가함

In [74]:
# 의사 결정 나무
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_input, train_target)
test_pred = decision_tree.predict(test_input)

recall = recall_score(test_target, test_pred)
print("재현율 점수:", recall)

재현율 점수: 0.9636363636363636


In [96]:
#RandomForest
random_forest = RandomForestClassifier()
random_forest.fit(train_input, train_target)
test_pred2 = random_forest.predict(test_input)

recall = recall_score(test_target, test_pred2)
print("재현율 점수:", recall)

재현율 점수: 0.9818181818181818


In [95]:
# svm
svm_model = svm.SVC()
svm_model.fit(train_input, train_target)
test_pred3 = svm_model.predict(test_input)

recall = recall_score(test_target, test_pred3)
print("재현율 점수:", recall)

재현율 점수: 0.9818181818181818


In [91]:
# SGDClassfier
sgd_model = SGDClassifier()
sgd_model.fit(train_input, train_target)
test_pred4 = sgd_model.predict(test_input)

recall = recall_score(test_target, test_pred4)
print("재현율 점수:", recall)

재현율 점수: 0.9090909090909091


In [93]:
# Logistic Regression
logistic_model = LogisticRegression(max_iter=3000) # 반복 횟수 증가 및 제한
logistic_model.fit(train_input, train_target)
test_pred5 = logistic_model.predict(test_input)

recall = recall_score(test_target, test_pred5)
print("재현율 점수:", recall)

재현율 점수: 0.9818181818181818
