# 프로젝트 설명

본 프로젝트는 python의 sci-kit learn 라이브러리에서 제공하는 세 가지의 toy dataset 구조를 살펴보고, 다섯 가지 분류 모델에 적용하여 어떤 모델이 좋은 성능을 보이는지 확인하고자 한다.
세 가지의 데이터는 손글씨, 와인, 유방암 데이터를 사용한다.

예측 결과를 어떻게 해석하고, 모델의 성능을 평가하는 지표로는 어떤 것을 선택할지에 대해 생각해보자.

### 패키지 임포트

In [111]:
# Load and preprocessing
from sklearn.datasets import load_digits, load_wine, load_breast_cancer

from sklearn.model_selection import train_test_split

# Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Score
from sklearn.metrics import precision_score, accuracy_score, recall_score, classification_report

### 데이터 준비

In [83]:
digits = load_digits()
wine = load_wine()
bcan = load_breast_cancer()

### 데이터 살펴보기

In [84]:
def toydata_eda(data):
    
    # EDA
    data_structure = {}
    data_name = data.DESCR.split('\n')[0].split("_")[1]
    
    print(f" Summary of {data_name} dataset", end = '\n\n\n')
    print(f" Keys of {data_name} datasets : {data.keys()}")
    print("-------------------------------------------------------------------------------", end = "\n\n")
    print(f" Shape of {data_name} datasets : {data.data.shape}")
    print("-------------------------------------------------------------------------------", end = "\n\n")
    print(f" Target name of {data_name} datasets : {data.target_names}")
    print("-------------------------------------------------------------------------------", end = "\n\n")
    print(f" Description of {data_name} datasets : \n {data.DESCR}")
    print("-------------------------------------------------------------------------------", end = "\n\n")

In [85]:
toydata_eda(digits)

 Summary of digits dataset


 Keys of digits datasets : dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])
-------------------------------------------------------------------------------

 Shape of digits datasets : (1797, 64)
-------------------------------------------------------------------------------

 Target name of digits datasets : [0 1 2 3 4 5 6 7 8 9]
-------------------------------------------------------------------------------

 Description of digits datasets : 
 .. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 1797
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
ht

In [86]:
toydata_eda(wine)

 Summary of wine dataset


 Keys of wine datasets : dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])
-------------------------------------------------------------------------------

 Shape of wine datasets : (178, 13)
-------------------------------------------------------------------------------

 Target name of wine datasets : ['class_0' 'class_1' 'class_2']
-------------------------------------------------------------------------------

 Description of wine datasets : 
 .. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- 

In [87]:
toydata_eda(bcan)

 Summary of breast dataset


 Keys of breast datasets : dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
-------------------------------------------------------------------------------

 Shape of breast datasets : (569, 30)
-------------------------------------------------------------------------------

 Target name of breast datasets : ['malignant' 'benign']
-------------------------------------------------------------------------------

 Description of breast datasets : 
 .. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
      

#### 다양한 모델로 학습시켜보기

In [112]:
def five_classification_model(data, max_iter = 100, perf = "accuracy"):
    
    # EDA
    simple_eda(data)
    
    feature_data = data.data
    label_data = data.target
    
    # 훈련 데이터, 테스트 데이터 분리
    train_input, test_input, train_target, test_target = train_test_split(feature_data, label_data)
    
    dt = DecisionTreeClassifier()
    rf = RandomForestClassifier()
    svc = SVC
    lg = LogisticRegression()
    sgd = SGDClassifier()
    
    models = [dt, rf, svc, lg, sgd]
    
    for m in models:
        m.fit(train_input, train_target)
        test_pred = m.predict(test_input)
    
    ##### 다양한 모델 넣어서 지정한 성능평가 지표 출력하는 코드 짜기 ###########
    
    if model == SGDClassifier:
        md = model(max_iter = 1000)
    elif model == LogisticRegression:
        md = model(C = 20, max_iter = 1000)
    md.fit(train_input, train_target)
    test_pred = md.predict(test_input)
    precision = precision_score(test_target, test_pred, average = 'weighted')
    #recall = recall_score(test_target, test_pred, average = 'weighted')
    #print(classification_report(test_target, test_pred))
    #print(precision_score(test_target, test_pred, average = 'weighted'))
    print(f'{md}\'s Precision : {precision}')
    print('----------------------------------------------------------------')
    
    return precision

In [94]:
feature_data = digits.data
label_data = digits.target

In [95]:
train_input, test_input, train_target, test_target = train_test_split(feature_data, label_data)

In [109]:
def classification_model(model):
    md = model()
    if model == SGDClassifier:
        md = model(max_iter = 100)
    elif model == LogisticRegression:
        md = model(C = 20, max_iter = 1000)
    md.fit(train_input, train_target)
    test_pred = md.predict(test_input)
    precision = precision_score(test_target, test_pred, average = 'weighted')
    precision = accuracy_score(test_target, test_target)
    #recall = recall_score(test_target, test_pred, average = 'weighted')
    #print(classification_report(test_target, test_pred))
    #print(precision_score(test_target, test_pred, average = 'weighted'))
    print(f'{md}\'s Precision : {precision}')
    print('----------------------------------------------------------------')
    return precision

In [110]:
classification_model(RandomForestClassifier)

NameError: name 'accuracy_score' is not defined

In [104]:
model_list = ['DecisionTreeClassifier', 'RandomForestClassifier', 'SVC', 'LogisticRegression', 'SGDClassifier' ]

In [42]:
result = {}
for ml in model_list:
    temp = classification_model(eval(ml))
    result[ml] = temp

DecisionTreeClassifier()'s Precision : 0.8476832582158264
----------------------------------------------------------------
RandomForestClassifier()'s Precision : 0.9801751950503235
----------------------------------------------------------------
SVC()'s Precision : 0.99180256869773
----------------------------------------------------------------
LogisticRegression(C=20, max_iter=1000)'s Precision : 0.9633435367388856
----------------------------------------------------------------
SGDClassifier(max_iter=100)'s Precision : 0.9551877062911546
----------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
result

{'DecisionTreeClassifier': 0.8476832582158264,
 'RandomForestClassifier': 0.9801751950503235,
 'SVC': 0.99180256869773,
 'LogisticRegression': 0.9633435367388856,
 'SGDClassifier': 0.9551877062911546}

In [94]:
dt = DecisionTreeClassifier()
dt.fit(train_scaled, train_target)
test_pred = dt.predict(test_scaled)
report = pd.DataFrame(classification_report(test_target, test_pred,output_dict=True)).transpose()
report

Unnamed: 0,precision,recall,f1-score,support
0,0.941176,0.941176,0.941176,51.0
1,0.842105,0.820513,0.831169,39.0
2,0.869565,0.8,0.833333,50.0
3,0.866667,0.78,0.821053,50.0
4,0.825,0.825,0.825,40.0
5,0.895833,0.914894,0.905263,47.0
6,0.891892,0.891892,0.891892,37.0
7,0.777778,0.897436,0.833333,39.0
8,0.826923,0.86,0.843137,50.0
9,0.791667,0.808511,0.8,47.0


In [103]:
if 'accuracy' in report.index:
    

1


In [105]:
report.index.str.contains('accuracy')

array([False, False, False, False, False, False, False, False, False,
       False,  True, False, False])

# 회고

ㄴㄷ균균ㅇ륜