In [12]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
dataset = load_wine()
x = dataset.data
y = dataset.target

In [3]:
print(f"Target names: {dataset.target_names}")
print(f"DESCR:\n{dataset.DESCR}")

Target names: ['class_0' 'class_1' 'class_2']
DESCR:
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 16

In [4]:
df = pd.DataFrame(x, columns=dataset.feature_names)
df["y"] = y
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,y
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

### CART Classifier: GridSerachCV

In [6]:
# Hyperparameter
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 4, 8, 10]
}

# 10 Modelle - Jedes Modell wird 10x trainiert (cv=10): 100x

clf = DecisionTreeClassifier()
grid_cv = GridSearchCV(clf, parameters, cv=10, n_jobs=-1)
grid_cv.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 2, 4, 8, 10]})

In [7]:
print(f"Parameters of best parameters: {grid_cv.best_params_}")
print(f"Score of best model: {grid_cv.best_score_}")

Parameters of best parameters: {'criterion': 'gini', 'max_depth': 10}
Score of best model: 0.9371794871794871


#### Cart Classifier Train Best Model

In [8]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=4)
clf.fit(x_train, y_train) # Auf das ganze Trainingsset
score = clf.score(x_test, y_test)

print(f"Accuracy: {score}")

Accuracy: 0.9444444444444444


### RandomForest Classifier: GridSearchCV

In [9]:
# Hyperparameter
parameters = {
    'n_estimators': [10, 20, 40, 80, 160],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 4, 8, 10]
}

# 50 Modelle - Jedes Modell wird 10x trainiert (cv=10): 500 Modelle, die trainiert werden

clf = RandomForestClassifier()
grid_cv = GridSearchCV(clf, parameters, cv=10, n_jobs=-1)
grid_cv.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 2, 4, 8, 10],
                         'n_estimators': [10, 20, 40, 80, 160]})

In [10]:
print(f"Parameters of best parameters: {grid_cv.best_params_}")
print(f"Score of best model: {grid_cv.best_score_}")

Parameters of best parameters: {'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 10}
Score of best model: 0.976923076923077


#### RandomForest Classifier: Train Best Model

In [11]:
clf = RandomForestClassifier(criterion='gini', max_depth=None, n_estimators=20)
clf.fit(x_train, y_train) # Auf das ganze Trainingsset
score = clf.score(x_test, y_test)

print(f"Accuracy: {score}")

Accuracy: 1.0


### GradientBoosting Classifier: Grid Search CV

In [15]:
# Hyperparameter
parameters = {
    'loss': ['deviance', 'exponential'],
    'n_estimators': [10, 20, 40],
    'criterion': ['mse', 'mae'],
    'max_depth': [None, 2, 4, 8]
}

# 48 Modelle - Jedes Modell wird 10x trainiert (cv=10): 480 Modelle, die trainiert werden

clf = GradientBoostingClassifier()
grid_cv = GridSearchCV(clf, parameters, cv=10, n_jobs=-1)
grid_cv.fit(x_train, y_train)

 0.90448718 0.91282051 0.91282051 0.91282051 0.91282051 0.91282051
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.9275641  0.91987179 0.9275641  0.91346154 0.92115385 0.92115385
 0.90384615 0.89679487 0.91217949 0.91987179 0.92051282 0.92051282
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan]


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'loss': ['deviance', 'exponential'],
                         'max_depth': [None, 2, 4, 8],
                         'n_estimators': [10, 20, 40]})

In [16]:
print(f"Parameters of best parameters: {grid_cv.best_params_}")
print(f"Score of best model: {grid_cv.best_score_}")

Parameters of best parameters: {'criterion': 'mse', 'loss': 'deviance', 'max_depth': 2, 'n_estimators': 40}
Score of best model: 0.9435897435897436


#### GradientBoosting Classifier: Train Best Model

In [18]:
clf = GradientBoostingClassifier(criterion='mse', max_depth=2, n_estimators=40, loss='deviance')
clf.fit(x_train, y_train) # Auf das ganze Trainingsset
score = clf.score(x_test, y_test)

print(f"Accuracy: {score}")

Accuracy: 0.9814814814814815
