In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, ShuffleSplit, LeaveOneOut
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = datasets.load_wine()

In [3]:
print(data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

In [4]:
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [5]:
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [6]:
y.head()

0    0
1    0
2    0
3    0
4    0
dtype: int32

### KFold

In [7]:
kfold = KFold(n_splits=5)
kfold

KFold(n_splits=5, random_state=None, shuffle=False)

In [8]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()

In [9]:
def get_score(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [10]:
dtc_score = []
rfc_score = []
for train_index, test_index in kfold.split(data.data):
    x_train, x_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y.loc[train_index], y.loc[test_index]
    rfc_score.append(get_score(rfc, x_train, x_test, y_train, y_test))
    dtc_score.append(get_score(dtc, x_train, x_test, y_train, y_test))

In [11]:
rfc_score

[0.9166666666666666,
 0.8611111111111112,
 0.9444444444444444,
 1.0,
 0.9714285714285714]

In [12]:
dtc_score

[0.9166666666666666,
 0.8055555555555556,
 0.8055555555555556,
 0.7142857142857143,
 0.8]

### Cross Validation


In [13]:
rfc_score = cross_val_score(rfc, X, y, scoring='accuracy', cv=kfold)
rfc_score

array([0.94444444, 0.97222222, 0.94444444, 1.        , 0.97142857])

In [14]:
rfc_score.mean()

0.9665079365079364

### Other techniques

In [15]:
skfold = StratifiedKFold(n_splits=5)
shuffle = ShuffleSplit(n_splits=5)
leave = LeaveOneOut()

In [16]:
def get_score_using_cross_val(model, X, y, fold):
    return cross_val_score(model, X, y, scoring='accuracy', cv=fold)

In [17]:
get_score_using_cross_val(rfc, X, y, kfold)

array([0.91666667, 0.91666667, 0.94444444, 1.        , 1.        ])

In [18]:
get_score_using_cross_val(rfc, X, y, skfold)

array([0.97222222, 0.94444444, 1.        , 0.97142857, 0.97142857])

In [19]:
get_score_using_cross_val(rfc, X, y, shuffle)

array([0.88888889, 1.        , 0.94444444, 1.        , 1.        ])

In [20]:
get_score_using_cross_val(rfc, X, y, leave).mean()

0.9831460674157303