In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.datasets import load_iris



### Assignment 1

Train a decision tree classifier on the standard Fisher Iris dataset to predict a flower's variety from its 4 measured parameters.

1) The training sample should include 60% of all objects randomly selected from the dataset

2) The maximum depth of the constructed tree should not exceed 3

3) Use the Gini criterion as the branching criterion

4) Random state and random seed set to 42

As an answer to the problem, indicate the classification quality score on the test sample using accuracy_score

In [2]:
# Assignment 1: fit a depth-limited Gini decision tree on a 60% training split
# of the Iris data and measure its accuracy on the remaining 40%.
X, y = load_iris(return_X_y=True)

# Seed the split explicitly. Without `random_state` the split draws from
# NumPy's global random stream, so re-running this cell (or adding any random
# call before it) would silently produce a different split and accuracy.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, shuffle=True, random_state=42
)
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)

clf.fit(X_train, y_train)
preds = clf.predict(x_test)

# For scikit-learn classifiers, `score` reports the mean accuracy on the
# given samples, i.e. the same quantity as accuracy_score(y_test, preds).
acc = clf.score(x_test, y_test)

In [3]:
# Report the test-set accuracy rounded to two decimal places.
acc_rounded = round(acc, 2)
print('Accuracy score is ', acc_rounded)

Accuracy score is  0.98


### Assignment 2

Train a standard decision tree regressor to predict the price of a diamond from a set of its features. Preprocess the data first: encode the categorical features with the `sklearn.preprocessing.LabelEncoder` class (see its documentation for details).

Choose the best combination of hyperparameters from the following:

    Branching criterion: squared_error, tree depth: 12
    Branching criterion: friedman_mse, tree depth: 16
    Branching criterion: poisson, tree depth: 22
    Branching criterion: squared_error, tree depth: 45
    Branching criterion: friedman_mse, tree depth: 95
    Branching criterion: poisson, tree depth: 33

   The best combination is the one with the highest mean R² score in cross-validation with cv=10. Set the random state and random seed to 42.

In [4]:
# Read the training table and discard its first column before any further
# processing; the last line previews the first two remaining rows.
raw = pd.read_csv('TRAIN.csv')
data = raw.drop(columns=raw.columns[[0]])
data.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


#### Preprocessing

Here we encode the categorical features (`color`, `cut`, `clarity`) as integer labels.

In [5]:
# Encode each categorical column as integer labels. A separate LabelEncoder is
# fitted and kept per column: refitting one shared encoder would overwrite the
# earlier mappings, making it impossible to translate `color`/`cut` codes back
# to their original labels (via `inverse_transform` / `classes_`).
#
# NOTE: re-running this cell refits the encoders on the already-encoded
# integers; re-run the data-loading cell first if the original label mapping
# is needed.
categorical_cols = ['color', 'cut', 'clarity']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].values)
    encoders[col] = le  # after the loop, `le` is the encoder of the last column

data.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31


In [6]:
# Separate the feature matrix (every column except `price`) from the target
# vector (`price`), then shuffle both with the same fixed seed so rows and
# targets stay aligned.
features = data.drop(columns=['price'])
target = data['price']
X, y = shuffle(features.values, target.values, random_state=42)

Model training and comparison of mean R² scores for the given sets of hyperparameters:

In [7]:
# Candidate hyperparameter combinations, stored as max_depth -> split criterion.
hyper_params = {
    12: 'squared_error',
    16: 'friedman_mse',
    22: 'poisson',
    45: 'squared_error',
    95: 'friedman_mse',
    33: 'poisson',
}

# Mean 10-fold cross-validated R^2 per combination, keyed by a readable label.
r2_score = {}

for max_depth, split_criterion in hyper_params.items():
    label = f'{split_criterion}, max_depth = {max_depth}'

    tree = DecisionTreeRegressor(
        criterion=split_criterion,
        max_depth=max_depth,
        random_state=42,
    )
    cv_results = cross_validate(tree, X, y, cv=10, scoring='r2')
    fold_scores = cv_results['test_score']
    r2_score[label] = fold_scores.mean().round(5)

In [8]:
# Display the mean cross-validated R^2 for every hyperparameter combination.
r2_score

{'squared_error, max_depth = 12': 0.97438,
 'friedman_mse, max_depth = 16': 0.96981,
 'poisson, max_depth = 22': 0.96497,
 'squared_error, max_depth = 45': 0.96573,
 'friedman_mse, max_depth = 95': 0.96573,
 'poisson, max_depth = 33': 0.9645}