In [51]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import export_text

In [25]:
# Load the mall customers dataset from the notebook's working directory
# and display it as the cell output.
df = pd.read_csv('Mall_Customers.csv')
df

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [26]:
# Encode Genre as 1 for "Male" and 0 for every other value.
# The dtype guard makes the cell safe to re-run: without it, a second run
# would compare the already-encoded integers with "Male" and silently
# overwrite the whole column with zeros.
if not pd.api.types.is_numeric_dtype(df['Genre']):
    df['Genre'] = (df['Genre'] == "Male").astype(int)

In [27]:
# Show the frame again to check that Genre is now encoded as 0/1.
df

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,1,19,15,39
1,2,1,21,15,81
2,3,0,20,16,6
3,4,0,23,16,77
4,5,0,31,17,40
...,...,...,...,...,...
195,196,0,35,120,79
196,197,0,45,126,28
197,198,1,32,126,74
198,199,1,32,137,18


In [28]:
# Normalise the column labels: lower-case them and strip the spaces.
df.columns = df.columns.map(lambda label: label.lower().replace(' ', ''))

In [29]:
# Every column is a model input except the row identifier and the target.
feature_names = df.columns.tolist()
for excluded_column in ('customerid', 'spendingscore(1-100)'):
    feature_names.remove(excluded_column)
feature_names

['genre', 'age', 'annualincome(k$)']

In [30]:
# Show the frame again to check the normalised column names.
df

Unnamed: 0,customerid,genre,age,annualincome(k$),spendingscore(1-100)
0,1,1,19,15,39
1,2,1,21,15,81
2,3,0,20,16,6
3,4,0,23,16,77
4,5,0,31,17,40
...,...,...,...,...,...
195,196,0,35,120,79
196,197,0,45,126,28
197,198,1,32,126,74
198,199,1,32,137,18


In [31]:
# Split the frame into the model inputs and the value to predict.
target = df.loc[:, 'spendingscore(1-100)']
features = df.loc[:, feature_names]

In [32]:
def calc_squared_error(values):
    """Return the mean squared deviation of ``values`` from their own mean.

    The sum of squared deviations is divided by ``len(values)`` (not
    ``len(values) - 1``), i.e. this is the population variance.

    Parameters
    ----------
    values : array-like of numbers
        A pandas Series, NumPy array, or plain sequence.

    Returns
    -------
    float
        The mean squared deviation, or ``0.0`` for an empty input.
    """
    samples = np.asarray(values, dtype=float)
    if samples.size == 0:
        # An empty group has no spread; returning 0.0 avoids a division by zero.
        return 0.0
    deviations = samples - samples.mean()
    # Vectorised instead of a Python-level loop over the elements.
    return float(np.mean(deviations ** 2))

In [33]:
def calc_optoins(X, y):
    """Find the single-feature threshold split that most reduces squared error.

    For every column of ``X``, the candidate thresholds are the midpoints
    between consecutive distinct values. A candidate's gain is the parent
    impurity (``calc_squared_error(y)``) minus the impurity of the
    ``<= threshold`` and ``> threshold`` groups, each weighted by its share
    of the samples.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; values must support ``+`` and ``/`` so that
        midpoints can be computed.
    y : pandas.Series
        Target values, indexable by boolean masks built from ``X``.

    Returns
    -------
    list
        ``[feature_name, split_value]`` of the best candidate, or
        ``[None, None]`` when there is no candidate at all (empty input or
        no column with at least two distinct values).
    """
    n_samples = len(y)
    if n_samples == 0:
        return [None, None]

    # The parent impurity does not depend on the feature, so compute it once.
    parent_error = calc_squared_error(y)

    # Start below any attainable gain: with a starting value of 0, float
    # rounding that pushes every gain marginally below zero would make the
    # function report "no split" even though candidates exist.
    best_gain = float('-inf')
    best_feature_name = None
    best_split_value = None

    for feature_name in X.columns:
        column = X[feature_name]
        distinct_values = sorted(set(column))
        for lower, upper in zip(distinct_values, distinct_values[1:]):
            split_value = (lower + upper) / 2

            gain = parent_error
            for group in (y[column <= split_value], y[column > split_value]):
                gain -= calc_squared_error(group) * (len(group) / n_samples)

            # "<=" keeps the original tie-breaking: on equal gain the later
            # candidate wins.
            if best_gain <= gain:
                best_gain = gain
                best_feature_name = feature_name
                best_split_value = split_value
    return [best_feature_name, best_split_value]
            

In [47]:
def fit_regression_tree(features, values, max_tree_depth, min_sampses_split, depth=0):
    """Recursively build a regression tree, printing it as indented text.

    Each internal node prints its ``<=`` condition, then the left subtree,
    then its ``>`` condition, then the right subtree. Each leaf prints the
    mean of its target values rounded to two decimals.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows of the current node.
    values : pandas.Series
        Target values aligned with ``features``.
    max_tree_depth : int or None
        Depth at which nodes become leaves; ``None`` means no depth limit.
    min_sampses_split : int or None
        Minimum number of samples a node must hold to be split; ``None``
        means 2. A node is never split with fewer than 2 samples.
    depth : int, default 0
        Depth of the current node; callers normally leave the default.

    Returns
    -------
    int
        The depth of the deepest leaf in the subtree rooted at this node.
    """
    if min_sampses_split is None:
        min_sampses_split = 2

    indent = "|    " * depth

    def finish_as_leaf():
        # Print the leaf prediction and report the depth it sits at.
        print(f'{indent}|---values:[{round(values.mean(),2)}]')
        return depth

    depth_reached = max_tree_depth is not None and depth >= max_tree_depth
    # A node holding exactly ``min_sampses_split`` samples may still be split;
    # fewer than 2 samples can never be split.
    too_few_samples = len(values) < max(min_sampses_split, 2)
    if depth_reached or too_few_samples:
        return finish_as_leaf()

    # No split is possible only when all rows are identical. Counting distinct
    # rows avoids stopping on data such as A, A, B, B, where every row has a
    # twin but A can still be separated from B.
    if (~features.duplicated()).sum() < 2:
        return finish_as_leaf()

    feature_name, split_value = calc_optoins(features, values)
    if feature_name is None:
        # The split search found no candidate threshold.
        return finish_as_leaf()

    left_mask = features[feature_name] <= split_value
    right_mask = features[feature_name] > split_value

    print(f'{indent}|---{feature_name}<={split_value}')
    left_depth = fit_regression_tree(
        features[left_mask], values[left_mask],
        max_tree_depth, min_sampses_split, depth + 1,
    )

    print(f'{indent}|---{feature_name}>{split_value}')
    right_depth = fit_regression_tree(
        features[right_mask], values[right_mask],
        max_tree_depth, min_sampses_split, depth + 1,
    )
    return max(left_depth, right_depth)




In [49]:
# Build and print the hand-written tree, limited to two levels of splits.
tree_depth = fit_regression_tree(
    features,
    target,
    max_tree_depth=2,
    min_sampses_split=None,
)

|---age<=39.5
|    |---age<=20.5
|    |    |---values:[44.65]
|    |---age>20.5
|    |    |---values:[62.53]
|---age>39.5
|    |---annualincome(k$)<=72.0
|    |    |---values:[41.77]
|    |---annualincome(k$)>72.0
|    |    |---values:[19.79]


In [52]:
# Reference model: scikit-learn's regressor with the same depth limit.
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be chosen differently
# between runs and the printed tree would not be reproducible.
model = tree.DecisionTreeRegressor(max_depth=2, random_state=0)
model.fit(features, target)
print(export_text(model, feature_names=feature_names))

|--- age <= 39.50
|   |--- age <= 20.50
|   |   |--- value: [44.65]
|   |--- age >  20.50
|   |   |--- value: [62.53]
|--- age >  39.50
|   |--- annualincome(k$) <= 72.00
|   |   |--- value: [41.77]
|   |--- annualincome(k$) >  72.00
|   |   |--- value: [19.79]



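Построенные деревья совпадают: собственная реализация и `DecisionTreeRegressor` из sklearn выбрали одинаковые признаки и пороги разбиения и дали одинаковые значения в листьях.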