## Libraries and versions

### Libraries

In [1]:
#Python version of the running kernel, read in-process instead of shelling out to `Python -V`
#(the shell command depends on PATH and on the case of the executable name);
#kept as a one-item list so python_version[0] still yields a line like "Python 3.7.9"
import platform
python_version = [f'Python {platform.python_version()}'] #version 3.7.9
import pandas as pd #version 1.2.3
import numpy as np #version 1.19.2


#for machine learning models
#pre processing
from category_encoders import OneHotEncoder, __version__ as ce_version #version 2.2.2

#split data in train and test
#version
from sklearn import __version__ as sk_version #version 0.24.1

#pre-processing
from sklearn.model_selection import train_test_split

#classificators
from sklearn.tree import DecisionTreeClassifier

#metrics
from sklearn.metrics import accuracy_score

### Versions

In [2]:
#one line per component: the interpreter first, then every library used in this notebook
version_report = [
    f'{python_version[0]}',
    f'Pandas version: {pd.__version__}',
    f'Numpy version: {np.__version__}',
    f'Category Encoders version: {ce_version}',
    f'Sklearn version: {sk_version}',
]
print('\n'.join(version_report))

Python 3.7.9
Pandas version: 1.2.3
Numpy version: 1.19.2
Category Encoders version: 2.2.2
Sklearn version: 0.24.1


## Configurations

In [3]:
import warnings
warnings.filterwarnings("ignore")

## Dataset

In [4]:
#load the training CSV used by every cell below (relative path: resolved against the working directory)
dataset = pd.read_csv(filepath_or_buffer='train_cleaning_model7.csv')

In [5]:
#preview the first five rows to check which columns were loaded
dataset.head(n=5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,age_categories
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D,18-25
1,462643,Female,Yes,38,Yes,Engineer,1.0,Average,3.0,Cat_4,A,36-40
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B,66-70
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B,66-70
4,462669,Female,Yes,40,Yes,Entertainment,1.0,High,6.0,Cat_6,A,36-40


## make model better

OPTIONS

- change categories
- change dependent variables

OBSERVATIONS

- All dataset changes were already made for model6, so further analysis will only be done if model7 needs it

# Dataset transformation

## Encoders

### OneHotEncoder

In [6]:
#OneHotEncoder: categorical predictors are expanded into <column>_<category> indicator columns
categorical_columns = ['Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
encoder = OneHotEncoder(cols=categorical_columns, use_cat_names=True)

#Segmentation letters are mapped to the integer labels used as the model target
map_segmentation = {'A':0, 'B':1, 'C':2, 'D':3}

#columns not used by this model: ID, Gender, Family_Size and age_categories
unused_columns = ['ID','Gender', 'Family_Size', 'age_categories']

#encode, convert the target to numbers, then discard the unused columns
dataset_onehorencoder = (
    encoder.fit_transform(dataset)
    .assign(Segmentation=lambda encoded: encoded['Segmentation'].map(map_segmentation))
    .drop(columns=unused_columns)
)

## Variables

### OneHotEncoder

In [7]:
#target: the numeric Segmentation column; predictors: every remaining column
y_onehotencoder = dataset_onehorencoder['Segmentation']
X_onehotencoder = dataset_onehorencoder.drop(columns='Segmentation')

## Test 7 - Original

In [8]:
#repeated hold-out evaluation of model7: every iteration draws a new stratified 70/30 split,
#fits a fresh decision tree and stores the accuracy measured on the test part.
#The iteration number is used as random_state for both the split and the tree, so
#"Restart & Run All" reproduces the same accuracies instead of new random ones on every run.
accuracy_list = []
iterations = 10
for seed in range(iterations):
    #first - split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X_onehotencoder, y_onehotencoder, train_size=0.7,
                                                        stratify=y_onehotencoder, random_state=seed)

    #second - machine learning apply
    model7 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=30, random_state=seed)
    model7.fit(X_train, y_train)
    y_predict = model7.predict(X_test)

    #third - test accuracy
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

#model7, y_test and y_predict keep the values of the LAST iteration; the cells below rely on them

#summary statistics over all iterations (the list is converted to an array only once)
accuracy_array = np.asarray(accuracy_list)
max_accuracy = accuracy_array.max()
mean_accuracy = accuracy_array.mean()
min_accuracy = accuracy_array.min()
std_accuracy = accuracy_array.std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')
#printing the estimator shows its non-default parameters (no need to call set_params for that)
print(model7)

ACCURACY FOR 10 ITERATIONS
Max = 0.5402
Mean = 0.5229
Min = 0.5089
Std = 0.0109
DecisionTreeClassifier(max_depth=5, min_samples_leaf=30)


In [9]:
#create DataFrame with errors, built from y_test / y_predict left by the last loop iteration
#Error is True where the predicted label differs from the real one;
#Cat_Real / Cat_Predict translate the numeric labels back to the segment letters
segment_letters = {0:'A', 1:'B', 2:'C', 3:'D'}
dict_compare = {'Real':y_test.values, 'Predict':y_predict}
dataset_error = (
    pd.DataFrame(dict_compare, index=y_test.index)
    .assign(
        Error=lambda compared: compared['Real'] != compared['Predict'],
        Cat_Real=lambda compared: compared['Real'].map(segment_letters),
        Cat_Predict=lambda compared: compared['Predict'].map(segment_letters),
    )
)

In [10]:
#share of correct (False) and wrong (True) predictions inside each real segment; every row sums to 1
print('Percent of Error for each Segmentation')
real_segment = dataset_error['Cat_Real']
is_error = dataset_error['Error']
pd.crosstab(real_segment, is_error, normalize='index')

Percent of Error for each Segmentation


Error,False,True
Cat_Real,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.491071,0.508929
B,0.29588,0.70412
C,0.581722,0.418278
D,0.73511,0.26489


In [145]:
#feature importances of the last fitted model7, labelled by predictor and listed from highest to lowest
importance = model7.feature_importances_
importance_table = pd.DataFrame({'Importance':importance}, index=X_onehotencoder.columns)
importance_table.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Age,0.5541
Profession_Artist,0.165142
Spending_Score_Low,0.152281
Profession_Healthcare,0.035741
Graduated_Yes,0.030443
Profession_Entertainment,0.021611
Profession_Marketing,0.019851
Work_Experience,0.006227
Spending_Score_Average,0.005195
Graduated_No,0.005058


# Tests for model 7

## Encoders - exclude variable Var_1 and change params (class_weight, entropy)

### OneHotEncoder

In [27]:
#OneHotEncoder: same encoding as before, but Var_1 is no longer encoded because it is dropped below
categorical_columns = ['Ever_Married', 'Graduated', 'Profession', 'Spending_Score']
encoder = OneHotEncoder(cols=categorical_columns, use_cat_names=True)

#Segmentation letters are mapped to the integer labels used as the model target
map_segmentation = {'A':0, 'B':1, 'C':2, 'D':3}

#columns not used by this model: ID, Gender, Family_Size, age_categories and Var_1
unused_columns = ['ID','Gender', 'Family_Size', 'age_categories', 'Var_1']

#encode, convert the target to numbers, then discard the unused columns
dataset_onehorencoder = (
    encoder.fit_transform(dataset)
    .assign(Segmentation=lambda encoded: encoded['Segmentation'].map(map_segmentation))
    .drop(columns=unused_columns)
)

## Variables

### OneHotEncoder

In [28]:
#target: the numeric Segmentation column; predictors: every remaining column
y_onehotencoder = dataset_onehorencoder['Segmentation']
X_onehotencoder = dataset_onehorencoder.drop(columns='Segmentation')

### model7_1 - exclude Var_1

In [113]:
#relative frequency of each encoded segment, most frequent first
print('Weight for model7_1')
y_onehotencoder.value_counts(normalize=True, sort=True, ascending=False)

Weight for model7_1


3    0.277350
2    0.247229
0    0.243448
1    0.231973
Name: Segmentation, dtype: float64

In [172]:
#repeated hold-out evaluation of model7_1 (entropy criterion + class weights, Var_1 excluded):
#every iteration draws a new stratified 70/30 split, fits a fresh decision tree and stores
#the accuracy measured on the test part.
#The iteration number is used as random_state for both the split and the tree, so
#"Restart & Run All" reproduces the same accuracies instead of new random ones on every run.
accuracy_list = []
iterations = 50
for seed in range(iterations):
    #first - split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X_onehotencoder, y_onehotencoder, train_size=0.7,
                                                        stratify=y_onehotencoder, random_state=seed)

    #second - machine learning apply
    #NOTE(review): these weights mirror the class frequencies printed in the cell above, so the most
    #frequent class (3) gets the largest weight - confirm this is intended rather than inverse weighting
    model7_1 = DecisionTreeClassifier(max_depth=5,
                                      min_samples_leaf=30,
                                      criterion='entropy',
                                      class_weight={0:0.24, 1:0.23, 2:0.24, 3:0.27},
                                      random_state=seed)
    model7_1.fit(X_train, y_train)
    y_predict = model7_1.predict(X_test)

    #third - test accuracy
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

#model7_1, y_test and y_predict keep the values of the LAST iteration; the cells below rely on them

#summary statistics over all iterations (the list is converted to an array only once)
accuracy_array = np.asarray(accuracy_list)
max_accuracy = accuracy_array.max()
mean_accuracy = accuracy_array.mean()
min_accuracy = accuracy_array.min()
std_accuracy = accuracy_array.std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')
#printing the estimator shows its non-default parameters (no need to call set_params for that)
print(model7_1)

ACCURACY FOR 50 ITERATIONS
Max = 0.5419
Mean = 0.5232
Min = 0.5093
Std = 0.0082
DecisionTreeClassifier(class_weight={0: 0.24, 1: 0.23, 2: 0.24, 3: 0.27},
                       criterion='entropy', max_depth=5, min_samples_leaf=30)


In [173]:
#create DataFrame with errors, built from y_test / y_predict left by the last loop iteration
#Error is True where the predicted label differs from the real one;
#Cat_Real / Cat_Predict translate the numeric labels back to the segment letters
segment_letters = {0:'A', 1:'B', 2:'C', 3:'D'}
dict_compare = {'Real':y_test.values, 'Predict':y_predict}
dataset_error = (
    pd.DataFrame(dict_compare, index=y_test.index)
    .assign(
        Error=lambda compared: compared['Real'] != compared['Predict'],
        Cat_Real=lambda compared: compared['Real'].map(segment_letters),
        Cat_Predict=lambda compared: compared['Predict'].map(segment_letters),
    )
)

In [174]:
#share of correct (False) and wrong (True) predictions inside each real segment; every row sums to 1
print('Percent of Error for each Segmentation')
real_segment = dataset_error['Cat_Real']
is_error = dataset_error['Error']
pd.crosstab(real_segment, is_error, normalize='index')

Percent of Error for each Segmentation


Error,False,True
Cat_Real,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466071,0.533929
B,0.340824,0.659176
C,0.558875,0.441125
D,0.681818,0.318182


In [176]:
#feature importances of the last fitted model7_1, labelled by predictor and listed from highest to lowest
importance = model7_1.feature_importances_
importance_table = pd.DataFrame({'Importance':importance}, index=X_onehotencoder.columns)
importance_table.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Age,0.50853
Profession_Artist,0.180997
Spending_Score_Low,0.152421
Profession_Healthcare,0.064199
Graduated_No,0.033653
Profession_Entertainment,0.012539
Work_Experience,0.010888
Ever_Married_Yes,0.010545
Profession_Marketing,0.009
Graduated_Yes,0.007539


- Age accounts for more than 50% of the feature importance in both model7 and model7_1

### Model7_2 - add age_categories

In [128]:
#OneHotEncoder: age_categories is encoded here as a replacement for the numeric Age column
categorical_columns = ['Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'age_categories']
encoder = OneHotEncoder(cols=categorical_columns, use_cat_names=True)

#Segmentation letters are mapped to the integer labels used as the model target
map_segmentation = {'A':0, 'B':1, 'C':2, 'D':3}

#columns not used by this model: ID, Gender, Family_Size, Var_1 and Age
unused_columns = ['ID','Gender', 'Family_Size', 'Var_1', 'Age' ]

#encode, convert the target to numbers, then discard the unused columns
dataset_onehorencoder_new = (
    encoder.fit_transform(dataset)
    .assign(Segmentation=lambda encoded: encoded['Segmentation'].map(map_segmentation))
    .drop(columns=unused_columns)
)

## Variables

### OneHotEncoder

In [129]:
#target: the numeric Segmentation column; predictors: every remaining column
y_onehotencoder_new = dataset_onehorencoder_new['Segmentation']
X_onehotencoder_new = dataset_onehorencoder_new.drop(columns='Segmentation')

In [168]:
#repeated hold-out evaluation of model7_2 (age_categories instead of Age):
#every iteration draws a new stratified 70/30 split, fits a fresh decision tree and stores
#the accuracy measured on the test part.
#The iteration number is used as random_state for both the split and the tree, so
#"Restart & Run All" reproduces the same accuracies instead of new random ones on every run.
accuracy_list = []
iterations = 20
for seed in range(iterations):
    #first - split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X_onehotencoder_new, y_onehotencoder_new, train_size=0.7,
                                                        stratify=y_onehotencoder_new, random_state=seed)

    #second - machine learning apply
    #NOTE(review): same class weights as model7_1; they follow the class frequencies of Segmentation,
    #so the most frequent class (3) gets the largest weight - confirm this is intended
    model7_2 = DecisionTreeClassifier(max_depth=5,
                                      min_samples_leaf=30,
                                      criterion='entropy',
                                      class_weight={0:0.24, 1:0.23, 2:0.24, 3:0.27},
                                      random_state=seed)
    model7_2.fit(X_train, y_train)
    y_predict = model7_2.predict(X_test)

    #third - test accuracy
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

#model7_2, y_test and y_predict keep the values of the LAST iteration; the cells below rely on them

#summary statistics over all iterations (the list is converted to an array only once)
accuracy_array = np.asarray(accuracy_list)
max_accuracy = accuracy_array.max()
mean_accuracy = accuracy_array.mean()
min_accuracy = accuracy_array.min()
std_accuracy = accuracy_array.std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')
#printing the estimator shows its non-default parameters (no need to call set_params for that)
print(model7_2)

ACCURACY FOR 20 ITERATIONS
Max = 0.5054
Mean = 0.4913
Min = 0.4781
Std = 0.0074
DecisionTreeClassifier(class_weight={0: 0.24, 1: 0.23, 2: 0.24, 3: 0.27},
                       criterion='entropy', max_depth=5, min_samples_leaf=30)


In [169]:
#create DataFrame with errors, built from y_test / y_predict left by the last loop iteration
#Error is True where the predicted label differs from the real one;
#Cat_Real / Cat_Predict translate the numeric labels back to the segment letters
segment_letters = {0:'A', 1:'B', 2:'C', 3:'D'}
dict_compare = {'Real':y_test.values, 'Predict':y_predict}
dataset_error = (
    pd.DataFrame(dict_compare, index=y_test.index)
    .assign(
        Error=lambda compared: compared['Real'] != compared['Predict'],
        Cat_Real=lambda compared: compared['Real'].map(segment_letters),
        Cat_Predict=lambda compared: compared['Predict'].map(segment_letters),
    )
)

In [170]:
#share of correct (False) and wrong (True) predictions inside each real segment; every row sums to 1
print('Percent of Error for each Segmentation')
real_segment = dataset_error['Cat_Real']
is_error = dataset_error['Error']
pd.crosstab(real_segment, is_error, normalize='index')

Percent of Error for each Segmentation


Error,False,True
Cat_Real,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.3125,0.6875
B,0.35206,0.64794
C,0.588752,0.411248
D,0.689655,0.310345


In [167]:
#feature importances of the last fitted model7_2, labelled by predictor and listed from highest to lowest
importance = model7_2.feature_importances_
importance_table = pd.DataFrame({'Importance':importance}, index=X_onehotencoder_new.columns)
importance_table.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Spending_Score_Low,0.333505
Profession_Healthcare,0.214028
Profession_Artist,0.204484
age_categories_18-25,0.071944
Graduated_Yes,0.045645
age_categories_26-30,0.026607
age_categories_31-35,0.022459
Ever_Married_No,0.022068
Graduated_No,0.017074
Work_Experience,0.015282


- it seems that dividing Age into categories did not increase the mean accuracy, but the change made the model distinguish the segmentations slightly better (not by much)
- it is important to train the model with other categories in the Profession variable (e.g. create a category Business = Executive + Marketing, etc.)

## Conclusions

- splitting the variable Age into categories did not work with OneHotEncoder, because the accuracy is less than 50% and it did not solve the segmentation problem
- the best model is model7_1, which reaches 52% accuracy, but it still has some problems distinguishing each segmentation, and the variable Age carries almost 50% of the importance.