## Libraries and versions

### Libraries

In [1]:
python_version = !Python -V #version 3.7.9
import pandas as pd #version 1.2.3
import numpy as np #version 1.19.2


#for machine learning models
#pre processing
from category_encoders import OneHotEncoder, __version__ as ce_version #version 2.2.2

#split data in train and test
#version
from sklearn import __version__ as sk_version #version 0.24.1

#pre-processing
from sklearn.model_selection import train_test_split

#classificators
from sklearn.tree import DecisionTreeClassifier

#metrics
from sklearn.metrics import accuracy_score

### Versions

In [2]:
print(f'{python_version[0]}')
print(f'Pandas version: {pd.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'Category Encoders version: {ce_version}')
print(f'Sklearn version: {sk_version}')

Python 3.7.9
Pandas version: 1.2.3
Numpy version: 1.19.2
Category Encoders version: 2.2.2
Sklearn version: 0.24.1


## Configurations

In [3]:
import warnings
warnings.filterwarnings("ignore")

## Dataset

In [4]:
dataset = pd.read_csv('train_cleaning_model7.csv')

In [5]:
dataset.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,age_categories
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D,18-25
1,462643,Female,Yes,38,Yes,Engineer,1.0,Average,3.0,Cat_4,A,36-40
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B,66-70
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B,66-70
4,462669,Female,Yes,40,Yes,Entertainment,1.0,High,6.0,Cat_6,A,36-40


# Dataset transformation

## Encoders

### OneHotEncoder

In [7]:
#OneHotEncoder
encoder = OneHotEncoder(cols=['Ever_Married', 'Graduated', 'Profession', 'Spending_Score'],
                       use_cat_names=True)
dataset_onehorencoder = encoder.fit_transform(dataset)

#map column Segmentation and transform for numeric variables
map_segmentation = {'A':0, 'B':1, 'C':2, 'D':3}
dataset_onehorencoder['Segmentation'] = dataset_onehorencoder['Segmentation'].map(map_segmentation)

#drop non used columns ID, Gender and Family_Size
dataset_onehorencoder.drop(columns=['ID','Gender', 'Family_Size', 'age_categories', 'Var_1'], inplace=True)

## Variables

### OneHotEncoder

In [8]:
#selection of all variables, except Segmentation
X_onehotencoder = dataset_onehorencoder.loc[:, dataset_onehorencoder.columns != 'Segmentation']
y_onehotencoder = dataset_onehorencoder['Segmentation']

In [9]:
#first - split train and test data
accuracy_list = []
iterations = 50
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X_onehotencoder, y_onehotencoder, train_size=0.7,
                                                        stratify=y_onehotencoder)

    #third - macinhe learning apply
    model7_1 = DecisionTreeClassifier(max_depth=5,
                                      min_samples_leaf=30,
                                      criterion='entropy',
                                      class_weight={0:0.24, 1:0.23, 2:0.24, 3:0.27})
    model7_1.fit(X_train, y_train)
    y_predict = model7_1.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')
print(model7_1.set_params())

ACCURACY FOR 50 ITERATIONS
Max = 0.5411
Mean = 0.5221
Min = 0.5063
Std = 0.0081
DecisionTreeClassifier(class_weight={0: 0.24, 1: 0.23, 2: 0.24, 3: 0.27},
                       criterion='entropy', max_depth=5, min_samples_leaf=30)


In [10]:
#create DataFrame with errors
dict_compare = {'Real':y_test.values, 'Predict':y_predict}
dataset_error = pd.DataFrame(dict_compare, index=y_test.index)
dataset_error['Error'] = dataset_error['Real'] !=dataset_error['Predict']
dataset_error['Cat_Real'] = dataset_error['Real'].map({0:'A', 1:'B', 2:'C', 3:'D'})
dataset_error['Cat_Predict'] = dataset_error['Predict'].map({0:'A', 1:'B', 2:'C', 3:'D'})

In [11]:
print('Percent of Error for each Segmentation')
pd.crosstab(index=dataset_error['Cat_Real'], columns=dataset_error['Error'], normalize='index')

Percent of Error for each Segmentation


Error,False,True
Cat_Real,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.4875,0.5125
B,0.286517,0.713483
C,0.543058,0.456942
D,0.694357,0.305643


In [12]:
importance = model7_1.feature_importances_
pd.DataFrame(data={'Importance':importance}, index=X_onehotencoder.columns).sort_values('Importance', ascending=False)

Unnamed: 0,Importance
Age,0.489653
Profession_Artist,0.182853
Spending_Score_Low,0.173384
Profession_Healthcare,0.069534
Graduated_No,0.040497
Profession_Marketing,0.016158
Work_Experience,0.01142
Spending_Score_Average,0.010585
Spending_Score_High,0.004263
Profession_Doctor,0.001304
