## Libraries and versions

### Libraries

In [1]:
python_version = !Python -V #version 3.7.9
import pandas as pd #version 1.2.3
import numpy as np #version 1.19.2


#for machine learning models
#pre processing
from category_encoders import OneHotEncoder, OrdinalEncoder, __version__ as ce_version #version 2.2.2

#split data in train and test
#version
from sklearn import __version__ as sk_version #version 0.23.2

#pre-processing
from sklearn.model_selection import train_test_split

#classificators
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

#metrics
from sklearn.metrics import accuracy_score

#baseline
from sklearn.dummy import DummyClassifier

### Versions

In [2]:
print(f'{python_version[0]}')
print(f'Pandas version: {pd.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'Category Encoders version: {ce_version}')
print(f'Sklearn version: {sk_version}')

Python 3.7.9
Pandas version: 1.2.3
Numpy version: 1.19.2
Category Encoders version: 2.2.2
Sklearn version: 0.23.2


## Configurations

In [3]:
import warnings
warnings.filterwarnings("ignore")

## Dataset

In [4]:
dataset_clients = pd.read_csv('train_cleaning.csv')
dataset_new_clients = pd.read_csv('test.csv')

In [5]:
dataset_clients.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,1.0,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,1.0,High,6.0,Cat_6,A


In [6]:
dataset_new_clients.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,458996,Female,Yes,69,No,,0.0,Low,1.0,Cat_6
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6


## Tests

OPTIONS

Treatment of categorical variables:

variables = Profession, Spending_Score and Var_1
- onehotencoder
- ordinal

Models
- kneighborsclassifier
- RadiusNeighborsClassifier
- RandomForestClassifier
- DecisionTreeClassifier
- ExtraTreesClassifier

## Variables

In [7]:
X = dataset_clients[['Ever_Married', 'Age', 'Graduated', 'Profession', 'Work_Experience', 'Spending_Score', 'Var_1']]
y = dataset_clients['Segmentation']

## Test 1 - OneHotEnconder and Kneighborsclassifier

In [9]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OneHotEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'],
                              use_cat_names=True)
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test1 = KNeighborsClassifier(n_neighbors=4, weights='distance')
    test1.fit(X_train, y_train)
    y_predict = test1.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.4263
Mean = 0.3459
Min = 0.2794
Std = 0.0532


## Test 2 - Ordinal and Kneighborsclassifier

In [10]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OrdinalEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test2 = KNeighborsClassifier(n_neighbors=4, weights='distance')
    test2.fit(X_train, y_train)
    y_predict = test2.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.4537
Mean = 0.4413
Min = 0.4289
Std = 0.0080


## Test 3 - OneHotEncoder and RadiusNeighborsClassifier

In [11]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OneHotEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'],
                              use_cat_names=True)
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test3 = RadiusNeighborsClassifier(n_neighbors=4, weights='distance', outlier_label='most_frequent')
    test3.fit(X_train, y_train)
    y_predict = test3.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.2955
Mean = 0.2800
Min = 0.2629
Std = 0.0104


## Test 4 - OrdinalEncoder and RadiusNeighborsClassifier

In [12]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OrdinalEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test4 = RadiusNeighborsClassifier(n_neighbors=4, weights='distance', outlier_label='most_frequent')
    test4.fit(X_train, y_train)
    y_predict = test4.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.4311
Mean = 0.4199
Min = 0.4085
Std = 0.0074


## Test 5 - OneHotEncoder and RandomForestClassifier

In [13]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OneHotEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test5 = RandomForestClassifier(max_depth=4, min_samples_leaf=30)
    test5.fit(X_train, y_train)
    y_predict = test5.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.5093
Mean = 0.3949
Min = 0.1973
Std = 0.0905


## Test 6 - OrdinalEncoder and RandomForestClassifier

In [14]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OrdinalEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test6 = RandomForestClassifier(max_depth=4, min_samples_leaf=30)
    test6.fit(X_train, y_train)
    y_predict = test6.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.5363
Mean = 0.5032
Min = 0.4798
Std = 0.0155


## Test 7 - OneHotEncoder and DecisionTreeClassifier

In [15]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OneHotEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test7 = DecisionTreeClassifier(max_depth=4, min_samples_leaf=30)
    test7.fit(X_train, y_train)
    y_predict = test7.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.4985
Mean = 0.4179
Min = 0.3029
Std = 0.0630


## Test 8 - OrdinalEncoder and DecisionTreeClassifier

In [16]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OrdinalEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test8 = DecisionTreeClassifier(max_depth=4, min_samples_leaf=30)
    test8.fit(X_train, y_train)
    y_predict = test8.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.5163
Mean = 0.5035
Min = 0.4933
Std = 0.0081


## Test 9 - OneHotEncoder and ExtraTreesClassifier

In [17]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OneHotEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test7 = ExtraTreesClassifier(max_depth=4, min_samples_leaf=30)
    test7.fit(X_train, y_train)
    y_predict = test7.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.4289
Mean = 0.2745
Min = 0.1638
Std = 0.1029


## Test 10 - OrdinalEncoder and ExtraTreesClassifier

In [18]:
#first - split train and test data
accuracy_list = []
iterations = 10
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OrdinalEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test10 = ExtraTreesClassifier(max_depth=4, min_samples_leaf=30)
    test10.fit(X_train, y_train)
    y_predict = test10.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 10 ITERATIONS
Max = 0.5137
Mean = 0.4924
Min = 0.4794
Std = 0.0097


## Baseline (sklearn.dummy)

### OneHotEncoder

In [22]:
#first - split train and test data
accuracy_list = []
iterations = 100
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OneHotEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test_dummy_1 = DummyClassifier(strategy='uniform')
    test_dummy_1.fit(X_train, y_train)
    y_predict = test_dummy_1.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 100 ITERATIONS
Max = 0.2755
Mean = 0.2521
Min = 0.2308
Std = 0.0093


### OrdinalEncoder

In [24]:
#first - split train and test data
accuracy_list = []
iterations = 100
for iter in range(0, iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    #second - transform independet variables (train and test)
    encoder = OrdinalEncoder(cols=['Ever_Married', 'Graduated','Profession', 'Spending_Score', 'Var_1'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    #third - macinhe learning apply
    test_dummy_2 = DummyClassifier(strategy='uniform')
    test_dummy_2.fit(X_train, y_train)
    y_predict = test_dummy_2.predict(X_test)

    #fourth - test accuray
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

max_accuracy = np.asarray(accuracy_list).max()
mean_accuracy = np.asarray(accuracy_list).mean()
min_accuracy = np.asarray(accuracy_list).min()
std_accuracy = np.asarray(accuracy_list).std()
print(f'ACCURACY FOR {iterations} ITERATIONS')
print(f'Max = {max_accuracy:.4f}\nMean = {mean_accuracy:.4f}\nMin = {min_accuracy:.4f}\nStd = {std_accuracy:.4f}')

ACCURACY FOR 100 ITERATIONS
Max = 0.2673
Mean = 0.2500
Min = 0.2286
Std = 0.0088


## Conclusions

- test 6 reach the best scores (higher mean and minimum standard deviation) in 100 iterations
- the scores of test 6 is 100% bigger then a dummy classifier

## Next step

- work parameters to increase model accuracy