In [17]:
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import calculate_income_1000_customers
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
import itertools

In [3]:
# Fraction of the samples placed in the training split by the train_test_split calls in this notebook.
TRAIN_SIZE = 0.8

In [5]:
# Seed NumPy's global RNG before any stochastic step runs.
np.random.seed(0)

# Feature table, parsed as space-separated text without a header row.
X = pd.read_csv(
    '../data/x_train.txt',
    sep=' ',
    header=None,
).to_numpy()

# Label table, parsed the same way and flattened to a 1-D vector.
y = pd.read_csv(
    '../data/y_train.txt',
    sep=' ',
    header=None,
).to_numpy().ravel()

## Combinations of length 3

In [30]:
# Number of random train/test resamplings to average over in this section.
N_ITER = 1_000
# Re-seed the global NumPy RNG so the section starts from a known state.
np.random.seed(0)

In [31]:
# Fixed ExtraTrees configuration used to score the feature combinations in this section.
model = ExtraTreesClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
)

In [19]:
# Candidate feature column indices: 101 through 105 inclusive.
features = list(range(101, 106))
# Every unordered 3-element subset of the candidates, as a list of tuples.
combinations = [*itertools.combinations(features, 3)]

In [33]:
# Per-combination metric histories; one value is appended per resampling iteration.
accuracies = {combo: [] for combo in combinations}
accuracies_top = {combo: [] for combo in combinations}
income = {combo: [] for combo in combinations}

for i in range(N_ITER):
    # One random split per iteration, shared by every combination so that all
    # feature sets are scored on the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)
    print("Iteration: ", i)
    for selected in combinations:
        # `selected` is a tuple of column indices; slice the test columns once
        # and reuse them for both prediction calls.
        X_test_selected = X_test[:, selected]

        model.fit(X_train[:, selected], y_train)
        y_pred = model.predict(X_test_selected)
        y_proba = model.predict_proba(X_test_selected)

        # Call the helper once and read both metrics from the same result, so the
        # pair is consistent and the work is not repeated.
        # Presumably index 0 is an accuracy-style score and index 1 the income
        # (judging by the target variable names) — verify against utils.
        income_metrics = calculate_income_1000_customers(len(selected), y_proba=y_proba, y_true=y_test, y_pred=y_pred)
        accuracies_top[selected].append(income_metrics[0])
        income[selected].append(income_metrics[1])

        accuracies[selected].append(accuracy_score(y_test, y_pred))

Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Itera

In [34]:
# Mean of every metric over all iterations, listed in the same order as `combinations`.
avg_accuracies = [np.mean(accuracies[combo]) for combo in combinations]
avg_accuracies_top = [np.mean(accuracies_top[combo]) for combo in combinations]
avg_income = [np.mean(income[combo]) for combo in combinations]

In [35]:
# Rank feature sets by mean income, best first. `incomes` is a list of
# (combination, mean income) pairs.
incomes = sorted(
    dict(zip(combinations, avg_income)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# The loop variable is deliberately not named `income`: that name holds the
# per-combination results dict, and rebinding it to a float here would break
# any re-run of the averaging cell.
for combination, mean_income in incomes[:5]:
    print(f"Feature set: {combination}, Income: {mean_income}")

Feature set: (102, 103, 105), Income: 6874.25
Feature set: (101, 102, 105), Income: 6832.95
Feature set: (101, 102, 103), Income: 6814.75
Feature set: (101, 103, 105), Income: 6722.2
Feature set: (102, 104, 105), Income: 6721.05


In [36]:
# Highest mean income among the evaluated feature combinations.
print(max(avg_income))

6874.25


### Hyperparameter grid search for the best feature set

In [11]:
# Only one outer train/test split is drawn in this section.
N_ITER = 1
# Re-seed the global NumPy RNG so the section starts from a known state.
np.random.seed(0)

In [12]:
# Hyperparameter search space: 7 * 7 * 8 * 6 * 2 = 4704 candidate settings,
# each cross-validated on 5 folds by the search below.
param_grid = dict(
    n_estimators=[10, 30, 50, 100, 150, 200, 250],
    max_depth=[None, 2, 5, 10, 15, 20, 25],
    min_samples_split=[2, 3, 4, 5, 6, 8, 10, 20],
    min_samples_leaf=[1, 2, 4, 6, 8, 10],
    bootstrap=[False, True],
)

# Exhaustive search scored by accuracy; the base estimator's seed is pinned to 0.
model = GridSearchCV(
    estimator=ExtraTreesClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Column indices of the feature set tuned in this section.
selected = [102, 103, 105]

In [13]:
# Metric histories, one entry per outer train/test split.
accuracies = []
accuracies_top = []
income = []
for _ in range(N_ITER):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)

    # Slice the test columns once and reuse them for both prediction calls.
    X_test_selected = X_test[:, selected]

    # `model` is the grid search object here, so fit() runs the whole search.
    model.fit(X_train[:, selected], y_train)
    y_pred = model.predict(X_test_selected)
    y_proba = model.predict_proba(X_test_selected)

    # Call the helper once and read both metrics from the same result.
    # Presumably index 0 is an accuracy-style score and index 1 the income
    # (judging by the target variable names) — verify against utils.
    income_metrics = calculate_income_1000_customers(len(selected), y_proba=y_proba, y_true=y_test, y_pred=y_pred)
    accuracies_top.append(income_metrics[0])
    income.append(income_metrics[1])

    accuracies.append(accuracy_score(y_test, y_pred))

In [14]:
# Best hyperparameter setting found by the grid search; reused later to build the final model.
p = model.best_params_
p

{'bootstrap': False,
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 30}

In [26]:
# Number of random train/test resamplings used to score the tuned model.
N_ITER = 1_000
# Re-seed the global NumPy RNG so the section starts from a known state.
np.random.seed(0)

In [27]:
# Rebuild a plain ExtraTrees model from the tuned hyperparameters. `p` contains
# exactly the searched keyword arguments (bootstrap, max_depth, min_samples_leaf,
# min_samples_split, n_estimators), so it can be unpacked directly.
# NOTE(review): no random_state is passed here, unlike the estimator used inside
# the grid search — confirm that relying on the global NumPy seed is intended.
model = ExtraTreesClassifier(**p)

# Column indices of the tuned feature set; constant across iterations.
selected = [102, 103, 105]

# Metric histories, one entry per resampling iteration.
accuracies = []
accuracies_top = []
income = []
for _ in range(N_ITER):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)

    # Slice the test columns once and reuse them for both prediction calls.
    X_test_selected = X_test[:, selected]

    model.fit(X_train[:, selected], y_train)
    y_pred = model.predict(X_test_selected)
    y_proba = model.predict_proba(X_test_selected)

    # Call the helper once and read both metrics from the same result.
    # Presumably index 0 is an accuracy-style score and index 1 the income
    # (judging by the target variable names) — verify against utils.
    income_metrics = calculate_income_1000_customers(len(selected), y_proba=y_proba, y_true=y_test, y_pred=y_pred)
    accuracies_top.append(income_metrics[0])
    income.append(income_metrics[1])

    accuracies.append(accuracy_score(y_test, y_pred))

In [28]:
# Means over all iterations, each wrapped in a one-element list.
avg_accuracies = [np.asarray(accuracies).mean()]
avg_accuracies_top = [np.asarray(accuracies_top).mean()]
avg_income = [np.asarray(income).mean()]

In [29]:
# Mean income of the tuned model, printed as a one-element list.
print(avg_income)

[6839.15]


## Combinations of length 2

In [37]:
# Number of random train/test resamplings to average over in this section.
N_ITER = 100
# Re-seed the global NumPy RNG so the section starts from a known state.
np.random.seed(0)

In [38]:
# Fixed ExtraTrees configuration used to score the feature combinations in this section.
model = ExtraTreesClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
)

In [39]:
# Candidate feature column indices: 101 through 105 inclusive.
features = list(range(101, 106))
# Every unordered pair of candidates, as a list of tuples.
combinations = [*itertools.combinations(features, 2)]
print(combinations)

[(101, 102), (101, 103), (101, 104), (101, 105), (102, 103), (102, 104), (102, 105), (103, 104), (103, 105), (104, 105)]


In [40]:
# Per-combination metric histories; one value is appended per resampling iteration.
accuracies = {combo: [] for combo in combinations}
accuracies_top = {combo: [] for combo in combinations}
income = {combo: [] for combo in combinations}

for i in range(N_ITER):
    # One random split per iteration, shared by every combination so that all
    # feature sets are scored on the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)
    print("Iteration: ", i)
    for selected in combinations:
        # `selected` is a tuple of column indices; slice the test columns once
        # and reuse them for both prediction calls.
        X_test_selected = X_test[:, selected]

        model.fit(X_train[:, selected], y_train)
        y_pred = model.predict(X_test_selected)
        y_proba = model.predict_proba(X_test_selected)

        # Call the helper once and read both metrics from the same result, so the
        # pair is consistent and the work is not repeated.
        # Presumably index 0 is an accuracy-style score and index 1 the income
        # (judging by the target variable names) — verify against utils.
        income_metrics = calculate_income_1000_customers(len(selected), y_proba=y_proba, y_true=y_test, y_pred=y_pred)
        accuracies_top[selected].append(income_metrics[0])
        income[selected].append(income_metrics[1])

        accuracies[selected].append(accuracy_score(y_test, y_pred))

Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Itera

In [41]:
# Mean of every metric over all iterations, listed in the same order as `combinations`.
avg_accuracies = [np.mean(accuracies[combo]) for combo in combinations]
avg_accuracies_top = [np.mean(accuracies_top[combo]) for combo in combinations]
avg_income = [np.mean(income[combo]) for combo in combinations]

In [42]:
# Rank feature sets by mean income, best first. `incomes` is a list of
# (combination, mean income) pairs.
incomes = sorted(
    dict(zip(combinations, avg_income)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# The loop variable is deliberately not named `income`: that name holds the
# per-combination results dict, and rebinding it to a float here would break
# any re-run of the averaging cell.
for combination, mean_income in incomes[:5]:
    print(f"Feature set: {combination}, Income: {mean_income}")

Feature set: (102, 103), Income: 6692.5
Feature set: (102, 105), Income: 6688.0
Feature set: (101, 102), Income: 6661.0
Feature set: (103, 105), Income: 6596.0
Feature set: (102, 104), Income: 6587.0


In [43]:
# Highest mean income among the evaluated feature pairs.
print(max(avg_income))

6692.5


In [43]:
# NOTE(review): this cell repeats the preceding cell verbatim (same execution count) — likely safe to delete.
print(max(avg_income))

6692.5
