In [26]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from utils import calculate_income_1000_customers
from sklearn.model_selection import GridSearchCV

In [27]:
# Fraction of the samples placed in the training split; passed as `train_size`
# to every train_test_split call in this notebook.
TRAIN_SIZE = 0.8

In [28]:
# Seed NumPy's global RNG before any random splitting happens.
np.random.seed(0)

# Both input files are space-separated and have no header row.
read_opts = {'sep': ' ', 'header': None}
X = pd.read_csv('../data/x_train.txt', **read_opts).to_numpy()
# Flatten the label table into a 1-D array.
y = np.ravel(pd.read_csv('../data/y_train.txt', **read_opts).to_numpy())

# GaussianNB

### [101, 102, 103, 105]

In [29]:
# Repeated hold-out evaluation of GaussianNB restricted to the `selected` columns.
# Each of the 1000 iterations draws a fresh shuffled split, refits the model and
# records three per-split metrics.
accuracies = []       # accuracy_score on the test split
accuracies_top = []   # element [0] returned by calculate_income_1000_customers
income = []           # element [1] returned by calculate_income_1000_customers
selected = [101, 102, 103, 105]
model = GaussianNB(var_smoothing=1)

for _ in range(1000):
    # No random_state is passed, so each split is drawn from NumPy's global RNG.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)

    model.fit(X_train[:, selected], y_train)
    y_pred = model.predict(X_test[:, selected])
    y_proba = model.predict_proba(X_test[:, selected])

    # Evaluate the helper once per split and reuse both returned elements
    # instead of calling it twice with identical arguments.
    # NOTE(review): assumes the helper is deterministic for fixed inputs and
    # does not consume the global RNG -- confirm in utils.
    split_score = calculate_income_1000_customers(len(selected), y_proba=y_proba, y_true=y_test, y_pred=y_pred)
    accuracies_top.append(split_score[0])
    income.append(split_score[1])

    accuracies.append(accuracy_score(y_test, y_pred))

In [30]:
# Mean of each per-split metric list, each wrapped in a one-element list.
avg_accuracies, avg_accuracies_top, avg_income = (
    [np.mean(samples)] for samples in (accuracies, accuracies_top, income)
)

In [31]:
# Mean of the per-split income values, shown as a one-element list.
print(avg_income)

[7032.228436165547]


In [23]:
# Mean of the per-split accuracy_score values, shown as a one-element list.
print(avg_accuracies)

[0.5888969999999999]


In [24]:
# Mean of the first element returned by calculate_income_1000_customers per split.
print(avg_accuracies_top)

[0.76653444191064]


In [25]:
# Spread of the per-split income values (np.std with its default ddof=0).
np.std(income)

324.3595527144095

### Different feature sets (length 3)

In [17]:
# Candidate three-feature column subsets to compare. They stay plain lists because
# each one is used for NumPy column indexing and, via str(), as a key in `results`.
sets = [
    [101, 102, 103],
    [102, 103, 105],
    [101, 102, 105],
]

# var_smoothing candidates: 10000 log-spaced values from 10**0 to 10**1.
# NOTE(review): the recorded search output is 1.0 for every subset, i.e. the lower
# edge of this grid -- values below 1 may be worth exploring.
param_grid = dict(var_smoothing=np.logspace(0, 1, num=10000))

# Filled by the evaluation loop: str(feature list) -> dict of averaged metrics.
results = dict()

In [18]:
# For every candidate feature subset: tune var_smoothing with a 5-fold grid search
# on one training split, then estimate the metrics over 1000 fresh random splits.
for selected in sets:
    # Re-seed per subset so every subset sees the same sequence of random splits.
    np.random.seed(0)
    accuracies = []       # accuracy_score on the test split
    accuracies_top = []   # element [0] returned by calculate_income_1000_customers
    income = []           # element [1] returned by calculate_income_1000_customers

    # Hyper-parameter search uses only the training part of a single split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)
    grid_model = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=5)
    grid_model.fit(X_train[:, selected], y_train)
    best_params = grid_model.best_params_
    print(best_params)
    model = GaussianNB(var_smoothing=best_params['var_smoothing'])

    for _ in range(1000):
        # No random_state is passed, so each split is drawn from NumPy's global RNG.
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)

        model.fit(X_train[:, selected], y_train)
        y_pred = model.predict(X_test[:, selected])
        y_proba = model.predict_proba(X_test[:, selected])

        # Evaluate the helper once per split and reuse both returned elements
        # instead of calling it twice with identical arguments.
        # NOTE(review): assumes the helper is deterministic for fixed inputs and
        # does not consume the global RNG -- confirm in utils.
        split_score = calculate_income_1000_customers(len(selected), y_proba=y_proba, y_true=y_test, y_pred=y_pred)
        accuracies_top.append(split_score[0])
        income.append(split_score[1])
        accuracies.append(accuracy_score(y_test, y_pred))

    avg_accuracies = np.mean(accuracies)
    avg_accuracies_top = np.mean(accuracies_top)
    avg_income = np.mean(income)

    # Keyed by the string form of the feature list, e.g. "[101, 102, 103]".
    results[str(selected)] = {
        'average_accuracy': avg_accuracies,
        'average_accuracy_top': avg_accuracies_top,
        'average_income': avg_income
    }

{'var_smoothing': 1.0}
{'var_smoothing': 1.0}
{'var_smoothing': 1.0}


In [16]:
# Print one report per evaluated feature subset, closed by a 40-dash rule.
for features, result in results.items():
    report = [
        f"Features: {features}",
        f"Average Accuracy: {result['average_accuracy']}",
        f"Average Top Accuracy: {result['average_accuracy_top']}",
        f"Average Income: {result['average_income']}",
        "-" * 40,
    ]
    print("\n".join(report))

Features: [101, 102, 103]
Average Accuracy: 0.559577
Average Top Accuracy: 0.7698797626486686
Average Income: 7098.797626486686
----------------------------------------
Features: [102, 103, 105]
Average Accuracy: 0.567072
Average Top Accuracy: 0.7938762856704407
Average Income: 7338.762856704408
----------------------------------------
Features: [101, 102, 105]
Average Accuracy: 0.566052
Average Top Accuracy: 0.7851802925021972
Average Income: 7251.802925021972
----------------------------------------


## 4 features

In [4]:
# Feature columns evaluated in this section.
selected = [101, 102, 103, 105]

# var_smoothing candidates: 10000 log-spaced values from 10**0 to 10**1.
param_grid = dict(var_smoothing=np.logspace(0, 1, num=10000))

# Start from an empty results container for this section.
results = dict()

# Re-seed NumPy's global RNG so the splits drawn below are repeatable.
np.random.seed(0)

In [5]:
# Draw one shuffled split and tune var_smoothing with 5-fold cross-validation
# on its training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=TRAIN_SIZE, shuffle=True
)
# fit() returns the fitted search object, so construction and fitting can be chained.
grid_model = GridSearchCV(GaussianNB(), param_grid, cv=5).fit(X_train[:, selected], y_train)
best_params = grid_model.best_params_

In [6]:
# Display the hyper-parameters selected by the grid search.
best_params

{'var_smoothing': 1.0}

In [7]:
# Build a fresh (unfitted) GaussianNB using the tuned smoothing value.
best_var_smoothing = best_params['var_smoothing']
model = GaussianNB(var_smoothing=best_var_smoothing)

In [8]:
# Repeated hold-out evaluation of the tuned model over 1000 fresh random splits.
accuracies = []       # accuracy_score on the test split
accuracies_top = []   # element [0] returned by calculate_income_1000_customers
income = []           # element [1] returned by calculate_income_1000_customers

for _ in range(1000):
    # No random_state is passed, so each split is drawn from NumPy's global RNG.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)

    model.fit(X_train[:, selected], y_train)
    y_pred = model.predict(X_test[:, selected])
    y_proba = model.predict_proba(X_test[:, selected])

    # Evaluate the helper once per split and reuse both returned elements
    # instead of calling it twice with identical arguments.
    # NOTE(review): assumes the helper is deterministic for fixed inputs and
    # does not consume the global RNG -- confirm in utils.
    split_score = calculate_income_1000_customers(len(selected), y_proba=y_proba, y_true=y_test, y_pred=y_pred)
    accuracies_top.append(split_score[0])
    income.append(split_score[1])

    accuracies.append(accuracy_score(y_test, y_pred))

In [9]:
# Mean of each per-split metric list, each wrapped in a one-element list.
avg_accuracies, avg_accuracies_top, avg_income = (
    [np.mean(samples)] for samples in (accuracies, accuracies_top, income)
)

In [10]:
# Mean of the per-split income values, shown as a one-element list.
print(avg_income)

[7031.30242069166]
