In [8]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from algorithms.classifiers import s21LogisticRegression, s21KNN, s21NaiveBayes
from utils.datasplit import SplitByThirds
from utils.features import FeatureGenerator
from utils.metrics import Calculation
from utils.pipeline_manager import PipelineManager

In [9]:
# Location of the raw training table, relative to the notebook directory.
TRAIN_CSV = "../datasets/data/training.csv"
df = pd.read_csv(TRAIN_CSV)

# Project-local splitter keyed on the purchase date; it yields three frames
# (presumably consecutive thirds ordered by PurchDate — confirm in utils.datasplit).
splitter = SplitByThirds(df, date_column="PurchDate")
df_train, df_val, df_test = splitter.split()

In [10]:
# Label column, plus the columns that must not reach the models as features
# (the label itself and the date that was used for splitting).
TARGET = "IsBadBuy"
drop_cols = [TARGET, "PurchDate"]

# Feature matrices: each split with the non-feature columns removed.
X_train, X_val, X_test = (
    part.drop(columns=drop_cols) for part in (df_train, df_val, df_test)
)

# Matching label vectors, in the same train / val / test order.
y_train, y_val, y_test = (part[TARGET] for part in (df_train, df_val, df_test))

In [11]:
# Train/validation pair handed to every baseline run.
Xs = [X_train, X_val]
ys = [y_train, y_val]

# Reference implementations from scikit-learn ...
sklearn_baselines = [
    ("Logistic Regression", LogisticRegression(max_iter=5000)),
    ("GaussianNB", GaussianNB()),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
]
# ... and the project's own (s21) counterparts, evaluated right after them.
s21_baselines = [
    ("s21 Logistic Regression", s21LogisticRegression()),
    ("s21 Naive Bayes", s21NaiveBayes()),
    ("s21 KNN", s21KNN(n_neighbors=5)),
]
base_configs = sklearn_baselines + s21_baselines

# One PipelineManager per model; estimate() prints the metric report.
for name, clf in base_configs:
    manager = PipelineManager(clf, name, Xs, ys)
    manager.estimate()


=== Logistic Regression ===
ROC AUC : 0.7241   | Gini: 0.4483
PR AUC  : 0.3656
F1      : 0.2722
Precision: 0.6618 | Recall: 0.1713
S21 ROC AUC: 0.7241
S21 GINI: 0.4483
=== GaussianNB ===
ROC AUC : 0.7229   | Gini: 0.4458
PR AUC  : 0.3369
F1      : 0.1132
Precision: 0.5946 | Recall: 0.0626
S21 ROC AUC: 0.7229
S21 GINI: 0.4458
=== KNN ===
ROC AUC : 0.5872   | Gini: 0.1744
PR AUC  : 0.1779
F1      : 0.0925
Precision: 0.4293 | Recall: 0.0518
S21 ROC AUC: 0.5872
S21 GINI: 0.1744
=== s21 Logistic Regression ===
ROC AUC : 0.7328   | Gini: 0.4655
PR AUC  : 0.3802
F1      : 0.2648
Precision: 0.7283 | Recall: 0.1618
S21 ROC AUC: 0.7328
S21 GINI: 0.4655
=== s21 Naive Bayes ===
ROC AUC : 0.7229   | Gini: 0.4458
PR AUC  : 0.3369
F1      : 0.1132
Precision: 0.5946 | Recall: 0.0626
S21 ROC AUC: 0.7229
S21 GINI: 0.4458
=== s21 KNN ===
ROC AUC : 0.5872   | Gini: 0.1744
PR AUC  : 0.1779
F1      : 0.0925
Precision: 0.4293 | Recall: 0.0518
S21 ROC AUC: 0.5872
S21 GINI: 0.1744


Среди моделей sklearn лучший Gini score у логистической регрессии (0.4483); при этом все три модели показали значение метрики > 0.15. Имплементированный мной ROC-AUC выдаёт те же значения, что и имплементация sklearn.

У логистических регрессий результаты отличаются из-за использования SGD в версии, имплементированной мною. У моделей KNN и Naive Bayes результаты совпадают.

In [12]:
# Column pairs passed as `ratios` (presumably numerator / denominator —
# confirm against utils.features.FeatureGenerator).
ratio_pairs = [
    ("VehBCost", "MMRCurrentRetailAveragePrice"),
    ("VehOdo", "VehicleAge"),
    ("WarrantyCost", "VehBCost"),
]

# Column pairs passed as `groupbys` (presumably group key / aggregated value —
# confirm against utils.features.FeatureGenerator).
groupby_pairs = [
    ("Make", "VehBCost"),
    ("Model", "VehBCost"),
    ("Auction", "VehBCost"),
    ("VNST", "VehOdo"),
]

fg = FeatureGenerator(ratios=ratio_pairs, groupbys=groupby_pairs)

# Apply the same generator to all three splits in one call.
X_train_fg, X_val_fg, X_test_fg = fg.transform_many(X_train, X_val, X_test)

In [None]:
# Train/validation pair built from the feature-engineered frames.
Xs_fg = [X_train_fg, X_val_fg]
# Copy the label list instead of aliasing `ys`: with a plain `ys_fg = ys`, any
# in-place change to ys_fg (e.g. appending the test labels) would also change
# `ys` and every other name bound to it, leaving 2 feature frames paired with
# 3 label vectors when an earlier cell is re-run.
ys_fg = list(ys)

# Same sklearn baselines as before, now trained on the engineered features.
feature_configs = [
    ("Logistic Regression feature engineered", LogisticRegression(max_iter=5000)),
    ("GaussianNB feature engineered", GaussianNB()),
    ("KNN feature engineered", KNeighborsClassifier(n_neighbors=5)),
]

# Keep every manager by name so individual pipelines can be inspected later.
feature_managers = {}
for name, clf in feature_configs:
    manager = PipelineManager(clf, name, Xs_fg, ys_fg)
    manager.estimate()
    feature_managers[name] = manager

# The logistic-regression run is reused below to read its coefficients.
model_lr_fg = feature_managers["Logistic Regression feature engineered"]

=== Logistic Regression feature engineered ===
ROC AUC : 0.6962   | Gini: 0.3925
PR AUC  : 0.3337
F1      : 0.1784
Precision: 0.6267 | Recall: 0.1040
S21 ROC AUC: 0.6962
S21 GINI: 0.3925
=== GaussianNB feature engineered ===
ROC AUC : 0.7135   | Gini: 0.4270
PR AUC  : 0.3171
F1      : 0.0411
Precision: 0.4789 | Recall: 0.0215
S21 ROC AUC: 0.7135
S21 GINI: 0.4270
=== KNN feature engineered ===
ROC AUC : 0.5787   | Gini: 0.1575
PR AUC  : 0.1743
F1      : 0.0954
Precision: 0.3909 | Recall: 0.0544
S21 ROC AUC: 0.5787
S21 GINI: 0.1575


Gini упал у всех трёх моделей, как и большинство остальных метрик (исключение — F1 и Recall у KNN, которые немного выросли); возможно, нужно использовать другие комбинации для генерации нелинейных признаков.

In [14]:
# Pull the fitted steps out of the feature-engineered logistic-regression pipeline.
lr_steps = model_lr_fg.pipe.named_steps

# Flatten the coefficient array and pair each weight with the name the
# preprocessing step reports for that output column.
coef = lr_steps['clf'].coef_.ravel()
feature_names = lr_steps['preprocess'].get_feature_names_out()

# Ascending by weight: most negative coefficients come first.
coef_table = pd.DataFrame({'feature': feature_names, 'weight': coef})
coef_table = coef_table.sort_values('weight')
display(coef_table)

Unnamed: 0,feature,weight
32,cat__WheelType,-6.059237
27,cat__Model,-2.350728
35,cat__TopThreeAmericanName,-1.589989
37,cat__AUCGUART,-0.756284
36,cat__PRIMEUNIT,-0.756284
7,num__MMRAcquisitionRetailAveragePrice,-0.541763
8,num__MMRAcquisitonRetailCleanPrice,-0.49785
1,num__VehYear,-0.463385
3,num__WheelTypeID,-0.42579
10,num__MMRCurrentAuctionCleanPrice,-0.310771


In [15]:
# Features whose absolute logistic-regression weight is below 0.05 are dropped.
# The part before the first "__" in each transformed name (e.g. "num__",
# "cat__") is stripped to recover the column name used in the raw frames.
columns_to_drop = [
    transformed_name.split("__", 1)[1]
    for transformed_name, weight in zip(coef_table["feature"], coef_table["weight"])
    if abs(weight) < 0.05
]

# Remove the same columns from every feature-engineered split.
Xm_train, Xm_val, Xm_test = (
    frame.drop(columns=columns_to_drop)
    for frame in (X_train_fg, X_val_fg, X_test_fg)
)

# Train/validation pair for the manually pruned model; labels are unchanged.
Xms = [Xm_train, Xm_val]
yms = ys

In [16]:
# Logistic regression refitted on the manually pruned feature set.
manual_clf = LogisticRegression(max_iter=5000)
lr_manual = PipelineManager(manual_clf, "Logistic Regression (manual)", Xms, yms)

# Prints the metric report for this run.
lr_manual.estimate()

=== Logistic Regression (manual) ===
ROC AUC : 0.6993   | Gini: 0.3987
PR AUC  : 0.3334
F1      : 0.1878
Precision: 0.6115 | Recall: 0.1109
S21 ROC AUC: 0.6993
S21 GINI: 0.3987


In [17]:
# Logistic regression on the full feature-engineered set; feature selection is
# left to PipelineManager.l1_search() (project-local — presumably an
# L1-regularised fit, see utils.pipeline_manager).
l1_clf = LogisticRegression(max_iter=5000)
lr_l1 = PipelineManager(l1_clf, "Logistic Regression (L1 search)", Xs_fg, ys_fg)

# Prints the number of non-zero weights followed by the metric report.
lr_l1.l1_search()

nonzero weights = 38

=== Logistic Regression (L1 search) ===
ROC AUC : 0.7323   | Gini: 0.4646
PR AUC  : 0.3755
F1      : 0.3050
Precision: 0.6583 | Recall: 0.1985


Gini лучше при использовании L1-регуляризации: 0.4646 против 0.3925 у логистической регрессии без отбора признаков и 0.3987 после ручного отбора.

In [18]:
# The hyper-parameter search is expensive, so its result is cached on disk
# together with the pipeline and only recomputed when the cache is missing.
# Without this guard the notebook cannot run on a fresh checkout.
TUNE_CACHE = Path("saved/tune_result.pkl")

if TUNE_CACHE.exists():
    # NOTE(security): joblib.load unpickles arbitrary objects — only load a
    # cache file that this notebook itself produced.
    tune_result, best_model = joblib.load(TUNE_CACHE)
else:
    # Same calls the cache was originally produced with: run the search, then
    # store the results next to the pipeline object.
    # NOTE(review): assumes param_tune leaves lr_l1.pipe as the tuned model — confirm.
    tune_result = lr_l1.param_tune(lr_l1.pipe, (Xs_fg, ys_fg))
    best_model = lr_l1.pipe
    TUNE_CACHE.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump((tune_result, best_model), TUNE_CACHE)

print(tune_result['results'])

   clf__penalty clf__solver  clf__C  clf__max_iter clf__class_weight  \
3            l1        saga    0.25           5000          balanced   
1            l1        saga    0.25           3000          balanced   
79   elasticnet        saga    0.25           5000          balanced   
77   elasticnet        saga    0.25           3000          balanced   
75   elasticnet        saga    0.25           5000          balanced   
..          ...         ...     ...            ...               ...   
50           l2       lbfgs    5.00           3000              None   
52           l2       lbfgs    5.00           5000              None   
58           l2       lbfgs   10.00           5000              None   
54           l2       lbfgs   10.00           1000              None   
56           l2       lbfgs   10.00           3000              None   

        gini  clf__l1_ratio  
3   0.470478            NaN  
1   0.470475            NaN  
79  0.470367            0.9  
77  0.470366   

Наибольшее влияние оказывают class_weight (balanced) и C (чем меньше, тем лучше); наилучшие результаты показала L1-регуляризация.

In [19]:
# Score the tuned model on every split. The splits are referenced directly
# rather than appended to Xs_fg / ys_fg: appending made this cell
# non-idempotent (each re-run added another copy) and changed the label list
# shared with earlier cells.
eval_splits = [
    ("Train", X_train_fg, y_train),
    ("Valid", X_val_fg, y_val),
    ("Test ", X_test_fg, y_test),  # trailing space keeps the output columns aligned
]

for split_name, X_split, y_split in eval_splits:
    print(f'{split_name} Gini: {PipelineManager.gini_eval(best_model, X_split, y_split)}')

Train Gini: 0.5010034539052626
Valid Gini: 0.47047754014308163
Test  Gini: 0.5266277578937562


Gini на тесте (0.527) оказался выше, чем на val (0.470) и train (0.501): качество на отложенных выборках не проседает относительно train, что говорит об отсутствии явного переобучения.

In [20]:
# Project-local metric helper used to cross-check the s21 metrics against sklearn.
calc = Calculation()

# Column 1 of predict_proba is taken as the score of the positive class
# (assumes the model's classes are ordered [0, 1] — confirm).
test_proba = best_model.predict_proba(X_test_fg)
best_proba = test_proba[:, 1]

report = calc.compare_to_original(y_test, best_proba)

In [21]:
# Per-metric difference between the s21 implementation and sklearn;
# all zeros means the two implementations agree on these test predictions.
s21_metrics = report["s21"]
sklearn_metrics = report["sklearn"]

diff = {}
for metric in s21_metrics:
    diff[metric] = s21_metrics[metric] - sklearn_metrics[metric]
print(diff)

{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'auc_pr': 0.0}


При проверке автомобилей на класс 'lemon' важнее минимизировать FN, потому что пропущенный lemon ведёт к жалобам, возвратам и убыткам. FP-случаи стоят дешевле — такие автомобили можно отправить на повторную диагностику. Поэтому ключевая метрика — Recall: она показывает, какую долю реальных 'lemon' модель способна обнаружить (при условии, что lemon кодируется как 1, а не-lemon — как 0).