# Supervised Learning. Classification
## Imports 


In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import  StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from category_encoders import CountEncoder
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score , confusion_matrix, f1_score, average_precision_score
from warnings import filterwarnings
filterwarnings('ignore')

## Preprocessing

In [145]:
df_raw = pd.read_csv('data/training.csv',index_col='RefId')
df_raw['PurchDate']= pd.to_datetime(df_raw['PurchDate'])



In [370]:
df_raw

Unnamed: 0_level_0,IsBadBuy,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,Color,...,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
RefId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,2009-12-07,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,RED,...,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
2,0,2009-12-07,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,WHITE,...,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
3,0,2009-12-07,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,MAROON,...,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
4,0,2009-12-07,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,SILVER,...,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
5,0,2009-12-07,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,SILVER,...,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73010,1,2009-12-02,ADESA,2001,8,MERCURY,SABLE,GS,4D SEDAN GS,BLACK,...,4836.0,5937.0,,,18111,30212,GA,4200.0,0,993
73011,0,2009-12-02,ADESA,2007,2,CHEVROLET,MALIBU 4C,LS,4D SEDAN LS,SILVER,...,10151.0,11652.0,,,18881,30212,GA,6200.0,0,1038
73012,0,2009-12-02,ADESA,2005,4,JEEP,GRAND CHEROKEE 2WD V,Lar,4D WAGON LAREDO,SILVER,...,11831.0,14402.0,,,18111,30212,GA,8200.0,0,1893
73013,0,2009-12-02,ADESA,2006,3,CHEVROLET,IMPALA,LS,4D SEDAN LS,WHITE,...,10099.0,11228.0,,,18881,30212,GA,7000.0,0,1974


### Spliting dataset into train, validation and test parts

In [146]:
val_cut = np.quantile(df_raw['PurchDate'],1/3)
test_cut = np.quantile(df_raw['PurchDate'],2/3)

In [147]:

def train_val_test_date_split(
    df: pd.DataFrame,
    date_col: str,
    validation_date: str,
    test_date: str,
    drop_na_dates: bool = True,
    sort_by_time: bool = True,
):
    if date_col not in df.columns:
        raise ValueError(f"Column '{date_col}' not found in X")

    dates = pd.to_datetime(df[date_col], errors="coerce", utc=True)
    cut_val  = pd.to_datetime(validation_date, utc=True)
    cut_test = pd.to_datetime(test_date,      utc=True)

    if not (cut_val < cut_test):
        raise ValueError("validation_date must be strictly earlier than test_date")


    if drop_na_dates:
        keep = dates.notna()
        df,  dates = df.loc[keep], dates.loc[keep]

    if sort_by_time:
        order = dates.sort_values().index
        df, dates = df.loc[order], dates.loc[order]


    m_train = dates <  cut_val
    m_val   = (dates >= cut_val) & (dates <  cut_test)
    m_test  = dates >= cut_test

    return df.loc[m_train], df.loc[m_val], df.loc[m_test]


In [148]:
df_train, df_val, df_test = train_val_test_date_split(df_raw,'PurchDate',val_cut,test_cut)

In [149]:
print(df_train['PurchDate'].max()<df_val['PurchDate'].max()<df_test['PurchDate'].max())
print(len(df_train)/len(df_raw),len(df_val)/len(df_raw),len(df_test)/len(df_raw))



True
0.33202252579367797 0.3345162572105833 0.3334612169957387


### Encoding categorical variables

In [150]:
categorical_cols = df_train.select_dtypes(include=[object,'category']).columns
num_cols = df_train.drop(columns='IsOnlineSale').select_dtypes(include=[np.number]).columns




In [151]:
low = []
high = []
for col in categorical_cols:
    if len(df_train[col].unique())>20 :
        high.append(col)
    else:
        low.append(col)

    

In [152]:
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), num_cols.drop('IsBadBuy')),
        ("num_nan", MissingIndicator(features="missing-only", error_on_new=False), num_cols.drop('IsBadBuy')),
        ("ohe", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("enc", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]), low),
        ("ohe_nan", MissingIndicator(features="missing-only",error_on_new=False), low),

        ("cnt", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("enc", CountEncoder(normalize=True)),
        ]), high),
        ("cnt_nan", MissingIndicator(features="missing-only",error_on_new=False), high),
    ],
    remainder='drop', 
    verbose_feature_names_out=True

)


In [153]:
encoded_train = preprocess.fit_transform(df_train)
encoded_val = preprocess.transform(df_val)
encoded_test = preprocess.transform(df_test)
columns = preprocess.get_feature_names_out()

In [154]:
X_train = pd.DataFrame(encoded_train,columns=columns)
X_val = pd.DataFrame(encoded_val,columns=columns)
X_test = pd.DataFrame(encoded_test,columns=columns)


In [258]:
y_train = df_train['IsBadBuy']
y_val = df_val['IsBadBuy']
y_test = df_test['IsBadBuy']


### Train LogisticRegression, GaussianNB, KNN from sklearn on the training dataset and check the quality of your algorithms on the validation dataset.

In [None]:
log_reg = LogisticRegression(random_state=21,max_iter=1000)
log_reg.fit(X_train, y_train)
y_logreg = log_reg.predict(X_val)
proba_logreg = log_reg.predict_proba(X_val)[:,1]
print(f"Accuracy = {accuracy_score(y_val,y_logreg)}")
print(f"Precision = {precision_score(y_val,y_logreg)}")
print(f"Recall = {recall_score(y_val,y_logreg)}")
print(f"Gini score = {roc_auc_score(y_val,proba_logreg)*2-1}")
print(f"Confusion matrix\n {confusion_matrix(y_val,y_logreg)}")


Accuracy = 0.8776931268944048
Precision = 0.6256684491978609
Recall = 0.14744801512287334
Gini score = 0.40403481322468204
Confusion matrix
 [[20960   280]
 [ 2706   468]]


In [157]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)          
y_gnb = gnb.predict(X_val)       
proba_gnb  = gnb.predict_proba(X_val)[:,1]
print(f"Accuracy = {accuracy_score(y_val,y_gnb)}")
print(f"Precision = {precision_score(y_val,y_gnb)}")
print(f"Recall = {recall_score(y_val,y_gnb)}")
print(f"Gini score = {roc_auc_score(y_val,proba_gnb)*2-1}")
print(f"Confusion matrix\n {confusion_matrix(y_val,y_gnb)}")

Accuracy = 0.38277217989678053
Precision = 0.15965665236051502
Recall = 0.8790170132325141
Gini score = 0.42460774750592445
Confusion matrix
 [[ 6555 14685]
 [  384  2790]]


In [371]:

knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)

y_knn = knn.predict(X_val)        
proba_knn  = knn.predict_proba(X_val) [:,1]
print(f"Accuracy = {accuracy_score(y_val,y_knn)}")
print(f"Precision = {precision_score(y_val,y_knn)}")
print(f"Recall = {recall_score(y_val,y_knn)}")
print(f"Gini score = {roc_auc_score(y_val,proba_knn)*2-1}")
print(f"Confusion matrix\n {confusion_matrix(y_val,y_knn)}")


Accuracy = 0.8749078397640698
Precision = 0.6492537313432836
Recall = 0.08223062381852551
Gini score = 0.25650298980535124
Confusion matrix
 [[21099   141]
 [ 2913   261]]


### Лучшей моделью является Gaussian Naive Bayes, потому что она показала максимальный Gini-скор (≈ 0.425). Эта модель лучше ранжирует объекты по вероятности плохой покупки и обладает очень высоким Recall (≈ 0.88), то есть практически не пропускает «плохие покупки».

### Implement Gini score calculation. You can use the 2*ROC AUC - 1 approach, so you need to implement the ROC AUC calculation. Check if your metric is approximately equal to 2 * sklearn.metrics.roc_auc_score - 1.

In [159]:

def roc_auc_score21(y_true, y_proba):
    desc_order = np.argsort(-y_proba)
    y_true = np.array(y_true)[desc_order]
    
    P = y_true.sum()
    N = len(y_true) - P


    tpr = np.cumsum(y_true) / P
    fpr = np.cumsum(1 - y_true) / N

  
    tpr = np.insert(tpr, 0, 0)
    fpr = np.insert(fpr, 0, 0)

    auc = np.trapezoid(tpr, fpr)
    return auc


In [160]:
def gini_score(y_true, proba):
    return 2*roc_auc_score21(y_true, proba)-1
gini_score(y_val,proba_logreg)

np.float64(0.40403481322468204)

In [340]:
np.isclose(gini_score(y_val, proba_logreg),
           2*roc_auc_score(y_val, proba_logreg)-1, rtol=1e-7, atol=1e-9)

np.True_

In [215]:
def report(name, y_true, y_pred, score_like_proba):
    gini = gini_score(y_true, score_like_proba) 
    print(f"[{name}]  Gini={gini:.4f}  Acc={accuracy_score(y_true,y_pred):.4f}  "
          f"Prec={precision_score(y_true,y_pred):.4f}  Rec={recall_score(y_true,y_pred):.4f}")
    print("Confusion:\n", confusion_matrix(y_true, y_pred))

### Implement your own versions of LogisticRegression, KNN and NaiveBayes classifiers. For LogisticRegression compute gradients with respect to the loss and use stochastic gradient descent.

In [None]:

class LogisticRegressionSGD:
    def __init__(
        self,
        n_epochs: int = 1000,
        learning_rate: float = 0.001,
        random_state: int = 21,
        tol: float = 0.001,
        verbose: bool = False,
    ):
        self.epochs = n_epochs
        self.lr = learning_rate
        self.random_state = random_state
        self.tol = tol
        self.verbose = verbose

    @staticmethod
    def _sigmoid(z):
        #σ(z) = 1 / (1 + e^(−z)) || σ(z) = e^(z) / (1 + e^(z))
        return np.where(
            z >= 0,
            1.0 / (1.0 + np.exp(-z)),
            np.exp(z) / (1.0 + np.exp(z)),
        )

    @staticmethod
    def _logloss(y, p, eps: float = 1e-15):
        p = np.clip(p, eps, 1.0 - eps)
        return -(y * np.log(p) + (1.0 - y) * np.log(1.0 - p)).mean()

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        uy = set(np.unique(y))
        if not (uy <= {0.0, 1.0} and len(uy) == 2):
            raise ValueError(f"target must be binary {{0,1}}, got {len(uy)} classes")

        self.b = 0.0
        self.w = np.zeros(X.shape[1], dtype=float)

        rng = np.random.RandomState(self.random_state)
        best_loss = np.inf
        no_improve = 0
        for epoch in range(self.epochs):
            idx = rng.permutation(X.shape[0])
            for i in idx:
                z = X[i] @ self.w + self.b
                p = self._sigmoid(z)
                error = p - y[i]

                grad_w = error * X[i]
                grad_b = error

                self.w -= self.lr * grad_w
                self.b -= self.lr * grad_b

            p_all = self._sigmoid(X @ self.w + self.b)
            loss = self._logloss(y, p_all)

            if best_loss - loss > self.tol:
                best_loss = loss
                no_improve = 0
            else:
                no_improve += 1
                if no_improve >= 5:
                    if self.verbose:
                        print(f"Early stop on epoch {epoch + 1}")
                    break

        self.coef_ = self.w[None, :]
        self.intercept_ = np.array([self.b])
        return self

    def predict_proba(self, X):
        X = np.asarray(X)
        z = X @ self.w + self.b
        p = self._sigmoid(z)
        return np.c_[1.0 - p, p]

    def predict(self, X, threshold: float = 0.5):
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)


In [162]:

class GaussianNB21:
    def fit(self,X,y):
        X, y = np.asarray(X), np.asarray(y)
        self.classes_ = np.unique(y)
        n_classes,n_features = len(self.classes_), X.shape[1]

        self.means_ = np.zeros((n_classes,n_features))
        self.variances_ = np.zeros((n_classes,n_features))
        self.priors_ = np.zeros(n_classes)

        for idx, k in enumerate(self.classes_):
            Xk = X[y==k]

            # μ_{jk} = среднее признака j для класса k
            self.means_[idx] = Xk.mean(axis=0)

            # σ²_{jk} = дисперсия признака j для класса k
            self.variances_[idx] = Xk.var(axis=0)

            # P(y=k) = доля объектов класса k
            self.priors_[idx] = Xk.shape[0]/X.shape[0]
        return self

    def _log_gaussian(self,X,eps = 1e-9):
        X = np.asarray(X)
        var = self.variances_ + eps

        # diff = x_j - μ_{jk}
        diff = X[:, None, :] - self.means_[None,:,:]            

        # log P(x_j|y=k) = -0.5 * [ log(2πσ²_{jk}) + (x_j-μ_{jk})² / σ²_{jk} ]
        log_prob = -0.5 * (np.log(2.0 * np.pi * var) + (diff**2) / var)

        # ∑_j log P(x_j|y=k)
        return log_prob.sum(axis=2) 
    
    def predict_proba(self, X):
        X = np.asarray(X, dtype=float)

        # log P(x|y=k)
        log_lik  = self._log_gaussian(X)                

        # log P(y=k|x) ∝ log P(y=k) + log P(x|y=k)
        log_post = log_lik + np.log(self.priors_)       

        m = log_post.max(axis=1, keepdims=True)
        P = np.exp(log_post - m)
        return P / P.sum(axis=1, keepdims=True)

    def predict(self, X):
        return self.predict_proba(X).argmax(axis=1)


In [163]:

class KNeighborsClassifier21:
    def __init__(self, n_neighbors: int = 5):
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be >= 1")
        self.k = int(n_neighbors)

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of samples")

        self.X_train_ = X
        self.n_samples_ = X.shape[0]

        self.classes_, self.y_train_idx_ = np.unique(y, return_inverse=True)
        self.n_classes_ = self.classes_.shape[0]

        if self.k > self.n_samples_:
            raise ValueError("n_neighbors cannot exceed number of training samples")
        return self

    @staticmethod
    def _euclidean_distances(A, B):

        A2 = np.einsum('ij,ij->i', A, A)[:, None]    
        B2 = np.einsum('ij,ij->i', B, B)[None, :]    
        d2 = A2 - 2.0 * (A @ B.T) + B2               
        np.maximum(d2, 0.0, out=d2)
        return np.sqrt(d2, out=d2)                   

    def _kneighbors(self, X):
        X = np.asarray(X, dtype=float)
        D = self._euclidean_distances(X, self.X_train_)        
        nn_idx = np.argpartition(D, self.k - 1, axis=1)[:, :self.k]  

        return nn_idx

    def predict(self, X):
        nn_idx = self._kneighbors(X)            
        votes_idx = self.y_train_idx_[nn_idx]  
        m = votes_idx.shape[0]
        counts = np.zeros((m, self.n_classes_), dtype=int)    
        rows = np.arange(m)[:, None]
        np.add.at(counts, (rows, votes_idx), 1)

        pred_idx = counts.argmax(axis=1)                     
        return self.classes_[pred_idx]                       

    def predict_proba(self, X):
        nn_idx = self._kneighbors(X)           
        votes_idx = self.y_train_idx_[nn_idx]                  

        m = votes_idx.shape[0]
        counts = np.zeros((m, self.n_classes_), dtype=float)  
        rows = np.arange(m)[:, None]
        np.add.at(counts, (rows, votes_idx), 1.0)

        counts /= counts.sum(axis=1, keepdims=True)    
        return counts                                         


In [216]:
log_reg21 = LogisticRegressionSGD()
log_reg21.fit(X_train, y_train)
y_logreg21 = log_reg21.predict(X_val)
proba_logreg21 = log_reg21.predict_proba(X_val)[:,1]
report('custom logreg',y_val,y_logreg21,proba_logreg21)
sgd = SGDClassifier(loss='log_loss',penalty=None,learning_rate='constant',random_state=21,eta0=0.001)
sgd.fit(X_train,y_train)
y_sgd = sgd.predict(X_val)
proba_sgd = sgd.predict_proba(X_val)[:,1]
report('sklearn logregsgd',y_val,y_sgd,proba_sgd)


[custom logreg]  Gini=0.4772  Acc=0.8808  Prec=0.6126  Rec=0.2262
Confusion:
 [[20786   454]
 [ 2456   718]]
[sklearn logregsgd]  Gini=0.4783  Acc=0.8812  Prec=0.6180  Rec=0.2253
Confusion:
 [[20798   442]
 [ 2459   715]]


In [311]:
gnb21 = GaussianNB21()
gnb21.fit(X_train, y_train)          
y_gnb21 = gnb21.predict(X_val)       
proba_gnb21  = gnb21.predict_proba(X_val)[:,1]
report('custom GaussianNB',y_val,y_gnb21,proba_gnb21)
report('sklearn GaussianNB',y_val,y_gnb,proba_gnb)


[custom GaussianNB]  Gini=0.4246  Acc=0.3828  Prec=0.1597  Rec=0.8790
Confusion:
 [[ 6555 14685]
 [  384  2790]]
[sklearn GaussianNB]  Gini=0.4310  Acc=0.3405  Prec=0.1532  Rec=0.8998
Confusion:
 [[ 5458 15782]
 [  318  2856]]


In [312]:
knn21 = KNeighborsClassifier21(n_neighbors=4)
knn21.fit(X_train, y_train)

y_knn21 = knn21.predict(X_val)        
proba_knn21  = knn21.predict_proba(X_val) [:,1]
report('custom KNN',y_val,y_knn21,proba_knn21)
report('sklearn KNN',y_val,y_knn,proba_knn)


[custom KNN]  Gini=0.2565  Acc=0.8749  Prec=0.6493  Rec=0.0822
Confusion:
 [[21099   141]
 [ 2913   261]]
[sklearn KNN]  Gini=0.2412  Acc=0.8728  Prec=0.5798  Rec=0.0778
Confusion:
 [[21061   179]
 [ 2927   247]]


### Add new non-linear features to your pipeline, repeat step 4.

In [359]:
df_train, df_val, df_test = train_val_test_date_split(df_raw,'PurchDate',val_cut,test_cut)
make_avg_cost = df_train.groupby("Make")["VehBCost"].mean()
model_avg_odo = df_train.groupby("Model")["VehOdo"].mean()
make_avg_odo  = df_train.groupby("Make")["VehOdo"].mean()
for df in (df_train, df_val, df_test):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df["Make_avg_cost"] = df["Make"].map(make_avg_cost)
    df["Make_avg_odo"]  = df["Make"].map(make_avg_odo)
    df["Model_avg_odo"] = df["Model"].map(model_avg_odo)
    df["Rel_odo_model"] = df["VehOdo"] / df["Model_avg_odo"]
    df["Auction_to_retail"] = df["MMRCurrentAuctionAveragePrice"] / (
        df["MMRCurrentRetailAveragePrice"].replace(0, np.nan)
    )
    df["log_cost"] = np.log1p(df["VehBCost"])
    df["log_odo"]  = np.log1p(df["VehOdo"])





TARGET = "IsBadBuy"
categorical_cols = df_train.drop(columns=TARGET).select_dtypes(include=[object, "category"]).columns
num_cols = df_train.drop(columns=TARGET).select_dtypes(include=[np.number]).columns

low, high = [], []
for col in categorical_cols:
    (high if df_train[col].nunique() > 20 else low).append(col)

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), num_cols),
        ("num_nan", MissingIndicator(features="missing-only", error_on_new=False), num_cols),

        ("ohe", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("enc", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]), low),
        ("ohe_nan", MissingIndicator(features="missing-only", error_on_new=False), low),

        ("cnt", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("enc", CountEncoder(normalize=True)),
        ]), high),
        ("cnt_nan", MissingIndicator(features="missing-only", error_on_new=False), high),
    ],
    remainder="drop",
    verbose_feature_names_out=True,
)
y_train = df_train[TARGET].copy()
y_val   = df_val[TARGET].copy()
y_test  = df_test[TARGET].copy()

X_train_nf = preprocess.fit_transform(df_train)
X_val_nf   = preprocess.transform(df_val)
X_test_nf  = preprocess.transform(df_test)
cols = preprocess.get_feature_names_out()
X_train_nf = pd.DataFrame(X_train_nf,columns=cols)
X_val_nf = pd.DataFrame(X_val_nf,columns=cols)
X_test_nf = pd.DataFrame(X_test_nf,columns=cols)




In [360]:
log_reg_nf = LogisticRegression(random_state=21,max_iter=1000)
log_reg_nf.fit(X_train_nf, y_train)
y_logreg_nf = log_reg_nf.predict(X_val_nf)
proba_logreg_nf = log_reg_nf.predict_proba(X_val_nf)[:,1]
report('New features logreg',y_val,y_logreg_nf,proba_logreg_nf)


gnb_nf = GaussianNB()
gnb_nf.fit(X_train_nf, y_train)          
y_gnb_nf = gnb_nf.predict(X_val_nf)       
proba_gnb_nf  = gnb_nf.predict_proba(X_val_nf)[:,1]
report('New features GaussianNB',y_val,y_gnb_nf,proba_gnb_nf)

knn_nf = KNeighborsClassifier(n_neighbors=4)
knn_nf.fit(X_train_nf, y_train)
y_knn_nf = knn_nf.predict(X_val_nf)        
proba_knn_nf  = knn_nf.predict_proba(X_val_nf) [:,1]
report('New features KNN',y_val,y_knn_nf,proba_knn_nf)


[New features logreg]  Gini=0.4449  Acc=0.8782  Prec=0.5897  Rec=0.2083
Confusion:
 [[20780   460]
 [ 2513   661]]
[New features GaussianNB]  Gini=0.4268  Acc=0.4073  Prec=0.1630  Rec=0.8604
Confusion:
 [[ 7213 14027]
 [  443  2731]]
[New features KNN]  Gini=0.1385  Acc=0.8710  Prec=0.5833  Rec=0.0265
Confusion:
 [[21180    60]
 [ 3090    84]]


### Determine the best features for the problem using the coefficients of the logistic model. Try to eliminate useless features by hand and by L1 regularization. Which approach is better in terms of Gini score?

In [361]:
mask_h = (np.abs(log_reg_nf.coef_[0]) > 0.2)
mask_h = pd.Series(mask_h, index=X_train_nf.columns)


In [362]:

X_train_nf_filtered = X_train_nf.loc[:, mask_h]
X_val_nf_filtered   = X_val_nf.loc[:, mask_h]
X_test_nf_filtered  = X_test_nf.loc[:, mask_h]



In [363]:
log_reg_h = LogisticRegression(random_state=21,max_iter=1000)
log_reg_h.fit(X_train_nf_filtered, y_train)
y_logreg_h = log_reg_h.predict(X_val_nf_filtered)
proba_logreg_h = log_reg_h.predict_proba(X_val_nf_filtered)[:,1]
report('New features filtered by hand Logreg',y_val,y_logreg_h,proba_logreg_h)


gnb_h = GaussianNB()
gnb_h.fit(X_train_nf_filtered, y_train)          
y_gnb_h = gnb_h.predict(X_val_nf_filtered)       
proba_gnb_h  = gnb_h.predict_proba(X_val_nf_filtered)[:,1]
report('New features filtered by hand GaussianNB',y_val,y_gnb_h,proba_gnb_h)

knn_h = KNeighborsClassifier(n_neighbors=4)
knn_h.fit(X_train_nf_filtered, y_train)
y_knn_h = knn_h.predict(X_val_nf_filtered)        
proba_knn_h  = knn_h.predict_proba(X_val_nf_filtered) [:,1]
report('New features filtered by hand KNN',y_val,y_knn_h,proba_knn_h)



[New features filtered by hand Logreg]  Gini=0.4406  Acc=0.8789  Prec=0.6025  Rec=0.2010
Confusion:
 [[20819   421]
 [ 2536   638]]
[New features filtered by hand GaussianNB]  Gini=0.4103  Acc=0.8753  Prec=0.5412  Rec=0.2687
Confusion:
 [[20517   723]
 [ 2321   853]]
[New features filtered by hand KNN]  Gini=0.2918  Acc=0.8774  Prec=0.5996  Rec=0.1708
Confusion:
 [[20878   362]
 [ 2632   542]]


In [364]:
log_reg_l = LogisticRegression(
    random_state=21, max_iter=1000, penalty='l1', solver='liblinear', C=1.1081580084003715
).fit(X_train_nf, y_train)

mask_l = (log_reg_l.coef_[0] != 0)
mask_l = pd.Series(mask_l, index=X_train_nf.columns)

X_train_nf_filtered_l = X_train_nf.loc[:, mask_l]
X_val_nf_filtered_l   = X_val_nf.loc[:, mask_l]
X_test_nf_filtered_l  = X_test_nf.loc[:, mask_l]
top_features = X_train_nf_filtered_l.columns


In [365]:
log_reg_l = LogisticRegression(random_state=21,max_iter=1000)
log_reg_l.fit(X_train_nf_filtered_l, y_train)
y_logreg_l = log_reg_l.predict(X_val_nf_filtered_l)
proba_logreg_l = log_reg_l.predict_proba(X_val_nf_filtered_l)[:,1]
report('New features filtered by L1 Logreg',y_val,y_logreg_l,proba_logreg_l)


gnb_l = GaussianNB()
gnb_l.fit(X_train_nf_filtered_l, y_train)          
y_gnb_l = gnb_l.predict(X_val_nf_filtered_l)       
proba_gnb_l  = gnb_l.predict_proba(X_val_nf_filtered_l)[:,1]
report('New features filtered by L1 GaussianNB',y_val,y_gnb_l,proba_gnb_l)

knn_l = KNeighborsClassifier(n_neighbors=4)
knn_l.fit(X_train_nf_filtered_l, y_train)

y_knn_l = knn_l.predict(X_val_nf_filtered_l)        
proba_knn_l  = knn_l.predict_proba(X_val_nf_filtered_l) [:,1]
report('New features filtered by L1 KNN',y_val,y_knn_l,proba_knn_l)


[New features filtered by L1 Logreg]  Gini=0.4437  Acc=0.8799  Prec=0.6191  Rec=0.1982
Confusion:
 [[20853   387]
 [ 2545   629]]
[New features filtered by L1 GaussianNB]  Gini=0.4057  Acc=0.8741  Prec=0.5309  Rec=0.2703
Confusion:
 [[20482   758]
 [ 2316   858]]
[New features filtered by L1 KNN]  Gini=0.1405  Acc=0.8701  Prec=0.5057  Rec=0.0280
Confusion:
 [[21153    87]
 [ 3085    89]]


По валидации Gini(L1-маска) 0.4437 > Gini(ручной порог) 0.4406, следовательно L1-регуляризация отбирает признаки лучше.

### sklearn SVC with a nonlinear kernel

In [None]:
def objective(trial):
    C     = trial.suggest_float('C', 1e-3, 1e+3, log=True)
    gamma = trial.suggest_float('gamma', 1e-6, 1e-1, log=True)

    clf = SVC(kernel='rbf', C=C, gamma=gamma, class_weight='balanced', random_state=21)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)
    fold_scores = []
    for tr, va in cv.split(X_train_nf_filtered_l, y_train):
        Xtr = X_train_nf_filtered_l.iloc[tr] 
        Xva = X_train_nf_filtered_l.iloc[va] 
        ytr = y_train.iloc[tr] 
        yva = y_train.iloc[va] 
        clf.fit(Xtr, ytr)
        scores = clf.decision_function(Xva)        
        fold_scores.append(gini_score(yva, scores))

        trial.report(np.mean(fold_scores), step=len(fold_scores))
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

study = optuna.create_study(direction='maximize',
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=8))
study.optimize(objective, n_trials=50, show_progress_bar=True)

[I 2025-09-03 00:29:54,954] A new study created in memory with name: no-name-db849101-5223-4f92-b1fc-a6090cf4cd72


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-09-03 00:30:43,805] Trial 0 finished with value: 0.41419558280069235 and parameters: {'C': 0.3715241887543975, 'gamma': 0.00019118303293722902}. Best is trial 0 with value: 0.41419558280069235.
[I 2025-09-03 00:31:26,463] Trial 1 finished with value: 0.4945152342079185 and parameters: {'C': 212.63687212243548, 'gamma': 5.2512093706748105e-05}. Best is trial 1 with value: 0.4945152342079185.
[I 2025-09-03 00:32:07,439] Trial 2 finished with value: 0.495098150535736 and parameters: {'C': 0.9483355903583, 'gamma': 0.00300136331091651}. Best is trial 2 with value: 0.495098150535736.
[I 2025-09-03 00:32:53,216] Trial 3 finished with value: 0.4645456172390011 and parameters: {'C': 0.24273884072901472, 'gamma': 0.0010294003635799118}. Best is trial 2 with value: 0.495098150535736.
[I 2025-09-03 00:33:41,173] Trial 4 finished with value: 0.4356460315237557 and parameters: {'C': 90.08445733587166, 'gamma': 1.3511127715427829e-06}. Best is trial 2 with value: 0.495098150535736.
[I 2025-0

In [314]:
C, gamma = study.best_params.values()
print(C, gamma)

2.8476896339852433 0.004964244836259638


In [None]:
svc_rbf = SVC(kernel='rbf', C=C,gamma=gamma, class_weight='balanced', decision_function_shape='ovo' ,random_state=21)
svc_rbf.fit(X_train_nf_filtered_l,y_train)


0,1,2
,C,2.8476896339852433
,kernel,'rbf'
,degree,3
,gamma,0.004964244836259638
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [None]:
y_rbf = svc_rbf.predict(X_val_nf_filtered_l)       
proba_rbf  = svc_rbf.decision_function(X_val_nf_filtered_l)
report('New features filtered by L1 SVC RBF',y_val,y_rbf,proba_rbf)


[New features filtred by L1 SVC RBF]  Gini=0.4407  Acc=0.8206  Prec=0.3444  Rec=0.4206
Confusion:
 [[18699  2541]
 [ 1839  1335]]


### Select your best model (algorithm + feature set) and tweak its hyperparameters to increase the Gini score on the validation dataset. Which hyperparameters have the most impact?

In [None]:
def objective(trial):
    C = trial.suggest_float('C', 1e-5,1e+3, log=True)
    l1_ratio = trial.suggest_float('l1_ratio',0,1)
    
    model = LogisticRegression(
        penalty='elasticnet',
        max_iter=100,
        random_state = 21,
        C=C,
        l1_ratio=l1_ratio,
        solver='saga'
    )

    model.fit(X_train_nf_filtered_l,y_train)
    proba = model.predict_proba(X_val_nf_filtered_l)[:,1]
    score = gini_score(y_val,proba)

    return float(np.mean(score))

study = optuna.create_study(direction='maximize',
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=8))
study.optimize(objective, n_trials=300, show_progress_bar=True)
    

[I 2025-09-03 00:01:23,436] A new study created in memory with name: no-name-a3d80dc4-e23a-4b8e-8ac0-a8b7e44c7137


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-09-03 00:01:24,386] Trial 0 finished with value: 0.450008662662855 and parameters: {'C': 831.760861805232, 'l1_ratio': 0.6705874968299803}. Best is trial 0 with value: 0.450008662662855.
[I 2025-09-03 00:01:24,521] Trial 1 finished with value: 0.4688082133910527 and parameters: {'C': 0.004306822203602211, 'l1_ratio': 0.8767672045532895}. Best is trial 1 with value: 0.4688082133910527.
[I 2025-09-03 00:01:25,223] Trial 2 finished with value: 0.46286742150500126 and parameters: {'C': 0.01330411147590187, 'l1_ratio': 0.32738876590406085}. Best is trial 1 with value: 0.4688082133910527.
[I 2025-09-03 00:01:26,159] Trial 3 finished with value: 0.46796817242733724 and parameters: {'C': 0.047187188081750875, 'l1_ratio': 0.05483789600945066}. Best is trial 1 with value: 0.4688082133910527.
[I 2025-09-03 00:01:26,172] Trial 4 finished with value: 0.0 and parameters: {'C': 1.2564851211076643e-05, 'l1_ratio': 0.5380108008167874}. Best is trial 1 with value: 0.4688082133910527.
[I 2025-09-

In [306]:
study.best_params

{'C': 0.17462684862308017, 'l1_ratio': 0.9975814544794199}

In [None]:
C, l1_ratio = study.best_params.values()
best_model = LogisticRegression(
        penalty='elasticnet',
        max_iter=100,
        random_state = 21,
        C=C,
        l1_ratio=l1_ratio,
        solver='saga'
    )
best_model.fit(X_train_nf_filtered_l,y_train)
best_proba_val = best_model.predict_proba(X_val_nf_filtered_l)[:,1]
best_y_pred_val = best_model.predict(X_val_nf_filtered_l)
report('Logreg Elasticnet on validation sample',y_val,best_y_pred_val,best_proba_val)

[Logreg Elasticnet on validation sample]  Gini=0.4708  Acc=0.8795  Prec=0.5983  Rec=0.2215
Confusion:
 [[20768   472]
 [ 2471   703]]


### Оба параметра влияли на результат: C — контролировал силу регуляризации, а l1_ratio — тип регуляризации. В данной задаче наибольшую роль сыграл сильный L1-регуляризатор при малом C, который обнулил неинформативные признаки и повысил устойчивость модели.

### Check the Gini scores on all three datasets for your best model: training Gini, valid Gini, test Gini. Do you see a drop in performance when comparing the valid quality to the test quality? Is your model overfitted or not? Explain.

In [None]:
best_proba_train = best_model.predict_proba(X_train_nf_filtered_l)[:,1]
best_y_pred_train = best_model.predict(X_train_nf_filtered_l)
best_proba_test = best_model.predict_proba(X_test_nf_filtered_l)[:,1]
best_y_pred_test = best_model.predict(X_test_nf_filtered_l)
report('Logreg Elasticnet on train sample',y_train,best_y_pred_train,best_proba_train)
report('Logreg Elasticnet on validation sample',y_val,best_y_pred_val,best_proba_val)
report('Logreg Elasticnet on test sample',y_test,best_y_pred_test,best_proba_test)

[Logreg Elasticnet on train sample]  Gini=0.5137  Acc=0.9011  Prec=0.7213  Rec=0.2260
Confusion:
 [[21206   243]
 [ 2154   629]]
[Logreg Elasticnet on validation sample]  Gini=0.4708  Acc=0.8795  Prec=0.5983  Rec=0.2215
Confusion:
 [[20768   472]
 [ 2471   703]]
[Logreg Elasticnet on test sample]  Gini=0.4317  Acc=0.8949  Prec=0.7489  Rec=0.2302
Confusion:
 [[21085   233]
 [ 2324   695]]


### Анализ Gini на трёх выборках

Для лучшей модели (Logistic Regression с ElasticNet) получены следующие значения:

- **Gini на трейне:** 0.514  
- **Gini на валидации:** 0.471  
- **Gini на тесте:** 0.432  

Мы видим постепенное снижение качества от трейна к валидации и далее к тесту.  
Падение невелико, поэтому можно сказать, что модель **не сильно переобучена** и сохраняет способность обобщать на новых данных.


### Implement calculation of Recall, Precision, F1 score and AUC PR metrics.


In [352]:
def recall_score21(y_true,y_pred):
    tp = np.sum((y_true==1) & (y_pred==1))
    fn = np.sum((y_true==1) & (y_pred==0))
    return tp/(tp+fn) if (tp+fn) > 0 else 0.0
    

In [353]:
def precision_score21(y_true, y_pred):
    tp = np.sum((y_true==1) & (y_pred==1))
    fp = np.sum((y_true==0) & (y_pred==1))
    return tp/(tp+fp) if (tp+fp) > 0 else 0.0

In [354]:
def f1_score21(y_true,y_pred):
    p = precision_score21(y_true, y_pred)
    r = recall_score21(y_true, y_pred)
    return 2*p*r/(p+r) if (p+r) > 0 else 0.0

In [None]:
def auc_pr_score21(y_true, y_proba):
    y_true = np.asarray(y_true).astype(int)
    y_proba = np.asarray(y_proba, dtype=float)
    P = y_true.sum()
    if P == 0:
        return 0.0
    order = np.argsort(-y_proba)
    y = y_true[order]
    tp = np.cumsum(y)
    fp = np.cumsum(1 - y)
    recall = tp / P
    precision = tp / (tp + fp)
    return np.sum((recall[1:] - recall[:-1]) * precision[1:])


In [356]:
print(f"Custom recall {recall_score21(y_test,best_y_pred_test)}, sklearn recall {recall_score(y_test,best_y_pred_test)}")
print(f"Custom precision {precision_score21(y_test,best_y_pred_test)}, sklearn precision {precision_score(y_test,best_y_pred_test)}")
print(f"Custom f1_score {f1_score21(y_test,best_y_pred_test)}, sklearn f1_score {f1_score(y_test,best_y_pred_test)}")
print(f"Custom auc_pr_score {auc_pr_score21(y_test,best_proba_test)}, sklearn auc_pr_score {average_precision_score(y_test,best_proba_test)}")


Custom recall 0.2302086783703213, sklearn recall 0.2302086783703213
Custom precision 0.7489224137931034, sklearn precision 0.7489224137931034
Custom f1_score 0.35216620217887, sklearn f1_score 0.35216620217887
Custom auc_pr_score 0.369700869530375, sklearn auc_pr_score 0.36970086953037495


### Compare your algorithms on the test dataset using AUC PR metric.

In [None]:
pairs = {
    "LR ElasticNet (best)": best_model.predict_proba(X_test_nf_filtered_l)[:,1],
    "GNB (L1 features)": gnb_l.predict_proba(X_test_nf_filtered_l)[:,1],
    "KNN-4 (L1 features)": knn_l.predict_proba(X_test_nf_filtered_l)[:,1],
    "SVC RBF (L1 features)": svc_rbf.decision_function(X_test_nf_filtered_l),
}
for name, s in pairs.items():
    ap_my = auc_pr_score21(y_test, s)
    ap_sk = average_precision_score(y_test, s)
    print(f"{name}:{ap_my:.4f}")


LR ElasticNet (best):0.3697
GNB (L1 features):0.1814
KNN-4 (L1 features):0.1518
SVC RBF (L1 features):0.4005


### Which hard label metric do you prefer for the task of detecting "lemon" cars?
Recall так как нам важнее не пропустить плохую машину(FN) , чем назвать хорошую машину плохой (FP)