In [None]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.model_selection import cross_val_score
import sklearn.model_selection as skm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc

In [None]:
# Data: download OpenML dataset with id 31
data = fetch_openml(data_id=31)

# Build one frame holding the features together with the label column
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = pd.Series(data.target, name='target')

# One-hot encode every non-numeric column (including the label) and request
# integer 0/1 indicators directly, instead of converting booleans afterwards
# with an in-place replace over the whole frame.
df = pd.get_dummies(df, dtype=int)

# The label is the "good" indicator; both label indicators are removed from
# the feature matrix so the target does not leak into X.
y = df['target_good']
X = df.drop(columns=['target_good', 'target_bad'])

# Train / test split (30% held out) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=335723
)

# Shuffled 5-fold splitter reused by the grid searches below
kfold = skm.KFold(5, random_state=335723, shuffle=True)

In [None]:

# Logistic regression model without a penalty term
model_lr = LogisticRegression(penalty=None, max_iter=300)
model_lr.fit(X_train, y_train)

# Training-set metrics: hard predictions plus the positive-class probability
y_pred_train, y_proba_train = (
    model_lr.predict(X_train),
    model_lr.predict_proba(X_train)[:, 1],
)

accuracy, precision, recall, auc_value = (
    accuracy_score(y_train, y_pred_train),
    precision_score(y_train, y_pred_train),
    recall_score(y_train, y_pred_train),
    roc_auc_score(y_train, y_proba_train),
)

print(
    f'Dokładność trenigowa: {accuracy:.4f}',
    f'Precyzja treningowa: {precision:.4f}',
    f'Czułość trenigowa: {recall:.4f}',
    f'Wartość AUC treningowa: {auc_value:.4f}',
    sep='\n',
)

# Test-set metrics, computed the same way on the held-out data
y_pred, y_proba = (
    model_lr.predict(X_test),
    model_lr.predict_proba(X_test)[:, 1],
)

accuracy, precision, recall, auc_value = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba),
)

print(
    f'Dokładność: {accuracy:.4f}',
    f'Precyzja: {precision:.4f}',
    f'Czułość: {recall:.4f}',
    f'Wartość AUC: {auc_value:.4f}',
    sep='\n',
)


In [None]:
# Logistic regression with L1 regularisation, tuned by grid search
model_l1 = LogisticRegression(penalty='l1')
param_grid = dict(
    C=[100, 10, 5, 2, 1, 0.5, 0.1, 0.01, 0.005],
    solver=['liblinear', 'saga'],
    max_iter=[100, 300],
)

# Cross-validated search over the grid (folds from `kfold`, ranked by accuracy),
# fitted on the training data only
grid_search = GridSearchCV(
    estimator=model_l1,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kfold,
)
grid_search.fit(X_train, y_train)

# Report the best parameter combination found
print("Najlepsze parametry:", grid_search.best_params_)

# Training-set metrics of the search object (predictions via the search object)
y_pred_train, y_proba_train = (
    grid_search.predict(X_train),
    grid_search.predict_proba(X_train)[:, 1],
)

accuracy, precision, recall, auc_value = (
    accuracy_score(y_train, y_pred_train),
    precision_score(y_train, y_pred_train),
    recall_score(y_train, y_pred_train),
    roc_auc_score(y_train, y_proba_train),
)

print(
    f'Dokładność trenigowa: {accuracy:.4f}',
    f'Precyzja treningowa: {precision:.4f}',
    f'Czułość trenigowa: {recall:.4f}',
    f'Wartość AUC treningowa: {auc_value:.4f}',
    sep='\n',
)

# Test-set metrics
y_pred, y_proba = (
    grid_search.predict(X_test),
    grid_search.predict_proba(X_test)[:, 1],
)

accuracy, precision, recall, auc_value = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba),
)

print(
    f'Dokładność: {accuracy:.4f}',
    f'Precyzja: {precision:.4f}',
    f'Czułość: {recall:.4f}',
    f'Wartość AUC: {auc_value:.4f}',
    sep='\n',
)



In [None]:

# Logistic regression with L2 regularisation, tuned by grid search
model_l2 = LogisticRegression(penalty='l2')
param_grid = dict(
    C=[100, 10, 5, 2, 1, 0.5, 0.1, 0.01, 0.005],
    solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    max_iter=[100, 300],
)

# Cross-validated search over the grid (folds from `kfold`, ranked by accuracy),
# fitted on the training data only
grid_search = GridSearchCV(
    estimator=model_l2,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kfold,
)
grid_search.fit(X_train, y_train)

# Report the best parameter combination found
print("Najlepsze parametry:", grid_search.best_params_)

# Training-set metrics of the search object (predictions via the search object)
y_pred_train, y_proba_train = (
    grid_search.predict(X_train),
    grid_search.predict_proba(X_train)[:, 1],
)

accuracy, precision, recall, auc_value = (
    accuracy_score(y_train, y_pred_train),
    precision_score(y_train, y_pred_train),
    recall_score(y_train, y_pred_train),
    roc_auc_score(y_train, y_proba_train),
)

print(
    f'Dokładność trenigowa: {accuracy:.4f}',
    f'Precyzja treningowa: {precision:.4f}',
    f'Czułość trenigowa: {recall:.4f}',
    f'Wartość AUC treningowa: {auc_value:.4f}',
    sep='\n',
)

# Test-set metrics
y_pred, y_proba = (
    grid_search.predict(X_test),
    grid_search.predict_proba(X_test)[:, 1],
)

accuracy, precision, recall, auc_value = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba),
)

print(
    f'Dokładność: {accuracy:.4f}',
    f'Precyzja: {precision:.4f}',
    f'Czułość: {recall:.4f}',
    f'Wartość AUC: {auc_value:.4f}',
    sep='\n',
)


In [None]:
# All three logistic-regression variants on one ROC plot.
# NOTE(review): the L1/L2 hyper-parameters are hard-coded here; presumably they
# were copied from the grid-search output of the earlier cells — confirm they
# still match after any re-run.
model_lr = LogisticRegression(penalty=None, max_iter=300).fit(X_train, y_train)
model_l1 = LogisticRegression(penalty='l1', C=0.5, max_iter=100, solver='liblinear').fit(X_train, y_train)
model_l2 = LogisticRegression(penalty='l2', C=10, max_iter=100, solver='newton-cg').fit(X_train, y_train)

# Class-probability predictions on the test set (column 1 = positive class)
pred1 = model_lr.predict_proba(X_test)
pred2 = model_l1.predict_proba(X_test)
pred3 = model_l2.predict_proba(X_test)

# One ROC curve per model; the legend shows the penalty type and the test AUC.
# The L1 label previously carried a stray "max_depth=3", which is not a
# parameter of this model, so it is labelled plainly as "l1".
for label, proba in (("None", pred1), ("l1", pred2), ("l2", pred3)):
    positive_scores = proba[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
    roc_auc = roc_auc_score(y_test, positive_scores)
    plt.plot(fpr, tpr, label=label + ", AUC=" + str(round(roc_auc, 4)))

# Diagonal reference line (chance level)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Krzywa ROC')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Coefficients of the L1 model, labelled with the feature names seen during fit
coefs_l1 = pd.Series(data=model_l1.coef_[0], index=model_l1.feature_names_in_)

# Keep only the coefficients whose absolute value is at least 0.01
is_relevant = coefs_l1.abs() >= 1e-2
non_zero_coefs_l1 = coefs_l1[is_relevant]

print(f'Ilość współczynników: {coefs_l1.size}, ilość (prawie) niezerowych współczynników: {non_zero_coefs_l1.size}')

# Restrict the train and test feature matrices to the retained columns
selected_columns = non_zero_coefs_l1.index
X_train_reduced = X_train[selected_columns]
X_test_reduced = X_test[selected_columns]
nazwy_kolumn = X_train_reduced.columns
print(nazwy_kolumn)

In [None]:
# Linear-kernel SVM
svm_linear = SVC(kernel='linear', C=10)
svm_linear.fit(X_train, y_train)

# Training-set metrics.
# The SVC is built without probability estimates, so the AUC is computed from
# the signed distance returned by decision_function. Previously the AUC reused
# `y_proba_train` left over from the preceding grid-search model.
y_pred_train = svm_linear.predict(X_train)
y_score_train = svm_linear.decision_function(X_train)

accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train)
recall = recall_score(y_train, y_pred_train)
auc_value = roc_auc_score(y_train, y_score_train)

print(f'Dokładność trenigowa: {accuracy:.4f}')
print(f'Precyzja treningowa: {precision:.4f}')
print(f'Czułość trenigowa: {recall:.4f}')
print(f'Wartość AUC treningowa: {auc_value:.4f}')

# Test-set metrics, taken from the SVM itself (previously they were computed
# from `grid_search`, i.e. from the logistic-regression search of another cell).
y_pred = svm_linear.predict(X_test)
y_score = svm_linear.decision_function(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_score)

print(f'Dokładność: {accuracy:.4f}')
print(f'Precyzja: {precision:.4f}')
print(f'Czułość: {recall:.4f}')
print(f'Wartość AUC: {auc_value:.4f}')


In [None]:
# ROC curve of the linear SVM evaluated on the test set
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(estimator=svm_linear, X=X_test, y=y_test)

In [None]:
# Display the (rows, columns) shape of the held-out test feature matrix
X_test.shape