In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import warnings
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from scipy import stats
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, classification_report, confusion_matrix,
                           roc_auc_score, roc_curve, precision_recall_curve)
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.model_selection import cross_val_score




# Silences every warning category for the rest of the session (no category or
# module filter is given), so deprecation notices from the libraries are hidden too.
warnings.filterwarnings('ignore')

In [None]:
#pip install imbalanced-learn


1. Wczytywanie i podstawowe info o danych




In [None]:
# Labelled data set: the `target` column analysed below is read from this frame.
# The CSV path is relative to the current working directory.
hr: pd.DataFrame =  pd.read_csv("aug_train.csv")

In [None]:
# Second data file; later preprocessed with is_train=False.
hr_test: pd.DataFrame =  pd.read_csv("aug_test.csv")

In [None]:
# Sample submission file. NOTE(review): not referenced again in the visible part
# of the notebook -- confirm whether it is still needed.
submission_sample: pd.DataFrame = pd.read_csv('sample_submission.csv')


In [None]:
# Preview the first five rows of the test frame.
hr_test.head(n=5)

In [None]:
# Preview the first five rows of the training frame.
hr.head(n=5)

In [None]:
# Shape plus the dtype / non-null overview of the training frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print(f"Rozmiar datasetu: {hr.shape}")
print("\nPodstawowe informacje:\n")
hr.info()

Zmienne:


enrollee_id: unikalne ID kandydata

city: kod miasta

city_development_index: wskaźnik rozwoju miasta (przeskalowany)

gender: płeć

relevent_experience: istotne doświadczenie (nazwa kolumny w danych zawiera literówkę)

enrolled_university: rodzaj kursu uniwersyteckiego, na który kandydat jest zapisany

education_level: poziom wykształcenia

major_discipline: kierunek studiów

experience: doświadczenie w latach

company_size: liczba pracowników w firmie, w której pracuje kandydat

company_type: rodzaj pracodawcy

last_new_job: różnica lat między poprzednim a obecnym zatrudnieniem

training_hours: liczba ukończonych godzin szkolenia

target: 0 – nie szuka pracy, 1 – szuka pracy



2. Eksploracyjna analiza danych

In [None]:
# describe() summary of the training frame, printed as plain text.
print('Podstawowe statystyki train:\n')
print(hr.describe())

In [None]:
# Percentage of missing values per column, relative to the number of rows.
print("\nprocent udziału null w kolumnie:")
null_share = hr.isnull().sum() * 100 / hr.shape[0]
for col, null_prec in null_share.items():
    print(f'missing: {col}, ({null_prec:.1f}%)')

In [None]:
# Absolute class counts and their share of all rows in the training frame.
print("\nRozkład zmiennej target:")
target_counts = hr['target'].value_counts()
print(target_counts)
print(f"Proporcje: {target_counts.div(len(hr))}")

Wizualizacje

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Target distribution. Labels are looked up from the index of target_counts so
# every wedge is named after the class it really represents, whichever class
# value_counts() lists first.
target_labels = {0: 'Nie szuka pracy (0)', 1: 'Szuka pracy (1)'}
pie_labels = [target_labels.get(cls, str(cls)) for cls in target_counts.index]
axes[0,0].pie(target_counts.values, labels=pie_labels,
       autopct='%1.1f%%', startangle=90)
axes[0,0].set_title('Rozkład zmiennej target')

# Distribution of the city development index
axes[0,1].hist(hr['city_development_index'].dropna(), bins=20, alpha=0.7, color='skyblue')
axes[0,1].set_title('Rozkład wskaźnika rozwoju miasta')
axes[0,1].set_xlabel('City Development Index')

# Training hours vs target. boxplot(by=...) adds its own figure-level suptitle;
# clear it so it does not compete with the per-axes titles.
hr.boxplot(column='training_hours', by='target', ax=axes[1,0])
fig.suptitle('')
axes[1,0].set_title('Godziny szkolenia vs Target')
axes[1,0].set_xlabel('Target')

# Correlation of numeric variables only. The row identifier is left out, in line
# with the rest of the notebook, which never treats enrollee_id as a feature.
numeric_cols = hr.select_dtypes(include=[np.number]).columns.drop('enrollee_id', errors='ignore')
corr_matrix = hr[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[1,1])
axes[1,1].set_title('Macierz korelacji - zmienne numeryczne')

plt.tight_layout()
plt.show()

In [None]:
# Split the columns by dtype for the exploratory overview. The raw frame `hr` is
# used because no processed frame exists yet at this point of a top-to-bottom run
# (referencing one here raises NameError on a fresh kernel).
# The identifier is dropped from both lists; the label only from the numeric one.
categorical_cols = [
    col for col in hr.select_dtypes(include=['object']).columns
    if col != 'enrollee_id'
]
numerical_cols = [
    col for col in hr.select_dtypes(include=[np.number]).columns
    if col not in ('target', 'enrollee_id')
]

print(f"Zmienne kategoryczne: {categorical_cols}")
print(f"Zmienne numeryczne: {numerical_cols}")

3. Oczyszczanie danych

Lista zmiennych porządkowych:

education_level

company_size

experience

last_new_job

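company_type (uwaga: to zmienna nominalna – jej kategorie nie mają naturalnego porządku, a kodowanie liczbowe jest tu umowne)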

Mapowania dla kodowania zmiennych kategorycznych

In [None]:
def prepare_data(data, is_train=True):
    data_processed = data.copy()

    #mapowania dla kodowania zmiennych kategorycznych zainspirowane kodem z kaggle to rozwiązanie, mogę znaleźć link
    mappings = {
        'gender': {'Female': 2, 'Male': 1, 'Other': 0},
        'relevent_experience': {'Has relevent experience': 1, 'No relevent experience': 0},
        'enrolled_university': {'no_enrollment': 0, 'Full time course': 1, 'Part time course': 2},
        'education_level': {'Primary School': 0, 'High School': 1, 'Graduate': 2, 'Masters': 3, 'Phd': 4},
        'major_discipline': {'STEM': 0, 'Business Degree': 1, 'Arts': 2, 'Humanities': 3, 'No Major': 4, 'Other': 5},
        'experience': {'<1': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
                      '10': 10, '11': 11, '12': 12, '13': 13, '14': 14, '15': 15, '16': 16, '17': 17,
                      '18': 18, '19': 19, '20': 20, '>20': 21},
        'company_type': {'Pvt Ltd': 0, 'Funded Startup': 1, 'Early Stage Startup': 2, 'Other': 3, 'Public Sector': 4, 'NGO': 5},
        'company_size': {'<10': 0, '10/49': 1, '50-99': 2, '100-500': 3, '500-999': 4, '1000-4999': 5, '5000-9999': 6, '10000+': 7},
        'last_new_job': {'never': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5}
    }

    #aplikacja mapowań
    for col, mapping in mappings.items():
        if col in data_processed.columns:
            data_processed[col] = data_processed[col].map(mapping)
            print(f"Zakodowano zmienną: {col}")

    #wypełnienie brakujących wartości medianą
    numeric_columns = data_processed.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        if col not in ['enrollee_id', 'target']:
            median_val = data_processed[col].median()
            data_processed[col].fillna(median_val, inplace=True)

    #usunięcie duplikatów dla zbioru treningowego
    if is_train:
        duplicates = data_processed.duplicated().sum()
        if duplicates > 0:
            data_processed = data_processed.drop_duplicates()
            print(f"Usunięto {duplicates} duplikatów")

    return data_processed

# Run the same preprocessing on both frames; only the training frame has its
# duplicated rows removed (is_train=True).
hr_processed = prepare_data(hr, is_train=True)
hr_test_processed = prepare_data(hr_test, is_train=False)


4. Feature Engineering

In [None]:
def prepare_features(data, target_col='target'):
    """Build the numeric feature matrix used for modelling.

    The ``enrollee_id`` identifier and, if present, ``target_col`` are removed;
    of the remaining columns only the numeric ones are kept. The selected column
    names and the resulting shape are printed.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed frame.
    target_col : str, default 'target'
        Name of the label column to exclude from the features.

    Returns
    -------
    pd.DataFrame
        Numeric feature columns of ``data``.
    """
    non_features = ['enrollee_id'] + ([target_col] if target_col in data.columns else [])

    # Drop identifier/label first, then keep numeric columns only.
    features = (
        data
        .drop(columns=non_features, errors='ignore')
        .select_dtypes(include=[np.number])
    )

    print(f"Cechy wybrane do modelu:{list(features.columns)}")
    print(f"Kształt macierzy cech:{features.shape}")

    return features

# Feature matrix and label vector, both taken from the processed training frame
# so their rows stay aligned.
X = prepare_features(hr_processed)
y = hr_processed['target']


In [None]:
# Sanity check of the label vector: its length and the count of each class.
print(f"Kształt target: {y.shape}")
print(f"Rozkład target: {y.value_counts().to_dict()}")


6. Podział danych na zbiory treningowy i testowy

In [None]:
# 80/20 hold-out split with a fixed seed; stratify=y keeps the class proportions
# of y in both parts.
# NOTE(review): the median imputation in prepare_data ran on the full frame before
# this split, so validation rows contributed to the medians -- confirm this is acceptable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the sizes and class shares of both parts.
print(f"Rozmiar zbioru treningowego: {X_train.shape}")
print(f"Rozmiar zbioru walidacyjnego: {X_val.shape}")
print(f"Proporcje w zbiorze treningowym: {y_train.value_counts(normalize=True)}")
print(f"Proporcje w zbiorze walidacyjnym: {y_val.value_counts(normalize=True)}")


5. Weryfikacja założeń regresji logistycznej

In [None]:
# Flag features whose variance (on the unscaled values) falls below 0.01.
low_variance_cols = X.columns[X.var() < 0.01]
if low_variance_cols.empty:
    print("wszystkie cechy mają odpowiednią wariancję")
else:
    print(f"cechy o niskiej wariancji (< 0.01):{list(low_variance_cols)}")


In [None]:
# VIF per feature. The design matrix gets an intercept column in front, so
# feature k of X sits at position k + 1 of X_with_const.
X_with_const = sm.add_constant(X)
vif_data = pd.DataFrame({
    "Cecha": X.columns,
    "VIF": [
        variance_inflation_factor(X_with_const.values, position)
        for position in range(1, len(X.columns) + 1)
    ],
})

print("Współczynniki VIF (Variance Inflation Factor):")
print(vif_data)
print(
    "\nInterpretacja VIF:",
    "VIF < 5: Brak problemów z wielokolinearnością",
    "5 ≤ VIF < 10: Umiarkowana wielokolinearność",
    "VIF ≥ 10: Wysoka wielokolinearność",
    sep="\n",
)


Najwyższy VIF 1.62, dużo poniżej progu, zmienne niezależne nie są silnie skorelowane ze sobą, więc model nie będzie miał problemu z niestabilnymi współczynnikami regresji

In [None]:
def evaluate_model(y_true, y_pred, y_pred_proba, model_name):
    """Print and return the evaluation metrics of one classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels.
    y_pred_proba : array-like
        Scores handed to ``roc_auc_score``.
    model_name : str
        Heading printed above the metric lines.

    Returns
    -------
    dict
        Metric values under the keys ``accuracy``, ``precision``, ``recall``,
        ``f1`` and ``auc_roc``.
    """
    print(f"\n{model_name} - WYNIKI:")

    # Each metric is computed once and reused for both the printout and the result.
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'auc_roc': roc_auc_score(y_true, y_pred_proba),
    }

    print(f"Accuracy: {scores['accuracy']:.4f}")
    print(f"Precision: {scores['precision']:.4f}")
    print(f"Recall: {scores['recall']:.4f}")
    print(f"F1-score: {scores['f1']:.4f}")
    print(f"AUC-ROC: {scores['auc_roc']:.4f}")

    return scores

In [None]:
# Registry of (X, y) training sets keyed by balancing method. Only the original
# split is filled in here; the None placeholders are replaced by the resampling
# cells below and must all be set before the dict is iterated.
balancing_methods = {
    'Original': (X_train, y_train),
    'SMOTE': None,
    'SMOTEENN': None,
    'UnderSampling': None
}

Skalowanie danych

In [None]:
# Fit the scaler on the training split only and reuse its statistics for the
# validation split, so no validation information enters the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


SMOTE

In [None]:
# Oversample the (unscaled) training split only; the validation split is untouched.
# NOTE(review): most features are integer category codes, and SMOTE can synthesise
# values between codes -- confirm this is acceptable.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
balancing_methods['SMOTE'] = (X_train_smote, y_train_smote)


SMOTEENN (połączenie over- i undersamplingu)

In [None]:
# Combined resampling (SMOTEENN) of the training split, with a fixed seed.
smoteenn = SMOTEENN(random_state=42)
X_train_smoteenn, y_train_smoteenn = smoteenn.fit_resample(X_train, y_train)
balancing_methods['SMOTEENN'] = (X_train_smoteenn, y_train_smoteenn)

random under sampling

In [None]:
# Random undersampling of the training split, with a fixed seed.
under_sampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)
balancing_methods['UnderSampling'] = (X_train_under, y_train_under)


In [None]:
# Class counts of the label vector stored for every balancing method.
print("\nRozkłady po wyważeniu:")
for method, resampled in balancing_methods.items():
    _, y_balanced = resampled
    print(f"{method}: {Counter(y_balanced)}")


Najlepiej wypada SMOTE

7. Budowanie modelu oraz jego Ewaluacja

Model regresji logistycznej

In [None]:
# Logistic regression is fitted on the standardised features.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Evaluate right after fitting, as the other model cells do, so that
# log_reg_scores already exists when the comparison table is built further down.
y_val_pred_log = log_reg.predict(X_val_scaled)
y_val_pred_proba_log = log_reg.predict_proba(X_val_scaled)[:, 1]

log_reg_scores = evaluate_model(y_val, y_val_pred_log, y_val_pred_proba_log, "REGRESJA LOGISTYCZNA")

RANDOM FOREST

In [None]:
# Random forest fitted on the original (unscaled, non-resampled) training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard labels and positive-class probabilities for the validation split.
y_val_pred_rf = rf_model.predict(X_val)
y_val_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]

rf_scores = evaluate_model(
    y_true=y_val,
    y_pred=y_val_pred_rf,
    y_pred_proba=y_val_pred_proba_rf,
    model_name="RANDOM FOREST",
)


gradient boosting

In [None]:
# Baseline gradient boosting fitted on the original (unscaled, non-resampled)
# training split.
gb_params = dict(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
gb_model = GradientBoostingClassifier(**gb_params).fit(X_train, y_train)

# Hard labels and positive-class probabilities for the validation split.
y_val_pred_gb = gb_model.predict(X_val)
y_val_pred_proba_gb = gb_model.predict_proba(X_val)[:, 1]

gb_scores = evaluate_model(
    y_true=y_val,
    y_pred=y_val_pred_gb,
    y_pred_proba=y_val_pred_proba_gb,
    model_name="GRADIENT BOOSTING",
)


In [None]:
# Comparison table: the dict of per-model score dicts is transposed so that each
# row is a model and each column a metric. log_reg_scores, rf_scores and gb_scores
# must all exist when this cell runs.
results_df = pd.DataFrame({
    'Logistic Regression': log_reg_scores,
    'Random Forest': rf_scores,
    'Gradient Boosting': gb_scores
}).T

print(results_df.round(4))

In [None]:
# The winner is the row label with the highest F1 among the models in results_df.
best_model = results_df['f1'].idxmax()
print(f"\nNajlepszy model: {best_model} (F1-score: {results_df.loc[best_model, 'f1']:.4f})")


In [None]:
# Second gradient-boosting configuration. Compared with gb_model it uses more
# trees, a smaller learning rate and deeper trees, and adds split/leaf minimums
# and row subsampling. The values are hand-picked; no search is run in this cell.
gb_model_optimized = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    min_samples_split=10,
    min_samples_leaf=5,
    subsample=0.8,
    random_state=42
)

In [None]:
# Unlike the other models, this one is fitted on the SMOTE-resampled training data,
# so it differs from gb_model in both hyperparameters and training set.
gb_model_optimized.fit(X_train_smote, y_train_smote)


In [None]:
# Logistic-regression predictions on the scaled splits, plus validation metrics.
# NOTE(review): y_train_pred_log is not used anywhere else in the visible notebook.
y_train_pred_log = log_reg.predict(X_train_scaled)
y_val_pred_log = log_reg.predict(X_val_scaled)
y_val_pred_proba_log = log_reg.predict_proba(X_val_scaled)[:, 1]

log_reg_scores = evaluate_model(y_val, y_val_pred_log, y_val_pred_proba_log, "REGRESJA LOGISTYCZNA")


In [None]:
# Repeats the validation evaluation of gb_model that already ran in the cell
# where the model was fitted; it reassigns the same variables.
y_val_pred_gb = gb_model.predict(X_val)
y_val_pred_proba_gb = gb_model.predict_proba(X_val)[:, 1]

gb_scores = evaluate_model(y_val, y_val_pred_gb, y_val_pred_proba_gb, "GRADIENT BOOSTING")


In [None]:
# Validation metrics of the SMOTE-trained model, measured on the untouched
# validation split. gb_opt_scores is not part of results_df, which is built earlier.
y_val_pred_gb_opt = gb_model_optimized.predict(X_val)
y_val_pred_proba_gb_opt = gb_model_optimized.predict_proba(X_val)[:, 1]

gb_opt_scores = evaluate_model(y_val, y_val_pred_gb_opt, y_val_pred_proba_gb_opt, "GRADIENT BOOSTING OPTIMIZED")


Macierz pomyłek

In [None]:
# (display name, validation predictions) for every model to plot.
models = [
    ('Logistic Regression', y_val_pred_log),
    ('Random Forest', y_val_pred_rf),
    ('Gradient Boosting', y_val_pred_gb),
    ('Gradient Boosting opt', y_val_pred_gb_opt),
]

# One confusion-matrix heatmap per model on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

for ax, (name, y_pred) in zip(axes, models):
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set(
        title=f'Confusion Matrix - {name}',
        xlabel='Predicted',
        ylabel='Actual',
    )

plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve


Krzywe ROC

In [None]:
# (display name, predicted labels, positive-class probabilities) per model.
models_with_proba = [
    ('Logistic Regression', y_val_pred_log, y_val_pred_proba_log),
    ('Random Forest', y_val_pred_rf, y_val_pred_proba_rf),
    ('Gradient Boosting', y_val_pred_gb, y_val_pred_proba_gb),
    ('Gradient Boosting opt', y_val_pred_gb_opt, y_val_pred_proba_gb_opt),
]

plt.figure(figsize=(10, 8))
for name, y_pred, y_pred_proba in models_with_proba:
    # Models without probability scores are skipped.
    if y_pred_proba is None or len(y_pred_proba) == 0:
        continue
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    auc_score = roc_auc_score(y_val, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.3f})')

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Krzywe ROC dla wszystkich modeli')
plt.legend()
plt.grid(True)
plt.show()

analiza najważniejszych cech regresji logistycznej

In [None]:
# Logistic-regression coefficients ranked by absolute size. The model was fitted
# on standardised features, so the magnitudes share a common scale.
feature_importance_log = (
    pd.DataFrame({'feature': X.columns, 'coefficient': log_reg.coef_[0]})
    .assign(abs_coefficient=lambda frame: frame['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
)
print(feature_importance_log.head(10))


analiza najważniejszych cech gradient boosting

In [None]:
# Feature importances of the baseline gradient-boosting model, highest first.
feature_importance_gb = (
    pd.DataFrame({'feature': X.columns, 'importance': gb_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
print(feature_importance_gb.head(10))

In [None]:
# Feature importances of the SMOTE-trained gradient-boosting model, highest first.
feature_importance_gb_opt = (
    pd.DataFrame({'feature': X.columns, 'importance': gb_model_optimized.feature_importances_})
    .sort_values('importance', ascending=False)
)
print(feature_importance_gb_opt.head(10))

walidacja krzyżowa

In [None]:

# 5-fold cross-validated ROC AUC on the original training split. The printed
# "+/-" value is two standard deviations of the fold scores.
# Random Forest
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Random Forest - AUC CV: {cv_scores_rf.mean():.4f} (+/- {cv_scores_rf.std() * 2:.4f})")

# Gradient Boosting
cv_scores_gb = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Gradient Boosting - AUC CV: {cv_scores_gb.mean():.4f} (+/- {cv_scores_gb.std() * 2:.4f})")

# Gradient Boosting Opt
# NOTE(review): this scores the hyperparameters on X_train without SMOTE, whereas
# the fitted gb_model_optimized was trained on the SMOTE-resampled data -- confirm
# which training procedure the CV figure is meant to describe.
cv_scores_gb_opt = cross_val_score(gb_model_optimized, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Gradient Boosting Opt - AUC CV: {cv_scores_gb_opt.mean():.4f} (+/- {cv_scores_gb_opt.std() * 2:.4f})")


Gradient Boosting Opt - AUC CV: 0.7719 (+/- 0.0131)


In [None]:
# Per-class precision / recall / F1 report for each model on the validation split.
for heading, predictions in (
    ("REGRESJA LOGISTYCZNA:", y_val_pred_log),
    ("\nRANDOM FOREST:", y_val_pred_rf),
    ("\nGRADIENT BOOSTING:", y_val_pred_gb),
    ("\nGRADIENT BOOSTING OPT:", y_val_pred_gb_opt),
):
    print(heading)
    print(classification_report(y_val, predictions))

REGRESJA LOGISTYCZNA:
              precision    recall  f1-score   support

         0.0       0.79      0.94      0.86      2877
         1.0       0.58      0.25      0.35       955

    accuracy                           0.77      3832
   macro avg       0.69      0.59      0.60      3832
weighted avg       0.74      0.77      0.73      3832


RANDOM FOREST:
              precision    recall  f1-score   support

         0.0       0.83      0.89      0.86      2877
         1.0       0.56      0.44      0.49       955

    accuracy                           0.77      3832
   macro avg       0.69      0.66      0.67      3832
weighted avg       0.76      0.77      0.76      3832


GRADIENT BOOSTING:
              precision    recall  f1-score   support

         0.0       0.83      0.89      0.86      2877
         1.0       0.58      0.45      0.50       955

    accuracy                           0.78      3832
   macro avg       0.70      0.67      0.68      3832
weighted avg    

In [None]:
# Final summary of the winner selected from results_df.
# Every model in results_df (logistic regression, random forest, baseline gradient
# boosting) was fitted on the original, non-resampled training split, so the
# summary must not present SMOTE as the balancing method of the best model.
# SMOTE was used only for gb_model_optimized, which is not part of results_df.
print("\nKOŃCOWE PODSUMOWANIE")
print(f"Najlepszy model: {best_model}")
print("Użyta metoda wyważenia: brak (modele z tabeli wyników trenowano na oryginalnych danych)")
print(f"Rozmiar danych treningowych: {X_train.shape}")
print(f"Rozmiar danych po SMOTE (użyte tylko dla Gradient Boosting opt): {X_train_smote.shape}")
print("Najważniejsze metryki najlepszego modelu:")
for metric, value in results_df.loc[best_model].items():
    print(f"  {metric.capitalize()}: {value:.4f}")


KOŃCOWE PODSUMOWANIE
Najlepszy model: Gradient Boosting
Użyta metoda wyważenia: SMOTE
Rozmiar danych treningowych po SMOTE: (23008, 11)
Najważniejsze metryki najlepszego modelu:
  Accuracy: 0.7805
  Precision: 0.5774
  Recall: 0.4450
  F1: 0.5027
  Auc_roc: 0.8005


Jeśli priorytetem jest maksymalna czułość (recall) i F1-score dla klasy 1, lepszym wyborem jest GB opt, co potwierdzają wcześniejsze wyniki metryk.
Jeśli priorytetem jest najwyższe AUC i stabilność (niska wariancja w CV), zwykły GB jest również dobrym wyborem, a przy tym jest prostszy.