In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Charger les données
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
df = pd.read_csv(url, header=None, names=columns, na_values=" ?", skipinitialspace=True)

# Prétraitement des données
df = df.dropna()
label_encoders = {}
for column in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

X = df.drop("income", axis=1)
y = df["income"]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardiser les données
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()

param_grid = {
#     'n_neighbors': range(1, 31),
    'n_neighbors': range(1, 11),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_knn = grid_search.best_estimator_
print("Best parameters for k-NN:", grid_search.best_params_)


Best parameters for k-NN: {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'uniform'}


In [12]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_knn = best_knn.predict(X_test)
print("k-NN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


k-NN Accuracy: 0.8433901427913404
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      4942
           1       0.74      0.55      0.63      1571

    accuracy                           0.84      6513
   macro avg       0.80      0.74      0.76      6513
weighted avg       0.84      0.84      0.83      6513



In [15]:
grid_search.cv_results_

{'mean_fit_time': array([0.00839272, 0.00626202, 0.00627141, 0.00635419, 0.0062314 ,
        0.00620561, 0.00644574, 0.00628672, 0.00615616, 0.00623841,
        0.0061903 , 0.00615821, 0.00608616, 0.00615101, 0.00614991,
        0.00618763, 0.00611281, 0.00610766, 0.00613537, 0.00610662,
        0.00619683, 0.00613174, 0.00616732, 0.00617323, 0.00612998,
        0.00616727, 0.00607843, 0.00617561, 0.00614114, 0.00615201,
        0.0061523 , 0.006112  , 0.00615163, 0.00616159, 0.00614901,
        0.00616326, 0.00606976, 0.00609932, 0.00619302, 0.00608301,
        0.00610104, 0.00605989, 0.00618386, 0.00619669, 0.00624065,
        0.00612998, 0.00630817, 0.00612936, 0.00616374, 0.00622764,
        0.00614953, 0.00609088, 0.00611558, 0.00613074, 0.00620422,
        0.0061832 , 0.00611582, 0.0061111 , 0.00613761, 0.00623236]),
 'std_fit_time': array([4.48339344e-03, 1.72875890e-04, 1.17458205e-04, 2.62042848e-04,
        2.07369984e-04, 1.61795310e-04, 2.45093492e-04, 1.90716898e-04,
     