In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbalanced_pipeline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [2]:
# Read the training CSV, treat ' ?' entries as missing values, then discard
# rows with any missing value and exact duplicate rows
train_data = (
    pd.read_csv('data/adult.train.csv', header=0)
    .replace(' ?', pd.NA)
    .dropna()
    .drop_duplicates()
)

In [3]:
# Read the test CSV, treat ' ?' entries as missing values, then discard
# rows with any missing value and exact duplicate rows
test_data = (
    pd.read_csv('data/adult.test.csv', header=0)
    .replace(' ?', pd.NA)
    .dropna()
    .drop_duplicates()
)

In [4]:
# Define features and target variable for train set:
# X_train holds every column except 'income'; y_train is 'income' encoded as
# 1 for ' >50K' and 0 for ' <=50K' (labels carry a leading space).
X_train = train_data.drop(columns='income')
y_train = train_data['income'].map({' >50K': 1, ' <=50K': 0})

# Series.map turns any label that is not a key of the dict into NaN without
# warning, so fail here with the offending labels instead of at fit time.
if y_train.isna().any():
    raise ValueError(
        'Unexpected income labels in train set: '
        f"{sorted(train_data.loc[y_train.isna(), 'income'].unique())}"
    )

In [5]:
# Define features and target variable for test set:
# X_test holds every column except 'income'; y_test is 'income' encoded as
# 1 for ' >50K' and 0 for ' <=50K' (labels carry a leading space).
X_test = test_data.drop(columns='income')
y_test = test_data['income'].map({' >50K': 1, ' <=50K': 0})

# Series.map turns any label that is not a key of the dict into NaN without
# warning, so fail here with the offending labels instead of at scoring time.
if y_test.isna().any():
    raise ValueError(
        'Unexpected income labels in test set: '
        f"{sorted(test_data.loc[y_test.isna(), 'income'].unique())}"
    )

In [6]:
# Column names grouped by type so each group can get its own preprocessing
numeric_features = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

In [7]:
# # Apply label encoding to categorical features
# label_encoder = LabelEncoder()

# for feature in categorical_features:
#     X_train[feature] = label_encoder.fit_transform(X_train[feature])
#     X_test[feature] = label_encoder.transform(X_test[feature])

In [8]:
# Create preprocessor with separate preprocessing steps for numeric and categorical features:
# numeric columns are standardized, categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore' encodes a category that was not present at
        # fit time as all zeros instead of raising during transform; output is
        # unchanged when every category was seen during fit.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [9]:
# k_values = range(1, 21)
# accuracies = []

# # Get the accuracies for each number of neighbors
# for k in k_values:
#     knn_model = model = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('classifier', KNeighborsClassifier(k))
#     ])
#     knn_model.fit(X_train, y_train)
#     accuracy = knn_model.score(X_test, y_test)
#     accuracies.append(accuracy)

# # Plot the number of neighbors vs accuracy chart 
# plt.plot(k_values, accuracies, marker='o')
# plt.title('KNN Accuracy for Different k Values')
# plt.xlabel('Number of Neighbors (k)')
# plt.ylabel('Accuracy')
# plt.show()

In [None]:
# # Define the k-nearest neighbors model with SMOTE oversampling (alternative, disabled)
# model = imbalanced_pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
#     ('classifier', KNeighborsClassifier(7))
# ])

In [None]:
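# # Define the k-nearest neighbors model with random oversampling (alternative, disabled)
# # NOTE: RandomOverSampler is not imported in this notebook; re-enabling this cell
# # requires `from imblearn.over_sampling import RandomOverSampler`.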
# model = imbalanced_pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('oversampler', RandomOverSampler(sampling_strategy='auto', random_state=42)),  # Adjust parameters as needed
#     ('classifier', KNeighborsClassifier(7))
# ])

In [10]:
# Build the estimator: the shared preprocessor followed by a
# k-nearest neighbors classifier with k = 7
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier(n_neighbors=7)),
    ]
)

In [11]:
# Fit the preprocessing steps and the classifier on the training data
model.fit(X=X_train, y=y_train)

In [12]:
# Predict the income class for every sample in the test set
y_pred = model.predict(X=X_test)

In [13]:
# Score the predictions: fraction of test samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8310


In [14]:
# Print the confusion matrix (rows: true class, columns: predicted class)
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred)
)

[[10260  1095]
 [ 1449  2251]]


In [15]:
# Print per-class precision, recall, F1-score and support
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89     11355
           1       0.67      0.61      0.64      3700

    accuracy                           0.83     15055
   macro avg       0.77      0.76      0.76     15055
weighted avg       0.83      0.83      0.83     15055

