You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Describe the bug
When generating the learning curve on the Adult Income dataset using KNN or SVC, the learning curve returns many NaN values, depending on the dataset size. This does not happen on an unpatched version of sklearn, nor on a patched version of sklearn using the Wine dataset.
To Reproduce
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp
import pickle
from sklearnex import patch_sklearn, unpatch_sklearn
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from pathlib import Path
# Enable the Intel extension patches for scikit-learn before any estimators
# are constructed, so the patched implementations are the ones exercised.
patch_sklearn()

# Half of the logical CPUs, reserved for parallel work.
HALF_CPUS = mp.cpu_count() // 2

# Column names for the UCI Adult Income dataset (the raw files carry no header row).
headers = [
    'age', 'work_class', 'final_weight', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income',
]

# Download both the official train and test splits, then stack them into a
# single frame (the test file has one junk leading line, hence skiprows=1).
some_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    sep=',',
    names=headers,
)
more_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
    sep=',',
    names=headers,
    header=None,
    skiprows=1,
)
data = pd.concat([some_data, more_data], ignore_index=True)
# --- Data cleaning ---------------------------------------------------------
# Shuffle the rows so the later positional 60/20/20 split is randomized.
data = data.sample(frac=1, random_state=0)

# The raw UCI files pad string fields with spaces; strip every object column.
data_obj = data.select_dtypes(['object'])
data[data_obj.columns] = data_obj.apply(lambda x: x.str.strip())
data['income'] = data['income'].str.strip()

# 'final_weight' is survey metadata, not a predictive feature.
data = data.drop(columns='final_weight')
# 'education' is fully covered by the ordinal 'education_num'.
data = data.drop(columns='education')

# Non-US native_country rows are 19.3% >= 50k and make up 3930 rows;
# restrict to US natives and drop the now-constant column.
data = data.drop(data[data['native_country'] != 'United-States'].index)
data = data.drop(columns='native_country')

# The dataset marks missing values with '?'; drop any row containing one.
data = data.replace('?', np.nan)
data = data.dropna()

# # Drop capital gain and loss (kept here as a disabled experiment)
# data = data.drop(columns='capital_gain')
# data = data.drop(columns='capital_loss')

# Collapse marital status into coarser buckets with a single dict-based
# replace. Fix: the original additionally called
# replace(['Widowed'], 'Widowed') and replace(['Never-married'], 'Never-married'),
# which map values onto themselves and are pure no-ops; they are removed.
data['marital_status'] = data['marital_status'].replace({
    'Married-civ-spouse': 'Married',
    'Married-AF-spouse': 'Married',
    'Married-spouse-absent': 'Separated',
    'Divorced': 'Separated',
})

# Make income a binary target; the test file suffixes labels with '.', the
# train file does not, so both spellings are mapped.
data['income'] = data['income'].map({'>50K.': 1, '>50K': 1, '<=50K.': 0, '<=50K': 0})
data['sex'] = data['sex'].map({'Male': 0, 'Female': 1})

# One-hot encode every remaining categorical column.
data = pd.get_dummies(data, columns=data.select_dtypes(['object']).columns)
# --- Split data ------------------------------------------------------------
# Positional 60/20/20 split into train / validate / test.
np.random.seed(0)
cut_points = [int(0.6 * len(data)), int(0.8 * len(data))]
train, validate, test = np.split(data, cut_points)

# Feature matrices: everything except the target column.
X_train = train.drop(columns='income')
X_validate = validate.drop(columns='income')
X_test = test.drop(columns='income')
X = data.drop(columns='income')

# Target vectors.
y_train = train['income']
y_validate = validate['income']
y_test = test['income']
y = data['income']
# --- K-Nearest Neighbors Classifier ----------------------------------------
from sklearn.neighbors import KNeighborsClassifier

# Fix: the original created a two-axes figure titled
# "Validation Curves for KNN Classifier (Income)" here and never plotted into
# it — the very next plt.subplots() rebound `fig`/`ax`, so it showed up as an
# empty orphan window. That dead figure has been removed.

# Learning curve for KNN on the last 5000 rows with 5-fold cross-validation,
# evaluated at 10 training-set sizes from 10% to 100%.
knn = KNeighborsClassifier(n_neighbors=5)
train_fractions = np.linspace(0.1, 1, 10)
train_sizes, train_scores, test_scores = learning_curve(
    knn, X.tail(5000), y.tail(5000), train_sizes=train_fractions, cv=5,
)

fig, ax = plt.subplots()
fig.suptitle("Learning Curve for KNN Classifier (Income)")
fig.supylabel("accuracy")
fig.set_tight_layout(True)
ax.set_xlabel("training examples")
# Plot mean accuracy across the CV folds at each training-set size.
ax.plot(train_sizes, np.mean(train_scores, axis=1), marker="o", label="train", ms=2)
ax.plot(train_sizes, np.mean(test_scores, axis=1), marker="o", label="test", ms=2)
ax.legend()
plt.show()
Expected behavior
Describe what you are expecting from the steps above.
Output/Screenshots
If applicable, add output/screenshots to help explain your problem.
Environment:
OS: Ubuntu 22.04
Compiler: gcc 11.4.0
Version: 2024.1.0
The text was updated successfully, but these errors were encountered:
Describe the bug
When generating the learning curve on the Adult Income dataset using KNN or SVC, the learning curve returns many NaN values, depending on the dataset size. This does not happen on an unpatched version of sklearn, nor on a patched version of sklearn using the Wine dataset.
To Reproduce
Expected behavior
Describe what you are expecting from the steps above.
Output/Screenshots
If applicable, add output/screenshots to help explain your problem.
Environment:
The text was updated successfully, but these errors were encountered: