In [79]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [80]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

In [81]:
import pandas as pd
import numpy as np

In [82]:
X.shape, y.shape

((48842, 14), (48842, 1))

In [83]:
X.replace('?', pd.NA, inplace=True)

In [84]:
X.isna().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
dtype: int64

In [85]:
X['occupation'].unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', <NA>, 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv', nan], dtype=object)

In [86]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


# Data Imputation

In [87]:
from sklearn.impute import SimpleImputer

In [88]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns
numerical_features

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

In [89]:
# Create transformers for imputation
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='mean')


In [None]:
# Impute categorical features with the most frequent value
for col in categorical_features:
    X[col] = X[col].fillna(X[col].mode()[0])

# Impute numerical features with the mean
for col in numerical_features:
    X[col] = X[col].fillna(X[col].mean())

In [91]:
X.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In [92]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


# LabelEcode

In [93]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder for each categorical feature
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le


In [94]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,3,215419,9,13,0,9,1,4,0,0,0,36,38
48838,64,3,321403,11,9,6,9,2,2,1,0,0,40,38
48839,38,3,374983,9,13,2,9,0,4,1,0,0,50,38
48840,44,3,83891,9,13,0,0,3,1,1,5455,0,40,38


In [95]:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

In [96]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [97]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [98]:
k_values = [5, 10]  # Add more values as needed
mean_accuracies = []
std_accuracies = []

In [99]:
for k in k_values:
    model = LogisticRegression()
    cv_scores = cross_val_score(model, X_train, y_train, cv=k, scoring='accuracy')
    mean_accuracies.append(cv_scores.mean())
    std_accuracies.append(cv_scores.std())

    print(f'Accuracy for k={k}: {cv_scores}')
    print(f'Mean Accuracy: {cv_scores.mean()}')
    print(f'Standard Deviation of Accuracy: {cv_scores.std()}\n')


Accuracy for k=5: [0.5513308  0.5495759  0.55147704 0.54635858 0.55316659]
Mean Accuracy: 0.5503817831703286
Standard Deviation of Accuracy: 0.0023104529283484985

Accuracy for k=10: [0.54928342 0.55337818 0.55220825 0.54548113 0.55016087 0.55250073
 0.54928342 0.54343375 0.55279321 0.55295494]
Mean Accuracy: 0.550147790434174
Standard Deviation of Accuracy: 0.0032163030704611736



In [100]:
# Step 3: Find accuracy on the test set
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Accuracy on Test Set: {test_accuracy}')


Accuracy on Test Set: 0.5513852872935717


In [101]:
# Step 4: Find accuracy on the complete dataset
model.fit(X, y)
y_pred_full = model.predict(X)
full_dataset_accuracy = accuracy_score(y, y_pred_full)
print(f'Accuracy on Complete Dataset: {full_dataset_accuracy}')

Accuracy on Complete Dataset: 0.5497932107612301
