In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data = pd.read_csv('/content/drive/MyDrive/City Tech/Spring 2025/Machine Learning Fundamentals CST 4702/Datasets & Collabs/UCI Adult Income Dataset - Classification/uci_adult.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
# Check for missing values
print("Missing values before imputation:\n", data.isnull().sum())

print('-'*35)
# Handling the categorical column

# df_copy = data.copy()
lbe = LabelEncoder() #   Use a label encoding instead of binarizer because encoding can handle more than 2 variables in categorical columns

# Fitting the column
categorical_columns = ['workclass', 'education','marital-status', 'occupation',
                       'relationship', 'race', 'gender', 'native-country', 'income']

for col in categorical_columns:
  data[col] = lbe.fit_transform(data[col])

# Split dataset into features and target
X = data.drop('income', axis=1)
y = data['income']

# Apply KNN imputation
knn_imputer = KNNImputer(n_neighbors=5)
X_imputed = knn_imputer.fit_transform(X)

# Convert the imputed data back to a DataFrame
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# Check missing values after imputation
print("Missing values after imputation:\n", X_imputed_df.isnull().sum())

Missing values before imputation:
 age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64
-----------------------------------
Missing values after imputation:
 age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
dtype: int64


In [6]:
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
48838,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
48839,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0
48840,22,4,201490,11,9,4,1,3,4,1,0,0,20,39,0


In [7]:
# Additional preprocessing before K-Fold cross-validation

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed_df)


In [8]:
# Let's apply K-mean clustering

kmeans = KMeans(n_clusters=5, random_state=42)
X_clustered = kmeans.fit_predict(X_scaled)

# Cluster label to feature set
X_imputed_df['cluster'] = X_clustered

In [9]:
# K-Fold cross-validation

model = RandomForestClassifier(n_estimators=100, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_scores = cross_val_score(model, X_imputed_df, y, cv=kf, scoring='accuracy')

# Accuracy
print(f"Accuracy scores for each fold: {kf_scores}")
print(f"Average accuracy score: {np.mean(kf_scores)}")

Accuracy scores for each fold: [0.86324086 0.85576825 0.85370598 0.85452498 0.85626536]
Average accuracy score: 0.8567010849895483


In [10]:
# Stratisfied K-Fold Cross-Validation

stratified_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_kf_scores = cross_val_score(model, X_imputed_df, y, cv=stratified_kf, scoring='accuracy')

# Accuracy
print(f"Accuracy scores for each stratified fold: {stratified_kf_scores}")
print(f"Average accuracy score for stratified k-fold: {np.mean(stratified_kf_scores)}")

Accuracy scores for each stratified fold: [0.85699662 0.85904391 0.85831286 0.85861998 0.85145373]
Average accuracy score for stratified k-fold: 0.8568854209554384


**K-Fold Cross-Validation:**
K-Fold cross-validation is a solid way to evaluate a model. It splits the dataset into K subsets, trains the model on K-1 folds, and tests it on the remaining fold. This process repeats K times, so each subset gets tested once. The results are then averaged to give a better idea of how well the model generalizes. I’ve used K=5 since it’s a common choice and ensures the model gets tested on different parts of the data.


**Stratified K-Fold Cross-Validation:**
Stratified K-Fold cross-validation is especially useful for imbalanced datasets. It makes sure each fold has the same class distribution as the whole dataset, unlike standard K-Fold. This is really important when there’s class imbalance, as it helps the model perform better on underrepresented classes by making sure each fold reflects the overall class distribution.

**Clustering (K-Means/DBSCAN):**
Clustering methods like K-Means and DBSCAN can be really helpful in adding features to the model. K-Means groups data points into clusters based on similarity, while DBSCAN finds dense areas, which can reveal hidden patterns. Using these cluster labels as additional features can help the model spot patterns that might not have been obvious in the original data.

**Random Forest Classifier:**
The Random Forest classifier is a strong ensemble method that builds multiple decision trees and combines their predictions. It helps reduce overfitting by averaging results, making it great for both numerical and categorical data. It’s especially useful when there are lots of features, as it can handle complex data without losing much in terms of interpretability or accuracy.

In [11]:
# The income column's values are either greater than, or less or equal to 50k
# So it is suitable for logistic regression model

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
lgr = LogisticRegression(max_iter=1000)

lgr.fit(X_train, y_train)

# Predictions
y_pred = lgr.predict(X_test)

# Evaluation of the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of the logistic regression: {accuracy}")

Accuracy of the logistic regression: 0.8295229645806319
