In [1]:
from ucimlrepo import fetch_ucirepo 

import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder 
from sklearn.mixture import GaussianMixture 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [3]:
X.shape

(48842, 14)

In [4]:
y.head()

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K


In [5]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [6]:
X_sample = X.sample(n=40000, random_state=42)

y_sample = y.loc[X_sample.index]
# Convert y to a 1D array
y_sample = y_sample.values.ravel()

In [7]:
X_sample = pd.get_dummies(X_sample, drop_first=True)
le = LabelEncoder()
y_sample = le.fit_transform(y_sample)
y_sample = pd.Series(y_sample)
# standardizing features 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_sample)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_sample, test_size = 0.2)

gmm = GaussianMixture(n_components = 2)

gmm.fit(X_train)

y_pred = gmm.predict(X_test)

mapped_y_pred = [0 if label == y_test.mode()[0] else 1 for label in y_pred]







In [8]:
print("Accuracy:", accuracy_score(y_test, mapped_y_pred))
print("Classification Report:\n", classification_report(y_test, mapped_y_pred))

Accuracy: 0.257625
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.01      0.03      3986
           1       0.25      0.98      0.40      2043
           2       0.00      0.00      0.00      1316
           3       0.00      0.00      0.00       655

    accuracy                           0.26      8000
   macro avg       0.20      0.25      0.11      8000
weighted avg       0.34      0.26      0.12      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
