In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Load and clean data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

df = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)
df.dropna(inplace=True)
df.drop(['fnlwgt', 'education'], axis=1, inplace=True)

# Save occupation for later mapping
df['occupation_orig'] = df['occupation']
df

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,occupation_orig
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,Adm-clerical
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Exec-managerial
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,Handlers-cleaners
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,Handlers-cleaners
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Prof-specialty
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,Tech-support
32557,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,Machine-op-inspct
32558,58,Private,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,Adm-clerical
32559,22,Private,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,Adm-clerical


In [2]:
# 2. One-hot encode
df_encoded = pd.get_dummies(df.drop(columns=['occupation_orig']), drop_first=True)
X = df_encoded.drop('income_>50K', axis=1)
y = df_encoded['income_>50K']


In [3]:
# 3. Normalize
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [4]:
# 4. Separate by income
X_high = X_scaled[y == 1]
X_low = X_scaled[y == 0]
df_high = df[y == 1].reset_index(drop=True)  # original high-income rows


In [5]:
# 5. Select one low-income person
sim_user = X_low.sample(1)


In [6]:
# 6. Compute similarity to high-income users
similarities = cosine_similarity(sim_user, X_high)
top_indices = similarities[0].argsort()[-10:][::-1]  # top 10 matches
top_matches = df_high.iloc[top_indices]


In [7]:
# 7. Recommend occupations from top matches
recommended_jobs = top_matches['occupation_orig'].value_counts()


In [8]:
# 8. Output results
print("Simulated Low-Income User (Standardized):")
print(sim_user.iloc[0].sort_values(ascending=False).head(5))

print("\nRecommended Occupations Based on Similar High-Income Users:")
for job, count in recommended_jobs.items():
    print(f" - {job} (appears in top {count} matches)")

# Show original job of the low-income user
original_user_index = X_low.index[sim_user.index[0]]
original_job = df.loc[original_user_index, 'occupation_orig']
print(f"\nOriginal Job: {original_job}")

Simulated Low-Income User (Standardized):
occupation_Craft-repair         2.635080
relationship_Not-in-family      1.708991
marital-status_Never-married    1.431058
workclass_Private               0.659286
race_White                      0.413020
Name: 4362, dtype: float64

Recommended Occupations Based on Similar High-Income Users:
 - Craft-repair (appears in top 10 matches)

Original Job: Craft-repair
