In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Load and clean data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

df = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)
df.dropna(inplace=True)
df.drop(['fnlwgt', 'education'], axis=1, inplace=True)

# Save occupation for later mapping
df['occupation_orig'] = df['occupation']

# 2. One-hot encode
df_encoded = pd.get_dummies(df.drop(columns=['occupation_orig']), drop_first=True)
X = df_encoded.drop('income_>50K', axis=1)
y = df_encoded['income_>50K']

# 3. Normalize
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 4. Separate by income
X_high = X_scaled[y == 1]
X_low = X_scaled[y == 0]
df_high = df[y == 1].reset_index(drop=True)  # original high-income rows

# 5. Select one low-income person
sim_user = X_low.sample(1)

# 6. Compute similarity to high-income users
similarities = cosine_similarity(sim_user, X_high)
top_indices = similarities[0].argsort()[-10:][::-1]  # top 10 matches
top_matches = df_high.iloc[top_indices]

# 7. Recommend occupations from top matches
recommended_jobs = top_matches['occupation_orig'].value_counts()

# 8. Output results
print("👤 Simulated Low-Income User (Standardized):")
print(sim_user.iloc[0].sort_values(ascending=False).head(5))

print("\n🎯 Recommended Occupations Based on Similar High-Income Users:")
for job, count in recommended_jobs.items():
    print(f" - {job} (appears in top {count} matches)")

# Optional: Show original job of the low-income user
original_user_index = X_low.index[sim_user.index[0]]
original_job = df.loc[original_user_index, 'occupation_orig']
print(f"\n👔 Original Job: {original_job}")


👤 Simulated Low-Income User (Standardized):
relationship_Own-child               2.329125
marital-status_Never-married         1.431058
race_White                           0.413020
native-country_United-States         0.340954
native-country_Holand-Netherlands   -0.005542
Name: 13608, dtype: float64

🎯 Recommended Occupations Based on Similar High-Income Users:
 - Adm-clerical (appears in top 3 matches)
 - Prof-specialty (appears in top 3 matches)
 - ? (appears in top 1 matches)
 - Craft-repair (appears in top 1 matches)
 - Other-service (appears in top 1 matches)
 - Exec-managerial (appears in top 1 matches)

👔 Original Job: Machine-op-inspct
