In [46]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# 1. Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

df = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)



In [47]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [48]:
df.income.value_counts()

income
<=50K    24720
>50K      7841
Name: count, dtype: int64

In [49]:
# 2. Basic cleaning
df.dropna(inplace=True)  # Remove missing values

# Drop redundant or highly correlated features
df.drop(['fnlwgt', 'education'], axis=1, inplace=True)

# Save occupation categories
occupation_cats = df['occupation'].unique().tolist()

# Encode categorical variables
df_encoded = pd.get_dummies(df, drop_first=True)

# Separate features and target
X = df_encoded.drop('income_>50K', axis=1)
y = df_encoded['income_>50K']

# # 3. Normalize features for cosine similarity
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [53]:
# Train classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Simulate a user from low-income group
X_low_income = X[y == 0]
sample_user = X_low_income.sample(1, random_state=42).copy()

# Get all occupation columns
occupation_cols = [col for col in X.columns if col.startswith('occupation_')]

# Set all occupation fields to 0
original_occupation = sample_user[occupation_cols].copy()

# Store results
job_results = []

for col in occupation_cols:
    modified_user = sample_user.copy()
    
    # Reset all jobs
    modified_user[occupation_cols] = 0
    modified_user[col] = 1  # Set this job

    prob = clf.predict_proba(modified_user)[0][1]  # Probability of >50K
    job_results.append((col.replace('occupation_', ''), prob))

# Sort by probability
job_results.sort(key=lambda x: -x[1])

# Show top 5 recommendations
print("Job Change Recommendations (based on income probability):")
for job, prob in job_results[:5]:
    print(f" - {job:25s}: predicted >50K income probability = {prob:.2f}")

# Show original job and its prediction
original_job = original_occupation.idxmax(axis=1).values[0].replace('occupation_', '')
original_prob = clf.predict_proba(sample_user)[0][1]
print(f"\n👤 Original job: {original_job} — predicted income >50K probability: {original_prob:.2f}")


Job Change Recommendations (based on income probability):
 - Exec-managerial          : predicted >50K income probability = 0.40
 - Prof-specialty           : predicted >50K income probability = 0.15
 - Protective-serv          : predicted >50K income probability = 0.14
 - Other-service            : predicted >50K income probability = 0.05
 - Adm-clerical             : predicted >50K income probability = 0.04

👤 Original job: Farming-fishing — predicted income >50K probability: 0.04


### Another Approach

In [27]:
# 4. Separate high-income and low-income groups
X_high = X[y == 1].copy()
X_low = X[y == 0].copy()

In [28]:
# 5. Pick a random low-income user to simulate
sim_user = X_low.sample(1, random_state=42)
sim_user

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
3326,32,10,0,1590,54,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False


In [29]:
# 6. Compute similarity to high-income users
similarities = cosine_similarity(sim_user, X_high)
top_indices = similarities[0].argsort()[-5:][::-1]  # Top 5 most similar

top_matches = X_high.iloc[top_indices]
top_matches


Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
16025,38,13,0,1902,65,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
405,41,14,0,1977,65,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
22000,46,15,0,2415,80,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
5965,42,13,0,1977,70,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
10627,37,9,0,1848,65,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [30]:
# 7. Compare feature differences
sim_user_values = sim_user.iloc[0]
top_avg = top_matches.mean()

recommendations = (top_avg - sim_user_values).sort_values(ascending=False)
recommendations

capital-loss                         433.8
hours-per-week                        15.0
age                                    8.8
education-num                          2.8
marital-status_Married-civ-spouse      1.0
                                     ...  
native-country_Yugoslavia              0.0
race_White                            -0.2
workclass_Private                     -0.8
relationship_Not-in-family            -1.0
occupation_Farming-fishing            -1.0
Length: 84, dtype: object

In [31]:
sim_user

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
3326,32,10,0,1590,54,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False


In [32]:
# 8. Output top suggested changes
print("\n Simulated Low-Income User Profile (Standardized):")
print(sim_user.iloc[0].sort_values(ascending=False).head(10))



 Simulated Low-Income User Profile (Standardized):
capital-loss                     1590
hours-per-week                     54
age                                32
education-num                      10
relationship_Not-in-family       True
occupation_Farming-fishing       True
native-country_United-States     True
race_White                       True
workclass_Private                True
native-country_India            False
Name: 3326, dtype: object


In [33]:
print("\n Top Feature Recommendations (Standardized Shift):")
print(recommendations.head(10))



 Top Feature Recommendations (Standardized Shift):
capital-loss                         433.8
hours-per-week                        15.0
age                                    8.8
education-num                          2.8
marital-status_Married-civ-spouse      1.0
sex_Male                               1.0
workclass_Self-emp-not-inc             0.8
occupation_Exec-managerial             0.8
race_Asian-Pac-Islander                0.2
occupation_Prof-specialty              0.2
dtype: object


In [34]:
# Optional: Map back to original unscaled version
print("\n Interpreting Recommendation Example:")
for feature in recommendations.head(5).index:
    original_val = scaler.inverse_transform(sim_user)[0][X.columns.get_loc(feature)]
    suggested_val = scaler.inverse_transform([top_avg])[0][X.columns.get_loc(feature)]
    print(f" - {feature}: from {original_val:.2f} to {suggested_val:.2f}")



 Interpreting Recommendation Example:
 - capital-loss: from 640784.21 to 815585.67
 - hours-per-week: from 707.19 to 892.40
 - age: from 475.07 to 595.10
 - education-num: from 35.81 to 43.01
 - marital-status_Married-civ-spouse: from 0.46 to 0.96
