In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build the numerical features of a synthetic investor dataset:
# Age, Investment_Horizon_Years and Existing_Savings_USD.

# Fixed seed so that Restart Kernel -> Run All reproduces the same dataset.
# np.random.seed() seeds NumPy's global legacy generator, which is the one
# every np.random.* call in this notebook draws from.
SEED = 42
np.random.seed(SEED)

num_users = 1000
data = {}

# --- Generate Numerical Features ---

# 1. Age: uniform integers from 18 to 70 inclusive.
# np.random.randint(low, high, size) samples from [low, high), hence the 71.
data['Age'] = np.random.randint(18, 71, num_users)

# 2. Investment_Horizon_Years: uniform integers from 1 to 40 inclusive.
data['Investment_Horizon_Years'] = np.random.randint(1, 41, num_users)

# 3. Existing_Savings_USD: log-normal, i.e. many modest savers and a long
# right tail of large balances.
# np.random.lognormal(mean, sigma, size) takes the mean/sigma of the
# *underlying normal*, so mean=log(20000) puts the median at $20,000.
raw_savings = np.random.lognormal(mean=np.log(20000), sigma=1.5, size=num_users)

# Round to whole dollars, then clamp to [$1,000, $1,000,000]: the floor sets a
# minimum balance (log-normal draws are never negative) and the cap trims the
# extreme tail. Clamping happens *before* the integer cast so that a very
# large draw cannot overflow a platform whose default int is 32-bit.
data['Existing_Savings_USD'] = np.clip(np.round(raw_savings), 1000, 1000000).astype(int)

In [None]:
# --- Generate Categorical Features ---
# Each feature is drawn independently per user from a fixed discrete
# distribution. The option lists and their weights are declared first; the
# sampling happens in one place below.

# 1. Income_Level: Medium is the most common (50%), then Low (30%), then High (20%).
income_levels = ['Low', 'Medium', 'High']
income_probabilities = [0.30, 0.50, 0.20]

# 2. Risk_Tolerance: Medium (50%) is twice as likely as Low or High (25% each).
risk_tolerances = ['Low', 'Medium', 'High']
risk_probabilities = [0.25, 0.50, 0.25]

# 3. Has_Dependents: 0 = No (40%), 1 = Yes (60%).
has_dependents_options = [0, 1]
dependents_probabilities = [0.40, 0.60]

# Dict values are evaluated top to bottom, so the random stream is consumed in
# the order income -> risk -> dependents.
data.update({
    'Income_Level': np.random.choice(income_levels, size=num_users, p=income_probabilities),
    'Risk_Tolerance': np.random.choice(risk_tolerances, size=num_users, p=risk_probabilities),
    'Has_Dependents': np.random.choice(has_dependents_options, size=num_users, p=dependents_probabilities),
})


Generated Features (including categorical):
   Age  Investment_Horizon_Years  Existing_Savings_USD Income_Level  \
0   42                        34                 39365       Medium   
1   64                        40                 61069         High   
2   56                        28                  6217       Medium   
3   55                        24                  1489       Medium   
4   60                        25                 37083          Low   

  Risk_Tolerance  Has_Dependents  
0         Medium               0  
1            Low               0  
2           High               0  
3         Medium               1  
4         Medium               0  

Value counts for categorical features:
Income_Level:
 Medium    502
Low       307
High      191
Name: Income_Level, dtype: int64

Risk_Tolerance:
 Medium    469
High      275
Low       256
Name: Risk_Tolerance, dtype: int64

Has_Dependents:
 1    612
0    388
Name: Has_Dependents, dtype: int64


In [None]:
# --- Turn the feature dict into a DataFrame (one column per key) so the
# labelling rules can be evaluated one row at a time ---
df = pd.DataFrame.from_dict(data)

# --- Generate Recommended_Investment Label based on Rules ---

def get_investment_recommendation(row):
    """Return the recommended investment product for one investor profile.

    `row` is any mapping-like object (e.g. a DataFrame row) indexable by
    'Age', 'Income_Level', 'Risk_Tolerance', 'Investment_Horizon_Years',
    'Existing_Savings_USD' and 'Has_Dependents'.

    The rules are checked in priority order and the first match wins:

    1. 'Conservative_Bonds' - low risk with a horizon under 10 years, or
       aged 60+ with dependents.
    2. 'Growth_Stocks'      - under 35, high risk, horizon of 15+ years.
    3. 'Balanced_Funds'     - medium risk with a horizon of 5 to 14 years.
    4. 'Real_Estate_ETFs'   - high income, savings of at least 200000,
       risk tolerance other than low.
    5. 'Precious_Metals'    - medium risk with savings below 50000.

    Profiles matching none of the rules fall back to 'Balanced_Funds', so
    every investor receives a label.
    """
    # Read all six fields up front, in a fixed order.
    investor_age, income_band, risk_level, years, balance, has_deps = (
        row[field]
        for field in (
            'Age',
            'Income_Level',
            'Risk_Tolerance',
            'Investment_Horizon_Years',
            'Existing_Savings_USD',
            'Has_Dependents',
        )
    )

    if (risk_level == 'Low' and years < 10) or (investor_age >= 60 and has_deps == 1):
        # Safety first: short-horizon low-risk investors, or older investors with dependents.
        product = 'Conservative_Bonds'
    elif investor_age < 35 and risk_level == 'High' and years >= 15:
        # Young, risk-seeking, with a long time to ride out volatility.
        product = 'Growth_Stocks'
    elif risk_level == 'Medium' and 5 <= years < 15:
        # Medium risk over a medium horizon.
        product = 'Balanced_Funds'
    elif income_band == 'High' and balance >= 200000 and risk_level != 'Low':
        # Enough income and capital for real-estate exposure.
        product = 'Real_Estate_ETFs'
    elif risk_level == 'Medium' and balance < 50000:
        # Medium risk with lower savings.
        product = 'Precious_Metals'
    else:
        # General default when no specific rule applies.
        product = 'Balanced_Funds'

    return product

# Label every investor. axis='columns' (the same as axis=1) passes each row
# to the rule function and collects the returned labels into a new column.
df['Recommended_Investment'] = df.apply(get_investment_recommendation, axis='columns')

# Preview the first few rows, now including the new label column.
print("\nDataFrame with 'Recommended_Investment' labels:", df.head(), sep="\n")

# Show how many investors received each recommendation.
label_counts = df['Recommended_Investment'].value_counts()
print("\nDistribution of Recommended_Investment:", label_counts, sep="\n")