**Dataset Creation (Artificial Data)**

In this case we just have synthetic data, we can fine tune it and do logic-based mappings (we define rule or weights)

In [7]:
import pandas as pd
import random

# Define possible careers
careers = [
    "Software Engineer", "Data Scientist", "Doctor", "Teacher", "Lawyer",
    "UX Designer", "AI Researcher", "Chartered Accountant", "Psychologist", "Content Creator",
    "Entrepreneur", "Product Manager", "Civil Servant", "Digital Marketer", "Architect",
    "Chemist", "Biotechnologist", "Game Developer", "Financial Analyst", "HR Manager",
    "Journalist", "Professor", "Social Worker", "Musician", "Actor", "Pilot",
    "Nurse", "Statistician", "Cybersecurity Expert", "Astrophysicist"
]

# Skills mapping based on career relevance
career_skills = {
    "Software Engineer": ["Coding", "Maths", "Problem_Solving"],
    "Data Scientist": ["Maths", "Coding", "Logic_Reasoning", "Problem_Solving"],
    "Doctor": ["Biology", "Chemistry", "Public_Speaking", "Problem_Solving"],
    "Teacher": ["Writing", "Public_Speaking", "Creativity", "Leadership"],
    "Lawyer": ["Writing", "Public_Speaking", "Logic_Reasoning", "Problem_Solving"],
    "UX Designer": ["Designing", "Creativity", "Problem_Solving", "Public_Speaking"],
    "AI Researcher": ["Coding", "Maths", "Problem_Solving", "Interest_in_AI"],
    "Chartered Accountant": ["Finance", "Maths", "Logic_Reasoning", "Problem_Solving"],
    "Psychologist": ["Psychology", "Creativity", "Problem_Solving", "Public_Speaking"],
    "Content Creator": ["Writing", "Creativity", "Public_Speaking", "Interest_in_Research"],
    "Entrepreneur": ["Leadership", "Creativity", "Problem_Solving", "Management"],
    "Product Manager": ["Leadership", "Management", "Creativity", "Problem_Solving"],
    "Civil Servant": ["Public_Speaking", "Leadership", "Problem_Solving", "Creativity"],
    "Digital Marketer": ["Creativity", "Problem_Solving", "Writing", "Public_Speaking"],
    "Architect": ["Designing", "Creativity", "Maths", "Problem_Solving"],
    "Chemist": ["Chemistry", "Problem_Solving", "Biology", "Public_Speaking"],
    "Biotechnologist": ["Biology", "Chemistry", "Problem_Solving", "Research"],
    "Game Developer": ["Coding", "Creativity", "Problem_Solving", "Maths"],
    "Financial Analyst": ["Finance", "Maths", "Logic_Reasoning", "Problem_Solving"],
    "HR Manager": ["Management", "Leadership", "Public_Speaking", "Problem_Solving"],
    "Journalist": ["Writing", "Public_Speaking", "Creativity", "Research"],
    "Professor": ["Writing", "Public_Speaking", "Leadership", "Creativity"],
    "Social Worker": ["Creativity", "Public_Speaking", "Problem_Solving", "Leadership"],
    "Musician": ["Creativity", "Public_Speaking", "Problem_Solving"],
    "Actor": ["Creativity", "Public_Speaking", "Problem_Solving"],
    "Pilot": ["Maths", "Problem_Solving", "Public_Speaking", "Leadership"],
    "Nurse": ["Biology", "Problem_Solving", "Public_Speaking", "Leadership"],
    "Statistician": ["Maths", "Logic_Reasoning", "Problem_Solving", "Research"],
    "Cybersecurity Expert": ["Coding", "Maths", "Problem_Solving", "Logic_Reasoning"],
    "Astrophysicist": ["Physics", "Maths", "Problem_Solving", "Research"]
}

# Features (Subjects and Skills)
features = [
    "Maths", "Biology", "Chemistry", "Physics", "Coding", "Writing", "Public_Speaking",
    "Drawing", "Management", "Psychology", "Finance", "Logic_Reasoning", "Creativity",
    "CGPA", "Hobby_Tech", "Hobby_Reading", "Interest_in_AI", "Leadership",
    "Problem_Solving", "Interest_in_Research"
]

# Generate synthetic dataset with more realistic correlations
def generate_student_profile():
    profile = {
        "Maths": random.randint(0, 5),
        "Biology": random.randint(0, 5),
        "Chemistry": random.randint(0, 5),
        "Physics": random.randint(0, 5),
        "Coding": random.randint(0, 5),
        "Writing": random.randint(0, 5),
        "Public_Speaking": random.randint(0, 5),
        "Drawing": random.randint(0, 5),
        "Management": random.randint(0, 5),
        "Psychology": random.randint(0, 5),
        "Finance": random.randint(0, 5),
        "Logic_Reasoning": random.randint(0, 5),
        "Creativity": random.randint(0, 5),
        "CGPA": round(random.uniform(5.0, 10.0), 2),
        "Hobby_Tech": random.randint(0, 1),
        "Hobby_Reading": random.randint(0, 1),
        "Interest_in_AI": random.randint(0, 1),
        "Leadership": random.randint(0, 5),
        "Problem_Solving": random.randint(0, 5),
        "Interest_in_Research": random.randint(0, 5)
    }
    # Randomly assign a career, but we’ll fix this later based on skills
    return profile

# Create 100 student profiles
data = [generate_student_profile() for _ in range(100)]

# Convert to DataFrame
df = pd.DataFrame(data)

# Assign careers based on the skills that best match the career skills mapping
def assign_career(row):
    scores = {}
    for career, skills in career_skills.items():
        match_score = sum([row[skill] for skill in skills if skill in row])
        scores[career] = match_score
    # Get career with the highest match score
    return max(scores, key=scores.get)

df['Career'] = df.apply(assign_career, axis=1)

# Show the first few rows
print(df.head())

# To display the DataFrame with all columns
pd.set_option('display.max_columns', 10)
print(df)

df.to_csv("career_dataset.csv", index=False)


   Maths  Biology  Chemistry  Physics  Coding  ...  Interest_in_AI  \
0      4        2          1        4       5  ...               0   
1      1        5          2        2       2  ...               1   
2      1        3          1        1       4  ...               1   
3      2        1          3        4       0  ...               1   
4      5        2          3        1       2  ...               1   

   Leadership  Problem_Solving  Interest_in_Research          Career  
0           0                2                     5  Game Developer  
1           1                5                     1          Doctor  
2           1                4                     5          Lawyer  
3           4                3                     0   Civil Servant  
4           2                4                     4           Pilot  

[5 rows x 21 columns]
    Maths  Biology  Chemistry  Physics  Coding  ...  Interest_in_AI  \
0       4        2          1        4       5  ...        