In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load dataset
df = pd.read_csv('new_dataset.csv')
df.head()

Unnamed: 0,career_path,expertise_level,working_frequency,tech_news_frequency,willingness_to_learn,interest_visual_elements,interest_user_behavior,math_background,interest_working_with_data,interest_data_analysis,interest_data_insights,interest_ml_algorithms,interest_server_side,interest_innovation,interest_mobile_apps,concern_digital_security,interest_security_concepts,interest_design_thinking,interest_web
0,Frontend Developer,Intermediate,Weekly,Weekly,Yes,Yes,yes,No,No,No,No,No,No,Yes,No,No,No,Yes,Yes
1,Frontend Developer,Beginner,Weekly,Weekly,Yes,Yes,No,Yes,Yes,No,No,Yes,Yes,Yes,No,Yes,No,Yes,Yes
2,Backend Developer,Expert,Daily,Weekly,Yes,No,No,No,No,Yes,No,No,Yes,Yes,No,Yes,Yes,No,Yes
3,UI/UX Designer,Intermediate,Weekly,Daily,No,Yes,yes,No,No,No,No,No,No,No,No,No,No,No,No
4,Data Scientist,Beginner,Daily,Daily,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,No,No,No,Yes,No


In [3]:
# Encoding categorical features
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [4]:
df.head()

Unnamed: 0,career_path,expertise_level,working_frequency,tech_news_frequency,willingness_to_learn,interest_visual_elements,interest_user_behavior,math_background,interest_working_with_data,interest_data_analysis,interest_data_insights,interest_ml_algorithms,interest_server_side,interest_innovation,interest_mobile_apps,concern_digital_security,interest_security_concepts,interest_design_thinking,interest_web
0,3,2,4,4,1,1,1,0,0,0,0,0,0,1,0,0,0,1,1
1,3,0,4,4,1,1,0,1,1,0,0,1,1,1,0,1,0,1,1
2,0,1,0,4,1,0,0,0,0,1,0,0,1,1,0,1,1,0,1
3,6,2,4,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,1,0


In [5]:
# Split features and target variable
X = df.drop(columns=['career_path'])
y = df['career_path']

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Feature scaling
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [8]:
# Train models
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f'\n{name} Model Evaluation:')
    print(classification_report(y_test, y_pred))


Decision Tree Model Evaluation:
              precision    recall  f1-score   support

           0       0.84      0.76      0.80       270
           1       0.84      0.90      0.87       231
           2       0.86      0.83      0.85       218
           3       0.87      0.87      0.87       243
           4       0.87      0.89      0.88       242
           5       0.83      0.85      0.84       247
           6       0.87      0.90      0.88       233

    accuracy                           0.85      1684
   macro avg       0.85      0.86      0.86      1684
weighted avg       0.85      0.85      0.85      1684


Random Forest Model Evaluation:
              precision    recall  f1-score   support

           0       0.78      0.79      0.79       270
           1       0.89      0.90      0.89       231
           2       0.87      0.85      0.86       218
           3       0.90      0.87      0.89       243
           4       0.92      0.90      0.91       242
           5

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Model Evaluation:
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       270
           1       0.90      0.89      0.90       231
           2       0.86      0.85      0.85       218
           3       0.87      0.88      0.88       243
           4       0.88      0.89      0.89       242
           5       0.86      0.87      0.87       247
           6       0.87      0.88      0.88       233

    accuracy                           0.86      1684
   macro avg       0.86      0.86      0.86      1684
weighted avg       0.86      0.86      0.86      1684



In [9]:
# Compare models
best_model = max(results, key=results.get)
print(f'\nBest Performing Model: {best_model} with Accuracy: {results[best_model]:.4f}')


Best Performing Model: Random Forest with Accuracy: 0.8664


In [10]:
# Display encoded values and their corresponding real values
for col, le in label_encoders.items():
    print(f"\nColumn: {col}")
    for index, class_ in enumerate(le.classes_):
        print(f"{class_} -> {index}")


Column: career_path
Backend Developer -> 0
Cyber Security Engineer -> 1
Data Scientist -> 2
Frontend Developer -> 3
ML Engineer -> 4
Mobile App Developer -> 5
UI/UX Designer -> 6

Column: expertise_level
Beginner -> 0
Expert -> 1
Intermediate -> 2

Column: working_frequency
Daily -> 0
Monthly -> 1
Never -> 2
Rarely -> 3
Weekly -> 4

Column: tech_news_frequency
Daily -> 0
Monthly -> 1
Never -> 2
Rarely -> 3
Weekly -> 4

Column: willingness_to_learn
No -> 0
Yes -> 1

Column: interest_visual_elements
No -> 0
Yes -> 1

Column: interest_user_behavior
No -> 0
yes -> 1

Column: math_background
No -> 0
Yes -> 1

Column: interest_working_with_data
No -> 0
Yes -> 1

Column: interest_data_analysis
No -> 0
Yes -> 1

Column: interest_data_insights
No -> 0
Yes -> 1

Column: interest_ml_algorithms
No -> 0
Yes -> 1

Column: interest_server_side
No -> 0
Yes -> 1

Column: interest_innovation
No -> 0
Yes -> 1

Column: interest_mobile_apps
No -> 0
Yes -> 1

Column: concern_digital_security
No -> 0
Yes ->

In [11]:
# Step 1: Train the Random Forest model separately
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

# Step 2: Make predictions on the test set
y_pred = random_forest_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Model Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Random Forest Model Accuracy: 0.8664

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.79      0.79       270
           1       0.89      0.90      0.89       231
           2       0.87      0.85      0.86       218
           3       0.90      0.87      0.89       243
           4       0.92      0.90      0.91       242
           5       0.84      0.88      0.86       247
           6       0.88      0.88      0.88       233

    accuracy                           0.87      1684
   macro avg       0.87      0.87      0.87      1684
weighted avg       0.87      0.87      0.87      1684



In [12]:
# Use NumPy indexing to select the first sample
sample = X_test[0:1]  # Select the first row as a sample (shape: (1, n_features))
sample_prediction = random_forest_model.predict(sample)

print(f'\nSample Prediction: {sample_prediction[0]}')
print(f'Actual Label: {y_test[0]}')  # Assuming y_test is also a NumPy array


Sample Prediction: 1


KeyError: 0