# Imports

In [1]:
#Import of relevant libraries, classes or methods (in order of apperance)
import os
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import joblib
import shap
import json
import time
import pickle

In [2]:
# Get the current notebook directory
current_dir = os.getcwd()

# Go one level up and build the path to the data folder

data_path_df = os.path.join(current_dir, "..", "data", 
                         "kaggle_survey_2020_responses.csv")

data_path_encoder_assignment = os.path.join(current_dir, "..", "data", 
                         "encoder_assignment.csv")

data_path_unique_with_rank = os.path.join(current_dir, "..", "data", 
                         "unique_with_rank.csv")


# Load the CSV files and instantiate DataFrames

df                 = pd.read_csv(data_path_df,
                                 dtype = 'str')

encoder_assignment = pd.read_csv(data_path_encoder_assignment,
                                 sep = ';')


unique_with_rank = pd.read_csv(data_path_unique_with_rank, 
                               dtype = 'str',
                               sep = ';',
                               na_values=['nan', '', 'NaN'])

# Preparation

In [3]:
# Merge the question numbers with the question text for better readibility
 # We will shorten the column names later.

col_index = df.columns.tolist()
# print(col_index)

questions = df.iloc[0,:].tolist()
# print(questions)

# Generate the composed index in a new list. Use a list comprehenshion with built in zip function:
col_index_new = [f"{col}_{val}" for col, val in zip(col_index, questions)]

#  print(col_index_new)

# Step 2: Assign the new index to the DataFrame
df.columns = col_index_new

# Step 3: Delete the first row (no longer used)
df = df.iloc[1:,]

In [4]:
# Within the 'Selected Choice'-columns, it is better to transform the values
# into Booleans, because they are better to count and better to modell.
# All others are strings. An NaN is interpreted as "z_not selected"

mark_bool = '- Selected Choice -'                     # If the questions have "- Seleceted choice - ", the answer ist yes or no (Boolean)


for col in df.columns:                                # NaNs are interpreted as False
    if mark_bool in col:
        df[col] = df[col].apply(lambda x:
                                False if pd.isna(x)
                                else True)            # If it's not a NaN, the answer is given. We replace it with a True
    else:
        df[col] = df[col].fillna('z_Not selected')     # NaNs are interpreted as "z_Not selected". z as a prefix to secure that it is always on the right hand side of the plot.
        df[col] = df[col].astype(str)                  # The whole columns is transformed into a string

In [5]:
# We now choose the questions of boolean type and assign them do a DataFram df_bool::

# Questions with boolean character:

col_bool = ['^Q7', '^Q9', '^Q14', '^Q16', '^Q17', '^Q18', '^Q19',
            '^Q26_A', '^Q27_A', '^Q28_A', '^Q29_A', '^Q31_A', '^Q33_A', '^Q34_A', '^Q35_A', '^Q37']

# Filter rules
regex_pattern_bool = '|'.join(col_bool)

# Filter all Boolean columns
df_bool = df.filter(regex=regex_pattern_bool)

In [6]:
# Define a pattern to sum up the relevant columns

# Mapping Regex → Columns for summing up
sum_columns = {
    '^Q7':    'Q7_No. of Regular used programming languages?',
    '^Q9':    'Q9_No. of Specialized IDE?',
    '^Q14':   'Q14_No. of DataViz Libs or Tools?',
    '^Q16':   'Q16_No. of ML Framworks?',
    '^Q17':   'Q17_No. of ML algorithms?',
    '^Q18':   'Q18_No. of Computer Vsion methods?',
    '^Q19':   'Q19_No. of NLP methods?',
    '^Q26_A': 'Q26_A No. of Current Cloud platforms?',
    '^Q27_A': 'Q27_A No. of Current Cloud Products?',
    '^Q28_A': 'Q28_A No. of Current ML products?',
    '^Q29_A': 'Q29_A No. of Big Data Tools?',
    '^Q31_A': 'Q31_A No. of BI Tools used?',
    '^Q33_A': 'Q33_A No. of Automted ML Tools?',
    '^Q34_A': 'Q34_A No. of Auto ML Tools?',
    '^Q35_A': 'Q35_A No. of ML Experiment Management?',
    '^Q37':   'Q37_No. of Learning Platforms?'

}

for regex, new_col in sum_columns.items():
    matching_cols = df.filter(regex=regex).columns                                         # Select all columns that match
    valid_cols = [col for col in matching_cols if "- Selected Choice - None" not in col]   # Exclude columns with "none"
    df_bool[new_col] = df[valid_cols].sum(axis=1)                                          # sum up valid columns


  df_bool[new_col] = df[valid_cols].sum(axis=1)                                          # sum up valid columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bool[new_col] = df[valid_cols].sum(axis=1)                                          # sum up valid columns
  df_bool[new_col] = df[valid_cols].sum(axis=1)                                          # sum up valid columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bool[new_col] = df[valid_cols].sum(axis=1)                                          # sum up valid columns
  df_bool[new_col] = df[va

In [7]:
# We now choose the questions of string type (multiple choice) and assign them do a DataFram df_string::

col_string = ['Q1_What is your age (# years)?',
              'Q4_What is the highest level of formal education that you have attained or plan to attain within the next 2 years?',
              'Q5_Select the title most similar to your current role (or most recent title if retired): - Selected Choice',
              'Q6_For how many years have you been writing code and/or programming?',
              'Q8_What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice',
              'Q13_Approximately how many times have you used a TPU (tensor processing unit)?',
              'Q15_For how many years have you used machine learning methods?',
              'Q20_What is the size of the company where you are employed?',
              'Q22_Does your current employer incorporate machine learning methods into their business?',
              'Q24_What is your current yearly compensation (approximate $USD)?',
              'Q30_Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? - Selected Choice',
              'Q32_Which of the following business intelligence tools do you use most often? - Selected Choice']

# Filter rules
regex_pattern_str = '|'.join(col_string)

# Filter all Boolean columns
df_string = df[[col for col in col_string if col in df.columns]]

In [8]:
# OK, this is confusing but effective code. To be corrected later.

# Q5_Select the title most similar to your current role
old_name = df_string.columns[2]
new_name = "Q5_Select the title most similar to your current role"
df_string.rename(columns={old_name: new_name}, inplace=True)

# Q8_What programming language would you recommend an aspiring data scientist to learn first?
old_name = df_string.columns[4]
new_name = "Q8_What programming language would you recommend an aspiring data scientist to learn first?"
df_string.rename(columns={old_name: new_name}, inplace=True)

# Q30_Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?
old_name = df_string.columns[10]
new_name = "Q30_Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?"
df_string.rename(columns={old_name: new_name}, inplace=True)

# Q32_Which of the following business intelligence tools do you use most often?
old_name = df_string.columns[11]
new_name = 'Q32_Which of the following business intelligence tools do you use most often?'
df_string.rename(columns={old_name: new_name}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_string.rename(columns={old_name: new_name}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_string.rename(columns={old_name: new_name}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_string.rename(columns={old_name: new_name}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

In [9]:
# We merge qustions with boolean and string type into one DataFrame, then we drop
# columns with "- Selceted - " answers.

df_short = pd.concat([df_string, df_bool], axis=1)
df_short = df_short.drop(columns=df_short.filter(regex='Selected').columns)
df_short.columns

Index(['Q1_What is your age (# years)?',
       'Q4_What is the highest level of formal education that you have attained or plan to attain within the next 2 years?',
       'Q5_Select the title most similar to your current role',
       'Q6_For how many years have you been writing code and/or programming?',
       'Q8_What programming language would you recommend an aspiring data scientist to learn first?',
       'Q13_Approximately how many times have you used a TPU (tensor processing unit)?',
       'Q15_For how many years have you used machine learning methods?',
       'Q20_What is the size of the company where you are employed?',
       'Q22_Does your current employer incorporate machine learning methods into their business?',
       'Q24_What is your current yearly compensation (approximate $USD)?',
       'Q30_Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?',
       'Q32_Which of the following business 

In [10]:
# We delete all rows with entries ['Other', 'Student', 'z_Not selected', 'Currently not employed ']
df_heat = df_short[~df_short['Q5_Select the title most similar to your current role'].isin(
    ['Other', 'Student', 'z_Not selected', 'Currently not employed'])]

In [11]:
# Preparing Columns transformation

## Assign colums to encoder type:

columns_to_keep = [col for col in encoder_assignment["column"] if col in df_short.columns]
lab_columns  = encoder_assignment.query("encoder == 'lab'")["column"].tolist()
ohe_columns  = encoder_assignment.query("encoder == 'ohe'")["column"].tolist()
ord_columns  = encoder_assignment.query("encoder == 'ord'")["column"].tolist()


## Generate order for categories in OrderEncoder
### Initial list


unique_with_rank.columns = unique_with_rank.columns.str.replace(r'\s+', ' ',
                                                                regex=True).str.strip()
### Clean list from NaNs

def clean_float_strings(val):
    # Werte, die echte NaN darstellen sollen
    if val in ['nan', 'NaN', '']:
        return np.nan
    try:
        f = float(val)
        if f.is_integer():
            return str(int(f))  #  '0.0' → '0'
        return str(f)           #  '2.5' bleibt '2.5'
    except:
        return val             # Text remains text

unique_with_rank_cl = unique_with_rank.applymap(clean_float_strings)

### Create finale list of ordered categories for OrdinalEncoder:

unique_dict = {}

for col in unique_with_rank_cl.columns:
    cats = unique_with_rank_cl[col].dropna().unique().tolist()   # Extract all values without NaNa and remove duplicates
    cats = [cat for cat in cats if cat != 'nan']                 # Remove nan as string
    unique_dict[col] = cats

### Final list for Ordninal Encoder

categories = list(unique_dict.values())

  unique_with_rank_cl = unique_with_rank.applymap(clean_float_strings)


# Generate Data Sets

In [12]:
# Data Set S for classical Data Science Roles vs. Software Engeneering

## We now reduce the data set to the above mentioned roles only:
selected_roles_S = [
     'Data Scientist',
        'Software Engineer',
        'Data Analyst'
        ]

df_heat_S = df_heat.loc[
    df_heat.iloc[:, 2].isin(selected_roles_S)
    ]

## Eliminate the target variables. X represents the matrix of explanatory variables for the ML model.
X_S = df_heat_S.drop(lab_columns, axis=1)

## Define y as the target for the ML model.
y_S = df_heat_S[lab_columns[0]]

## Split Train- and Test-Set:
X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(X_S,
                                                            y_S,
                                                            test_size=0.2,
                                                            random_state=42)

In [13]:
# Data Set L for for Carrer paths Data Science vx. Tech

## List of job roles we want to include in the filtered DataFrame:
selected_roles_L = [
    'Data Scientist',
    'Software Engineer',
    'Data Analyst',
    'Research Scientist',
    'Machine Learning Engineer',
    'Data Engineer',
    'DBA/Database Engineer'
]

## Re-arragen the target columns such that we include the career path:
role_column = 'Q5_Select the title most similar to your current role'
df_heat_L = df_heat[df_heat[role_column].isin(selected_roles_L)]                 # Filter the original DataFrame to only include the selected roles
def role_DS(dframe, col_select):                                                 # Function to classify each role into a broader role group: DS, Tech, or Business
    ds_roles = [
        'Data Scientist',
        'Data Analyst',
        'Machine Learning Engineer',
        'Data Engineer',
        'Research Scientist'
    ]
    tech_roles = [
        'Software Engineer',
        'DBA/Database Engineer'
    ]
    def map_role(role):                                                          # Internal helper function to map job title to role group
      if role in ds_roles:
        return 'DS'
      elif role in tech_roles:
        return 'Tech'
      else:
        return 'Other'
    return dframe[col_select].apply(map_role)                                    # Apply the role mapping to the selected column

df_heat_L['role_group'] = role_DS(df_heat_L, role_column)                        # Add a new column to the DataFrame with the role group classification
df_heat_L = df_heat_L.drop(columns=[role_column])                                # Drop the old columns and ...
col_to_insert = df_heat_L.pop('role_group')                                      # ... insert the new one.
df_heat_L.insert(2, role_column, col_to_insert)


## Eliminate the target variables. X represents the matrix of explanatory variables for the ML model.
X_L = df_heat_L.drop(lab_columns, axis=1)

# Define y as the target for the ML model.
y_L = df_heat_L[lab_columns[0]]

## Split Train- and Test-Set:
X_train_L, X_test_L, y_train_L, y_test_L = train_test_split(X_L,
                                                            y_L,
                                                            test_size=0.2,
                                                            random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_heat_L['role_group'] = role_DS(df_heat_L, role_column)                        # Add a new column to the DataFrame with the role group classification


# Data preprocessing

In [14]:
# Apply Label Encoder for target

lab_L = LabelEncoder()
lab_S = LabelEncoder()


y_train_L = pd.Series(y_train_L)
y_test_L = pd.Series(y_test_L)
y_train_L = lab_L.fit_transform(y_train_L)
y_test_L = lab_L.transform(y_test_L)

y_train_S = pd.Series(y_train_S)
y_test_S = pd.Series(y_test_S)
y_train_S = lab_S.fit_transform(y_train_S)
y_test_S = lab_S.transform(y_test_S)


# Backward Transformation of targets of L
y_train_L_original_labels = lab_L.inverse_transform(y_train_L)
y_test_L_original_labels = lab_L.inverse_transform(y_test_L)


# Backward Transformation of targets of S
y_train_S_original_labels = lab_S.inverse_transform(y_train_S)
y_test_S_original_labels = lab_S.inverse_transform(y_test_S)

In [15]:
ohe_transformer = Pipeline([
    ('ohe',   OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

ord_transformer = Pipeline([

    ('ord', OrdinalEncoder(categories=categories, handle_unknown='use_encoded_value', unknown_value=-1))
])


preprocessor = ColumnTransformer([
    ('ohe', ohe_transformer, ohe_columns),
    ('ord', ord_transformer, ord_columns)
])



# Model Pipelines

## RandomForrest

In [16]:
pipe_rfc = Pipeline([
    ('preprocessing', preprocessor),
    ('rfc',  RandomForestClassifier(max_depth=50,
                                   random_state=42,
                                   n_estimators = 100,
                                  class_weight='balanced'  # this is crucial
                            ))
])

pipe_rfc_S = clone(pipe_rfc)
pipe_rfc_L = clone(pipe_rfc)

pipe_rfc_S.fit(X_train_S, y_train_S)
pipe_rfc_L.fit(X_train_L, y_train_L)


#---Predict and evaluate---#

y_rfc_S = pipe_rfc_S.predict(X_test_S)
y_rfc_L = pipe_rfc_L.predict(X_test_L)

print("Results for data set S with RFC (Data Science Roles)\n", classification_report(y_test_S, y_rfc_S, target_names=lab_S.classes_, digits=4))
print("\nResults for data set L  With RFC (General career paths)\n", classification_report(y_test_L, y_rfc_L, target_names=lab_L.classes_, digits=4))

Results for data set S with RFC (Data Science Roles)
                    precision    recall  f1-score   support

     Data Analyst     0.5755    0.4053    0.4756       301
   Data Scientist     0.6471    0.7917    0.7121       528
Software Engineer     0.6803    0.6304    0.6544       395

         accuracy                         0.6446      1224
        macro avg     0.6343    0.6091    0.6140      1224
     weighted avg     0.6402    0.6446    0.6353      1224


Results for data set L  With RFC (General career paths)
               precision    recall  f1-score   support

          DS     0.8174    0.9613    0.8835      1369
        Tech     0.7022    0.2983    0.4188       419

    accuracy                         0.8059      1788
   macro avg     0.7598    0.6298    0.6511      1788
weighted avg     0.7904    0.8059    0.7746      1788





In [17]:
# Define Paramter Space for RandomForestClassifier:

param_dist_rfc = {
    'rfc__n_estimators': [100, 200, 300],             # No. of trres
    'rfc__max_depth': [10, 30, 50, None],             # Max. Depth with none as no limit
    'rfc__min_samples_split': [2, 5, 10],             # Split of Trre
    'rfc__min_samples_leaf': [1, 2, 4],               # Minuum amout of leafs
    'rfc__max_features': ['sqrt', 'log2', None],      # Features for split
    'rfc__bootstrap': [True, False]                   # Bootstrapping on/off
}


# RandomizedSearchCV für Set S
random_search_rfc_S = RandomizedSearchCV(
    clone(pipe_rfc),
    param_distributions=param_dist_rfc,
    n_iter=10,
    cv=3,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# RandomizedSearchCV für Set L
random_search_rfc_L = RandomizedSearchCV(
    clone(pipe_rfc),
    param_distributions=param_dist_rfc,
    n_iter=10,
    cv=3,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Fit auf Trainingsdaten
random_search_rfc_S.fit(X_train_S, y_train_S)
random_search_rfc_L.fit(X_train_L, y_train_L)

# Vorhersagen
y_pred_rfc_S = random_search_rfc_S.predict(X_test_S)
y_pred_rfc_L = random_search_rfc_L.predict(X_test_L)

# Auswertung
from sklearn.metrics import classification_report

print("Results for data set S (RFC)\n",
      classification_report(y_test_S, y_pred_rfc_S, target_names=lab_S.classes_, digits=4))

print("\nResults for data set L (RFC)\n",
      classification_report(y_test_L, y_pred_rfc_L, target_names=lab_L.classes_, digits=4))

# Beste Parameter
print("Best parameters for S (RFC):", random_search_rfc_S.best_params_)
print("Best parameters for L (RFC):", random_search_rfc_L.best_params_)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits




Results for data set S (RFC)
                    precision    recall  f1-score   support

     Data Analyst     0.5593    0.5482    0.5537       301
   Data Scientist     0.6934    0.7197    0.7063       528
Software Engineer     0.6850    0.6608    0.6727       395

         accuracy                         0.6585      1224
        macro avg     0.6459    0.6429    0.6442      1224
     weighted avg     0.6577    0.6585    0.6579      1224


Results for data set L (RFC)
               precision    recall  f1-score   support

          DS     0.8656    0.8612    0.8634      1369
        Tech     0.5540    0.5632    0.5586       419

    accuracy                         0.7914      1788
   macro avg     0.7098    0.7122    0.7110      1788
weighted avg     0.7926    0.7914    0.7920      1788

Best parameters for S (RFC): {'rfc__n_estimators': 200, 'rfc__min_samples_split': 10, 'rfc__min_samples_leaf': 1, 'rfc__max_features': 'log2', 'rfc__max_depth': None, 'rfc__bootstrap': True}
Best 

## Gradient Booster

In [18]:
# Create the HGB pipeline
pipe_hgb = Pipeline([
    ('preprocessing', preprocessor),  # preprocessing step
    ('hgb', HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_iter=200,
        max_depth=6,
        min_samples_leaf=20,
        l2_regularization=1.0,
        random_state=42
    ))
])

# Clone the pipeline for both datasets
pipe_hgb_S = clone(pipe_hgb)
pipe_hgb_L = clone(pipe_hgb)

# Fit the pipeline to training data
pipe_hgb_S.fit(X_train_S, y_train_S)
pipe_hgb_L.fit(X_train_L, y_train_L)


# Predict on test sets
y_hgb_S = pipe_hgb_S.predict(X_test_S)
y_hgb_L = pipe_hgb_L.predict(X_test_L)

# Print results
print("Results for data set S with HistGradientBoosting (Data Science Roles)\n",
      classification_report(y_test_S, y_hgb_S, target_names=lab_S.classes_, digits=4))

print("\nResults for data set L with HistGradientBoosting (General career paths)\n",
      classification_report(y_test_L, y_hgb_L, target_names=lab_L.classes_, digits=4))

Results for data set S with HistGradientBoosting (Data Science Roles)
                    precision    recall  f1-score   support

     Data Analyst     0.5659    0.4850    0.5224       301
   Data Scientist     0.6840    0.7216    0.7023       528
Software Engineer     0.6797    0.7038    0.6915       395

         accuracy                         0.6577      1224
        macro avg     0.6432    0.6368    0.6387      1224
     weighted avg     0.6536    0.6577    0.6546      1224


Results for data set L with HistGradientBoosting (General career paths)
               precision    recall  f1-score   support

          DS     0.8349    0.9196    0.8752      1369
        Tech     0.6071    0.4057    0.4864       419

    accuracy                         0.7992      1788
   macro avg     0.7210    0.6627    0.6808      1788
weighted avg     0.7815    0.7992    0.7841      1788





In [19]:
# Define Paramter Space for Gradient Booster:
param_dist_hgb = {
    'hgb__learning_rate': [0.01, 0.05, 0.1],          # shrinkage step size
    'hgb__max_iter': [100, 200, 300],                 # number of boosting iterations
    'hgb__max_depth': [3, 6, 9],                      # maximum depth of individual trees
    'hgb__min_samples_leaf': [10, 20, 30],            # minimum samples per leaf
    'hgb__l2_regularization': [0.0, 1.0, 10.0]         # L2 penalty for regularization
}


from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import clone

# Randomized search for dataset S
random_search_hgb_S = RandomizedSearchCV(
    clone(pipe_hgb),                     # avoid shared state
    param_distributions=param_dist_hgb,  # defined param space
    n_iter=10,                           # number of combinations
    cv=3,                                # 3-fold CV
    scoring='f1_macro',                  # macro F1 score
    random_state=42,
    n_jobs=-1,                           # use all CPUs
    verbose=1
)

# Randomized search for dataset L
random_search_hgb_L = RandomizedSearchCV(
    clone(pipe_hgb),
    param_distributions=param_dist_hgb,
    n_iter=10,
    cv=3,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
    verbose=1
)


# Fit search objects to training data
random_search_hgb_S.fit(X_train_S, y_train_S)
random_search_hgb_L.fit(X_train_L, y_train_L)

# Predict on test sets using best models
y_pred_hgb_S = random_search_hgb_S.predict(X_test_S)
y_pred_hgb_L = random_search_hgb_L.predict(X_test_L)

# Print classification reports
from sklearn.metrics import classification_report

print("Results for data set S after Randomized Search - HGB (Data Science Roles)\n",
      classification_report(y_test_S, y_pred_hgb_S, target_names=lab_S.classes_, digits=4))

print("\nResults for data set L after Randomized Search - HGB (General career paths)\n",
      classification_report(y_test_L, y_pred_hgb_L, target_names=lab_L.classes_, digits=4))

# Print best parameters
print("Best parameters for S (HGB):", random_search_hgb_S.best_params_)
print("Best parameters for L (HGB):", random_search_hgb_L.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results for data set S after Randomized Search - HGB (Data Science Roles)
                    precision    recall  f1-score   support

     Data Analyst     0.5756    0.4551    0.5083       301
   Data Scientist     0.6911    0.7500    0.7193       528
Software Engineer     0.6828    0.7139    0.6980       395

         accuracy                         0.6658      1224
        macro avg     0.6498    0.6397    0.6419      1224
     weighted avg     0.6600    0.6658    0.6606      1224


Results for data set L after Randomized Search - HGB (General career paths)
               precision    recall  f1-score   support

          DS     0.8446    0.9131    0.8775      1369
        Tech     0.6136    0.4511    0.5199       419

    accuracy                         0.8048      1788
   macro avg     0.7291    0.6821    0.6987      1788
weighted avg     0.7905    0.8048    



## XGBoost

In [20]:
# Create the XGBoost pipeline
pipe_xgb = Pipeline([
    ('preprocessing', preprocessor),
    ('xgb', XGBClassifier(
    learning_rate=0.05,
    n_estimators=200,
    max_depth=6,
    reg_lambda=1.0,
    min_child_weight=20,
    objective='multi:softmax',
    num_class=4,
    eval_metric='mlogloss',
    random_state=42
))
])

pipe_xgb_S = clone(pipe_xgb)
pipe_xgb_L = clone(pipe_xgb)

pipe_xgb_S.fit(X_train_S, y_train_S)
pipe_xgb_L.fit(X_train_L, y_train_L)



#---Predict and evaluate---#

y_xgb_S = pipe_xgb_S.predict(X_test_S)
y_xgb_L = pipe_xgb_L.predict(X_test_L)

print("Results for data set S with XGBoost (Data Science Roles)\n", classification_report(y_test_S, y_xgb_S, target_names=lab_S.classes_, digits=4))
print("\nResults for data set L  With XGBoost (General career paths)\n", classification_report(y_test_L, y_xgb_L, target_names=lab_L.classes_, digits=4))

Results for data set S with XGBoost (Data Science Roles)
                    precision    recall  f1-score   support

     Data Analyst     0.6091    0.4917    0.5441       301
   Data Scientist     0.6949    0.7462    0.7196       528
Software Engineer     0.6763    0.7089    0.6922       395

         accuracy                         0.6716      1224
        macro avg     0.6601    0.6489    0.6520      1224
     weighted avg     0.6678    0.6716    0.6676      1224


Results for data set L  With XGBoost (General career paths)
               precision    recall  f1-score   support

          DS     0.8392    0.9153    0.8756      1369
        Tech     0.6068    0.4272    0.5014       419

    accuracy                         0.8009      1788
   macro avg     0.7230    0.6712    0.6885      1788
weighted avg     0.7848    0.8009    0.7879      1788





In [21]:
# Define parameter grid for RandomizedSearchCV
param_dist = {
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__n_estimators': [100, 200, 300],
    'xgb__max_depth': [3, 6, 9],
    'xgb__reg_lambda': [0.5, 1.0, 2.0],
    'xgb__min_child_weight': [1, 10, 20]
}

# Create RandomizedSearchCV objects for each dataset
random_search_S = RandomizedSearchCV(
    clone(pipe_xgb),          # clone to avoid shared state
    param_distributions=param_dist,
    n_iter=10,                # number of parameter settings sampled
    cv=3,                     # 3-fold cross-validation
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,                # use all CPU cores
    verbose=1
)

random_search_L = RandomizedSearchCV(
    clone(pipe_xgb),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Fit the RandomizedSearchCV to training data
random_search_S.fit(X_train_S, y_train_S)
random_search_L.fit(X_train_L, y_train_L)

# Predict on test sets using best found models
y_pred_S = random_search_S.predict(X_test_S)
y_pred_L = random_search_L.predict(X_test_L)

# Print classification reports with original label names
print("Results for data set S after Randomized Serach  - XGB (Data Science Roles)\n",
      classification_report(y_test_S, y_pred_S, target_names=lab_S.classes_, digits=4))

print("\nResults for data set L after Randomized Serach - XGB (General career paths)\n",
      classification_report(y_test_L, y_pred_L, target_names=lab_L.classes_, digits=4))

# Optionally print best parameters found by the search
print("Best parameters for S (XGB):", random_search_S.best_params_)
print("Best parameters for L:(XGB)", random_search_L.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Results for data set S after Randomized Serach  - XGB (Data Science Roles)
                    precision    recall  f1-score   support

     Data Analyst     0.5907    0.4651    0.5204       301
   Data Scientist     0.6850    0.7538    0.7178       528
Software Engineer     0.6872    0.7063    0.6966       395

         accuracy                         0.6675      1224
        macro avg     0.6543    0.6417    0.6449      1224
     weighted avg     0.6625    0.6675    0.6624      1224


Results for data set L after Randomized Serach - XGB (General career paths)
               precision    recall  f1-score   support

          DS     0.8448    0.9145    0.8783      1369
        Tech     0.6176    0.4511    0.5214       419

    accuracy                         0.8059      1788
   macro avg     0.7312    0.6828    0.6998      1788
weighted avg     0.7916    0.8059   



# Final ML Model: Optimized XGBoost

In [22]:
# Define the initial pipeline with default XGBoost parameters
pipe_xgb = Pipeline([
    ('preprocessing', preprocessor), # First step: apply preprocessing transformations
    ('xgb', XGBClassifier(
        # These parameters will be overridden later with the best found parameters.
        # They serve as initial defaults or fallback values.
        learning_rate=0.05,
        n_estimators=200,
        max_depth=6,
        reg_lambda=1.0,
        min_child_weight=20,
        objective='multi:softmax', # Objective for multi-class classification
        num_class=4,             # Number of target classes
        eval_metric='mlogloss',  # Evaluation metric for monitoring training
        random_state=42          # Seed for reproducibility
    ))
])

# Clone the initial pipeline to create separate instances for datasets S and L.
# This ensures that 'pipe_xgb_S' and 'pipe_xgb_L' are independent.
pipe_xgb_S = clone(pipe_xgb)
pipe_xgb_L = clone(pipe_xgb)

# Define the optimal hyperparameters found for Dataset S through Randomized Search
best_params_S = {
    'xgb__reg_lambda': 1.0,         # L2 regularization term
    'xgb__n_estimators': 200,       # Number of boosting rounds (trees)
    'xgb__min_child_weight': 10,    # Minimum sum of instance weight (hessian) needed in a child
    'xgb__max_depth': 3,            # Maximum depth of a tree
    'xgb__learning_rate': 0.1       # Step size shrinkage to prevent overfitting
}

# Define the optimal hyperparameters found for Dataset L through Randomized Search
best_params_L = {
    'xgb__reg_lambda': 2.0,
    'xgb__n_estimators': 300,
    'xgb__min_child_weight': 20,
    'xgb__max_depth': 6,
    'xgb__learning_rate': 0.1
}

# Apply the optimal hyperparameters to the 'pipe_xgb_S' pipeline.
# The 'xgb__' prefix tells set_params that these are parameters for the 'xgb' step.
pipe_xgb_S.set_params(**best_params_S)

# Apply the optimal hyperparameters to the 'pipe_xgb_L' pipeline.
pipe_xgb_L.set_params(**best_params_L)


# #---Train the models---#

pipe_xgb_S.fit(X_train_S, y_train_S)
pipe_xgb_L.fit(X_train_L, y_train_L)

# #---Predict and evaluate---#

y_xgb_S = pipe_xgb_S.predict(X_test_S)
y_xgb_L = pipe_xgb_L.predict(X_test_L)

print("Results for data set S with XGBoost (Data Science Roles)\n", classification_report(y_test_S, y_xgb_S, target_names=lab_S.classes_, digits=4))
print("\nResults for data set L  With XGBoost (General career paths)\n", classification_report(y_test_L, y_xgb_L, target_names=lab_L.classes_, digits=4))

Results for data set S with XGBoost (Data Science Roles)
                    precision    recall  f1-score   support

     Data Analyst     0.5907    0.4651    0.5204       301
   Data Scientist     0.6850    0.7538    0.7178       528
Software Engineer     0.6872    0.7063    0.6966       395

         accuracy                         0.6675      1224
        macro avg     0.6543    0.6417    0.6449      1224
     weighted avg     0.6625    0.6675    0.6624      1224


Results for data set L  With XGBoost (General career paths)
               precision    recall  f1-score   support

          DS     0.8448    0.9145    0.8783      1369
        Tech     0.6176    0.4511    0.5214       419

    accuracy                         0.8059      1788
   macro avg     0.7312    0.6828    0.6998      1788
weighted avg     0.7916    0.8059    0.7947      1788



