In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor  # Change to GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error

# Read the salary dataset; the CSV is expected in the current working directory
data = pd.read_csv('job_salary_datasetnew.csv')
# Print the first rows as a quick sanity check of what was loaded
print(data.head())

# Predictor columns on one side, the regression target ('salary') on the other
feature_columns = ['job_description', 'experience_years', 'education_level', 'location']
X = data[feature_columns]
y = data['salary']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column groups, by the kind of preprocessing each one receives
numeric_features = ['experience_years']
categorical_features = ['education_level', 'location']

# Free-text column -> TF-IDF features
text_preprocessor = TfidfVectorizer()

# Numeric columns: fill missing values with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at predict time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer. 'job_description' is given as a
# bare string (not a list) so the vectorizer receives a 1-D column of documents.
column_transformers = [
    ('text', text_preprocessor, 'job_description'),
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Gradient-boosted regressor; the fixed seed makes training repeatable
model = GradientBoostingRegressor(random_state=42)

# End-to-end estimator: preprocessing followed by the regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Search space: 3 x 3 x 3 = 27 candidate settings. The 'model__' prefix routes
# each value to the pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.05, 0.1, 0.2],
    model__max_depth=[3, 5, 7],
)

# Exhaustive search over the grid with 5-fold cross-validation on the training split
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination
best_params = grid_search.best_params_
# The scorer reports negated MAE, so flip the sign to get the
# cross-validated mean absolute error itself
best_score = -grid_search.best_score_

# GridSearchCV was created with the default refit=True, so after the search it
# has already trained a copy of the pipeline with the best hyperparameters on
# the whole training split. Reuse that fitted estimator instead of setting the
# parameters again and repeating an identical training run.
# NOTE: best_estimator_ is a clone, so the module-level `model` and
# `preprocessor` objects themselves are not fitted; inspect the fitted steps
# through `pipeline.named_steps` instead.
pipeline = grid_search.best_estimator_

# Predict salaries for the held-out rows
predictions = pipeline.predict(X_test)

# Report the mean absolute error on the test split
mae = mean_absolute_error(y_test, predictions)
print('Mean Absolute Error:', mae)

# Mean salary over every row of the dataset (training and test rows alike).
# NOTE(review): the MAE below is measured on the test split only, so dividing
# by the mean of y_test may be the more consistent baseline -- confirm intent.
salary_column = data['salary']
average_salary = salary_column.mean()

# Test-set MAE expressed as a percentage of that mean salary
relative_error = mae / average_salary
mae_percentage = relative_error * 100

# Ad-hoc "accuracy" figure: whatever remains of 100% after the relative error.
# This is not a classification accuracy; it is only a rescaled MAE.
accuracy_percentage = 100 - mae_percentage

print('Model Accuracy:', round(accuracy_percentage, 2), '%')


                                     job_description  salary  \
0  Software Engineer position requires expertise ...   80000   
1  Marketing Specialist responsible for developin...   60000   
2  Data Analyst role involves analyzing large dat...   70000   
3  Graphic Designer with proficiency in Adobe Cre...   55000   
4  Sales Associate responsible for generating lea...   50000   

   experience_years    education_level       location  
0                 3  Bachelor's Degree  San Francisco  
1                 2  Bachelor's Degree       New York  
2                 2  Bachelor's Degree        Seattle  
3                 2  Bachelor's Degree    Los Angeles  
4                 1  Bachelor's Degree        Chicago  
Mean Absolute Error: 8259.781608148656
Model Accuracy: 88.48 %
