<a href="https://colab.research.google.com/github/innocentmatutu/Machine-learning/blob/main/Data_Science_Careers_%26_Salaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

# Load the raw job-postings table. The '.zip' suffix lets pandas infer the
# compression; NOTE(review): this relies on the archive holding exactly one
# data file -- confirm for this upload.
df = pd.read_csv('/content/archive (11).zip')

# Normalise the free-text salary column to a single number.
# The character class removes thousands separators and the euro sign in both
# spellings it can arrive in: the real sign (U+20AC) and the three-character
# mojibake (U+00E2 U+201A U+00AC) produced when its UTF-8 bytes are decoded
# as cp1252. Escapes are used so the pattern cannot itself be re-garbled.
salary_text = (
    df['salary']
    .astype(str)
    .str.replace('[\u20ac\u00e2\u201a\u00ac,]', '', regex=True)
)

# A value containing '-' is treated as a range: keep only the part before the
# first hyphen (the lower bound). Values without a hyphen pass through whole.
salary_text = salary_text.str.split('-', n=1).str[0].str.strip()

# Whatever is still not a number (e.g. the literal 'nan' created by
# astype(str) for missing cells) is coerced to NaN here.
df['salary'] = pd.to_numeric(salary_text, errors='coerce')

#Remove outliers
# NaN compares False on both sides, so unparseable salaries are dropped too.
# .copy() detaches the result so later column assignments do not act on a
# slice of the unfiltered frame.
df = df[(df['salary'] > 10000) & (df['salary'] < 300000)].copy()

#Feature selection
# Predictor columns taken from the cleaned frame; a missing name raises
# KeyError at the df[features] lookup below.
features = ['job_title','seniority_level','skills','revenue',
            'industry','ownership','status','location','headquarter','company',]
X = df[features]
# The model is trained on log1p(salary); predictions are mapped back to the
# original scale with expm1 at evaluation time.
y = np.log1p(df['salary'])

#Train,test and split
# 80/20 hold-out with a fixed seed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y , test_size=0.2, random_state=1)

#Selecting column types
# Every feature must land in exactly one list: the ColumnTransformer drops
# columns that are in neither. Comparing against the literal dtype names
# 'object' / 'int64' / 'float64' lets other dtypes (int32, float32, nullable
# Int64, string, category, bool) fall through, so numeric columns are picked
# by kind and everything else is treated as categorical.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = [col for col in X.columns if col not in numerical_cols]

#Numerical column transformer
# Fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

#Categorical column transformer
# Fill gaps with the most common value, then one-hot encode into a dense
# array; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    ),
])

#Bundle categorical transformer and numerical transformer into a single preprocessor
# Each branch is applied only to the column list it is paired with.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

#Model selection
# Hyperparameters are collected in one mapping so they can be read and tuned
# in a single place; random_state fixes the row/column subsampling.
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 1,
}
model = xgb.XGBRegressor(**xgb_params)

#Bundle model and preprocessed columns into a pipeline
# Preprocessing is fitted inside the pipeline, i.e. only on the data passed
# to fit(), and re-applied automatically on predict().
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

#Pipeline fit
print('Fitting XGBoost....')
my_pipeline.fit(X_train,y_train)

#Predict and inverse transform log
# The model was trained on log1p(salary), so both the predictions and the
# held-out target are mapped back with expm1 before scoring.
preds = np.expm1(my_pipeline.predict(X_valid))
# Stored under a new name on purpose: overwriting y_valid would leave it in a
# different scale from y_train and would apply expm1 twice if this part of
# the notebook were executed again without re-creating the split.
y_valid_actual = np.expm1(y_valid)

#Metrics
# All three scores are computed in original salary units.
mae = mean_absolute_error(y_valid_actual, preds)
mse = mean_squared_error(y_valid_actual, preds)
r2 = r2_score(y_valid_actual, preds)

print(f"Mean Absolute Error: {mae:,.2f}")
print(f"Mean Squared Error: {mse:,.2f}")
print(f"R-squared: {r2:.4f}")




Fitting XGBoost....
Mean Absolute Error: 26,088.65
Mean Squared Error: 1,229,903,584.19
R-squared: 0.5427
