In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Load the job-postings table from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='ai_job_dataset.csv')

In [None]:
# Peek at the first five rows to check that the columns parsed as expected.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded table
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max); with default
# arguments pandas restricts this to the numeric columns when any exist.
df.describe()

## Exploratory Data Analysis

## Salary Distribution

In [None]:
# Histogram of the target with a KDE overlay to show the overall shape of the distribution.
ax = sns.histplot(data=df, x='salary_usd', bins=30, kde=True)
ax.set_title('Distribution of Salary (USD)')
ax.set_xlabel('Salary')
ax.set_ylabel('Frequency')
plt.show()

## Outliers in Salary

In [None]:
# Horizontal boxplot of the target; points beyond the whiskers are candidate outliers.
ax = sns.boxplot(data=df, x='salary_usd')
ax.set_title('Boxplot of Salary (USD)')
plt.show()

# Preprocessing: Handle categorical & ordinal features

## Salary vs. Experience Level

In [None]:
# One salary box per experience level (drawn before the level codes are
# replaced by integers further down).
ax = sns.boxplot(data=df, x='experience_level', y='salary_usd')
ax.set_title('Salary by Experience Level')
plt.show()

In [None]:
def _encode_ordinal(series, mapping):
    """Return `series` with its labels replaced by the integer ranks in `mapping`.

    Parameters
    ----------
    series : pd.Series
        Column holding the raw category labels.
    mapping : dict
        Label -> integer rank.

    Returns
    -------
    pd.Series
        The encoded column. A column that is already numeric is returned
        unchanged, so re-running this cell does not turn previously encoded
        values into NaN. Labels absent from `mapping` still become NaN (the
        same result a plain ``Series.map`` gives), but they are printed so the
        gap does not go unnoticed.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already encoded (e.g. the cell was executed before): nothing to do.
        return series

    encoded = series.map(mapping)
    unmapped = series[encoded.isna() & series.notna()].unique()
    if len(unmapped) > 0:
        # Warnings are silenced at the top of the notebook, so print instead.
        print(f"{series.name}: labels without a mapping become NaN: {sorted(map(str, unmapped))}")
    return encoded


# Map ordinal features
# Experience level
exp_map = {'EN': 0, 'MI': 1, 'SE': 2, 'EX': 3}
df['experience_level'] = _encode_ordinal(df['experience_level'], exp_map)

# Education level (sample map — adjust based on actual data)
edu_map = {
    'High School': 0,
    'Associate': 1,
    'Bachelor': 2,
    'Master': 3,
    'PhD': 4
}
df['education_required'] = _encode_ordinal(df['education_required'], edu_map)

# Company size
size_map = {'S': 0, 'M': 1, 'L': 2}
df['company_size'] = _encode_ordinal(df['company_size'], size_map)

## Define Features and Target

In [None]:
# Column the model learns to predict.
target = 'salary_usd'

# Predictor columns: four numeric/ordinal ones followed by three nominal ones.
features = [
    'experience_level',
    'education_required',
    'company_size',
    'remote_ratio',
    'employment_type',
    'company_location',
    'employee_residence',
]

# Design matrix and target vector.
X, y = df[features], df[target]

## ColumnTransformer for encoding

In [None]:
# Columns passed through the median imputer.
numeric_features = [
    'experience_level',
    'education_required',
    'company_size',
    'remote_ratio',
]

# Columns that get one-hot encoded.
categorical_features = [
    'employment_type',
    'company_location',
    'employee_residence',
]

# Route each column group through its own transformer. Categories not seen
# while fitting are ignored at transform time instead of raising an error.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)

### Create pipeline with model

In [None]:
# Chain preprocessing and the regressor so both are fitted together on the
# same rows; the fixed seed keeps the forest reproducible.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ],
)

### Train-test split and model training

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline on the training rows, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Evaluate performance

In [None]:
# Root-mean-squared error on the held-out rows. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated and later removed in newer scikit-learn releases; this form works
# on old and new versions alike.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Coefficient of determination on the same held-out rows.
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")