# AI jobs analaysis & predection

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data preprocessing

In [2]:
df = pd.read_csv("../Data/ai_job_dataset.csv")
df.head()

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,2024-10-18,2024-11-07,1076,5.9,Smart Analytics
1,AI00002,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,2024-11-20,2025-01-11,1268,5.2,TechCorp Inc
2,AI00003,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,2025-03-18,2025-04-07,1974,9.4,Autonomous Tech
3,AI00004,NLP Engineer,80215,USD,SE,FL,India,M,India,50,"Scala, SQL, Linux, Python",PhD,7,Consulting,2024-12-23,2025-02-24,1345,8.6,Future Systems
4,AI00005,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,"MLOps, Java, Tableau, Python",Master,0,Media,2025-04-15,2025-06-23,1989,6.6,Advanced Robotics


In [3]:
df.describe()

Unnamed: 0,salary_usd,remote_ratio,years_experience,job_description_length,benefits_score
count,15000.0,15000.0,15000.0,15000.0,15000.0
mean,115348.965133,49.483333,6.2532,1503.314733,7.504273
std,60260.940438,40.812712,5.545768,576.127083,1.45087
min,32519.0,0.0,0.0,500.0,5.0
25%,70179.75,0.0,2.0,1003.75,6.2
50%,99705.0,50.0,5.0,1512.0,7.5
75%,146408.5,100.0,10.0,2000.0,8.8
max,399095.0,100.0,19.0,2499.0,10.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_id                  15000 non-null  object 
 1   job_title               15000 non-null  object 
 2   salary_usd              15000 non-null  int64  
 3   salary_currency         15000 non-null  object 
 4   experience_level        15000 non-null  object 
 5   employment_type         15000 non-null  object 
 6   company_location        15000 non-null  object 
 7   company_size            15000 non-null  object 
 8   employee_residence      15000 non-null  object 
 9   remote_ratio            15000 non-null  int64  
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  int64  
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

#### Drop Unnecessary columns

In [5]:
df = df.drop(
    [
        "job_id",
        "job_title",
        "salary_currency",
        "posting_date",
        "application_deadline",
        "job_description_length",
        "benefits_score",
        "company_name",
        "required_skills",
        "employee_residence",
        "company_size",
        "industry",
        
    ],
    axis=1,
)

In [6]:
df.head()

Unnamed: 0,salary_usd,experience_level,employment_type,company_location,remote_ratio,education_required,years_experience
0,90376,SE,CT,China,50,Bachelor,9
1,61895,EN,CT,Canada,100,Master,1
2,152626,MI,FL,Switzerland,0,Associate,2
3,80215,SE,FL,India,50,PhD,7
4,54624,EN,PT,France,100,Master,0


#### Swaping the Target column

In [7]:
columns = list(df.columns)

salary = columns.index("salary_usd")
experience = columns.index("years_experience")

columns[salary] , columns[experience] = columns[experience],columns[salary]
df = df[columns]

df.head()


Unnamed: 0,years_experience,experience_level,employment_type,company_location,remote_ratio,education_required,salary_usd
0,9,SE,CT,China,50,Bachelor,90376
1,1,EN,CT,Canada,100,Master,61895
2,2,MI,FL,Switzerland,0,Associate,152626
3,7,SE,FL,India,50,PhD,80215
4,0,EN,PT,France,100,Master,54624


### Label encoding the ordinal columns

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['experience_level'] = le.fit_transform(df['experience_level'])
df['employment_type'] = le.fit_transform(df['employment_type'])
df['education_required'] = le.fit_transform(df['education_required'])

df.head()

Unnamed: 0,years_experience,experience_level,employment_type,company_location,remote_ratio,education_required,salary_usd
0,9,3,0,China,50,1,90376
1,1,0,0,Canada,100,2,61895
2,2,2,1,Switzerland,0,0,152626
3,7,3,1,India,50,3,80215
4,0,0,3,France,100,2,54624


In [None]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

[[7 3 2 'China' 100 2]
 [6 3 3 'Austria' 0 3]
 [14 1 0 'Austria' 50 3]
 ...
 [6 3 2 'Norway' 50 3]
 [2 2 1 'Denmark' 100 0]
 [10 1 2 'India' 0 2]]


### One hot encoding on categorical columns

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False), [3])
    ],
    remainder='passthrough'
)

X = np.array(ct.fit_transform(X))
column_names = ct.get_feature_names_out()

print(X)

[[1.0 0.0 0.0 ... 0 50 1]
 [1.0 0.0 1.0 ... 0 100 2]
 [1.0 0.0 1.0 ... 1 0 0]
 ...
 [1.0 0.0 1.0 ... 0 50 0]
 [1.0 0.0 1.0 ... 2 0 3]
 [0.0 1.0 1.0 ... 3 50 3]]


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

print(X_train)

[[1.0 0.0 0.0 ... 1 0 1]
 [1.0 0.0 0.0 ... 3 0 2]
 [1.0 0.0 0.0 ... 0 0 2]
 ...
 [1.0 0.0 0.0 ... 3 0 3]
 [1.0 0.0 0.0 ... 3 50 2]
 [1.0 0.0 0.0 ... 3 0 1]]


### Model training

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

model = RandomForestRegressor(
    n_estimators=200, random_state=42, n_jobs=-1, max_depth=10, min_samples_leaf=5
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)

print(f"R-squared: {r2:.4f}")

R-squared: 0.8115
