In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [4]:
# Read the salary dataset from the CSV file in the working directory.
df = pd.read_csv('salary_prediction_data.csv')
# Row count of the loaded frame, shown as the cell's output.
df.shape[0]

1000

In [5]:
# Preview the first five rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,Education,Experience,Location,Job_Title,Age,Gender,Salary
0,High School,8,Urban,Manager,63,Male,84620.053665
1,PhD,11,Suburban,Director,59,Male,142591.255894
2,Bachelor,28,Suburban,Manager,61,Female,97800.255404
3,High School,29,Rural,Director,45,Male,96834.671282
4,PhD,25,Urban,Analyst,26,Female,132157.786175


In [6]:
# Summarize dtypes and non-null counts per column (prints directly to stdout).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Education   1000 non-null   object 
 1   Experience  1000 non-null   int64  
 2   Location    1000 non-null   object 
 3   Job_Title   1000 non-null   object 
 4   Age         1000 non-null   int64  
 5   Gender      1000 non-null   object 
 6   Salary      1000 non-null   float64
dtypes: float64(1), int64(2), object(4)
memory usage: 54.8+ KB


In [20]:
# Frequency of each distinct Education value, most common first.
print(df['Education'].value_counts())

Education
High School    255
Bachelor       253
PhD            251
Master         241
Name: count, dtype: int64


In [21]:
# Frequency of each distinct Experience value.
# NOTE(review): Experience is an integer column, so this prints one row per
# distinct value; describe() or a histogram may summarize it more compactly.
print(df['Experience'].value_counts())

Experience
17    48
20    43
6     43
5     42
18    42
16    40
8     39
24    38
2     38
13    37
25    37
4     36
19    36
22    35
1     35
7     35
28    34
3     33
11    33
12    31
29    31
27    30
14    29
21    28
26    27
23    27
15    26
9     25
10    22
Name: count, dtype: int64


In [22]:
# Frequency of each distinct Location value, most common first.
print(df['Location'].value_counts())

Location
Suburban    345
Rural       345
Urban       310
Name: count, dtype: int64


In [23]:
# Frequency of each distinct Job_Title value, most common first.
print(df['Job_Title'].value_counts())

Job_Title
Director    275
Analyst     255
Manager     241
Engineer    229
Name: count, dtype: int64


In [24]:
# Frequency of each distinct Age value.
# NOTE(review): Age is an integer column, so this prints one row per distinct
# value; describe() or a histogram may summarize it more compactly.
print(df['Age'].value_counts())

Age
27    33
60    33
58    30
59    30
21    30
62    30
24    29
41    28
20    28
63    28
44    27
54    25
49    25
45    25
61    24
42    24
23    24
25    24
48    23
31    23
36    22
50    22
57    22
26    22
56    21
64    21
37    21
52    20
40    20
22    19
34    19
29    19
43    19
39    19
33    18
35    18
53    18
47    16
46    15
28    15
51    15
38    15
30    14
55    14
32    13
Name: count, dtype: int64


In [25]:
# Frequency of each distinct Gender value, most common first.
print(df['Gender'].value_counts())

Gender
Male      516
Female    484
Name: count, dtype: int64


In [8]:
# Target is the Salary column; the features are every remaining column.
y = df['Salary']
X = df.drop(columns='Salary')

In [9]:
# Column groups used to route each feature to the right preprocessing step.
numeric_features = ['Experience', 'Age']
categorical_features = [
    'Education',
    'Location',
    'Job_Title',
    'Gender',
]

In [10]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept category
# values at predict time that were not present when it was fitted.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [11]:
# End-to-end estimator: preprocessing followed by a random forest regressor.
# The fixed random_state keeps the forest reproducible between runs.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): this cell carries no execution count in the saved notebook even
# though the fit/predict cells below depend on its variables; re-run the notebook
# top to bottom before trusting the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Fit the whole pipeline (preprocessing + regressor) on the training split only.
model.fit(X_train, y_train)

In [14]:
# Predict salaries for the held-out test split.
y_pred = model.predict(X_test)

In [15]:
# Root-mean-squared error on the held-out split, in the same units as Salary.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this form works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5



In [16]:
# Coefficient of determination (R²) of the predictions on the test split.
r2 = r2_score(y_test, y_pred)

In [17]:
# Report both test-set metrics on a single line.
print(f'RMSE: {rmse}, R²: {r2}')

RMSE: 11704.819730382664, R²: 0.8446606147339711


In [19]:
# Persist the fitted pipeline (preprocessing + regressor) to the working directory;
# joblib.dump returns the list of file names it wrote.
# NOTE(review): the artifact is pickle-based — load it only from trusted sources,
# and presumably with the same scikit-learn version used here; confirm before deploying.
joblib.dump(model, 'salary_model.joblib')

['salary_model.joblib']