In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt


In [2]:
salary = pd.read_csv("../../dataset/Salary_Data_extend.csv")
salary

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
370,35.0,Female,Bachelor's,Senior Marketing Analyst,8.0,85000.0
371,43.0,Male,Master's,Director of Operations,19.0,170000.0
372,29.0,Female,Bachelor's,Junior Project Manager,2.0,40000.0
373,34.0,Male,Bachelor's,Senior Operations Coordinator,7.0,90000.0


In [3]:
len(salary)

375

In [4]:
salary.isna().sum()

Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64

In [5]:
salary.dropna(inplace=True)

In [6]:
len(salary)

373

In [16]:
salary

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32,Male,Bachelor's,Software Engineer,5,90000
1,28,Female,Master's,Data Analyst,3,65000
2,45,Male,PhD,Senior Manager,15,150000
3,36,Female,Bachelor's,Sales Associate,7,60000
4,52,Male,Master's,Director,20,200000
...,...,...,...,...,...,...
370,35,Female,Bachelor's,Senior Marketing Analyst,8,85000
371,43,Male,Master's,Director of Operations,19,170000
372,29,Female,Bachelor's,Junior Project Manager,2,40000
373,34,Male,Bachelor's,Senior Operations Coordinator,7,90000


In [20]:
salary.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [22]:
salary.columns = salary.columns.str.strip()
salary.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [68]:
salary.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [9]:
len(salary)

373

In [36]:
salary['Age'] = salary['Age'].astype(int)
salary['Years of Experience'] = salary['Years of Experience'].astype(int)
salary['Salary'] = salary['Salary'].astype(int)

In [38]:
# Dependent variable
X = salary.drop("Salary", axis=1)

# Independent variable
y = salary['Salary']

In [44]:
from sklearn.model_selection import train_test_split

# Split dataset into test and training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [70]:
salary.info()

<class 'pandas.core.frame.DataFrame'>
Index: 373 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  373 non-null    int32 
 1   Gender               373 non-null    object
 2   Education Level      373 non-null    object
 3   Job Title            373 non-null    object
 4   Years of Experience  373 non-null    int32 
 5   Salary               373 non-null    int32 
dtypes: int32(3), object(3)
memory usage: 16.0+ KB


In [74]:
# get all number cols
num_cols = [ col for col in X_train.columns if X_train[col].dtypes != "object"]
num_cols


['Age', 'Years of Experience']

In [58]:
cat_cols = [ col for col in X_train.columns if X_train[col].dtypes == "object"]
cat_cols

['Gender', 'Education Level', 'Job Title']

In [104]:
# Turn the category into number
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# all category list 
one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

transformer = ColumnTransformer([("one_hot", one_hot , cat_cols)], remainder="passthrough")

transform_X = transformer.fit_transform(X)
 
transform_X


array([[ 0.,  1.,  1., ...,  0., 32.,  5.],
       [ 1.,  0.,  0., ...,  0., 28.,  3.],
       [ 0.,  1.,  0., ...,  0., 45., 15.],
       ...,
       [ 1.,  0.,  1., ...,  0., 29.,  2.],
       [ 0.,  1.,  1., ...,  0., 34.,  7.],
       [ 1.,  0.,  0., ...,  0., 44., 15.]])

In [110]:
# import algorithm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Split dataset into test and training set
X_train, X_test, y_train, y_test = train_test_split(transform_X, y, test_size=0.2, random_state=0)

# Train the model
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

# Evaluate score
model.score(X_test, y_test)


0.9324781239840154

## Regression model evaluation metrics
* R^2 Score
* Mean Absolute Error (MAE)
* Mean Square Error (MSE)

In [116]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.9324781239840154

In [132]:
mae = mean_absolute_error(y_test, y_pred)
mae

8254.22222222222

In [126]:
df = pd.DataFrame(data={"Actual Values" : y_test,
                       "Predicted Values" : y_pred})

df['defferences'] = df['Actual Values'] - df['Predicted Values']


In [128]:
df

Unnamed: 0,Actual Values,Predicted Values,defferences
45,40000,40000.0,0.0
274,40000,41650.0,-1650.0
54,50000,44950.0,5050.0
237,140000,164650.0,-24650.0
254,85000,85150.0,-150.0
...,...,...,...
56,65000,42300.0,22700.0
8,45000,37500.0,7500.0
199,110000,103650.0,6350.0
310,35000,36800.0,-1800.0


In [142]:
np.abs(df['defferences']).mean()

8254.22222222222

In [144]:
# mean square error
mse = mean_squared_error(y_test, y_pred)
mse

137226659.25925925