In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pickle as pk
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [2]:
# dropping missing values
data = df = pd.read_csv("salarydata.csv")
data.dropna(how = 'any', inplace=True)
data.isnull().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [3]:
# remove unnecessary columns
data.drop(["Job Title"], axis=1, inplace=True)
data.head(10)

Unnamed: 0,Age,Gender,Education Level,Years of Experience,Salary
0,32.0,Male,Bachelor's,5.0,90000.0
1,28.0,Female,Master's,3.0,65000.0
2,45.0,Male,PhD,15.0,150000.0
3,36.0,Female,Bachelor's,7.0,60000.0
4,52.0,Male,Master's,20.0,200000.0
5,29.0,Male,Bachelor's,2.0,55000.0
6,42.0,Female,Master's,12.0,120000.0
7,31.0,Male,Bachelor's,4.0,80000.0
8,26.0,Female,Bachelor's,1.0,45000.0
9,38.0,Male,PhD,10.0,110000.0


In [4]:
# removing duplicates
# data[data.duplicated()]
data.drop_duplicates(keep = 'first')
data.head(5)

Unnamed: 0,Age,Gender,Education Level,Years of Experience,Salary
0,32.0,Male,Bachelor's,5.0,90000.0
1,28.0,Female,Master's,3.0,65000.0
2,45.0,Male,PhD,15.0,150000.0
3,36.0,Female,Bachelor's,7.0,60000.0
4,52.0,Male,Master's,20.0,200000.0


In [5]:
# Encoding categorical data
# (1) gender 0 femal 1 male
data["Gender"] = OrdinalEncoder(categories=[["Female", "Male"]]).fit_transform(data[["Gender"]]) 
# (1) Education Level  0 Bachelor's 1 Master's	2 PhD	
data["Education Level"] = OrdinalEncoder(categories=[["Bachelor's", "Master's", "PhD"]]).fit_transform(data[["Education Level"]])
data.head(5)

Unnamed: 0,Age,Gender,Education Level,Years of Experience,Salary
0,32.0,1.0,0.0,5.0,90000.0
1,28.0,0.0,1.0,3.0,65000.0
2,45.0,1.0,2.0,15.0,150000.0
3,36.0,0.0,0.0,7.0,60000.0
4,52.0,1.0,1.0,20.0,200000.0


In [6]:
# spliting the data
x = data.drop("Salary", axis=1)
y = data["Salary"]
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=2)

In [7]:
# training
model = LinearRegression()
model.fit(x_train, y_train)

In [8]:
# predictions
y_pred = model.predict(x_test)

In [9]:
df = pd.DataFrame({"y_actual": y_test, "y_pred": y_pred})
df.head(5)

Unnamed: 0,y_actual,y_pred
174,45000.0,44491.490624
206,50000.0,60807.540211
70,65000.0,71973.459679
118,45000.0,42920.856035
32,75000.0,66933.424645


In [10]:
# test
Age = 37
Gender = 1
Education = 2
Experience = 12
Salary = model.predict([[Age,Gender, Education, Experience]])
print(Salary)

[128888.90642075]




In [11]:
# random forest
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
y_pred2 = forest.predict(x_test)
df = pd.DataFrame({"y_actual": y_test, "y_pred2": y_pred2})
df.head(5)

Unnamed: 0,y_actual,y_pred2
174,45000.0,40000.0
206,50000.0,50000.0
70,65000.0,50000.0
118,45000.0,40000.0
32,75000.0,75000.0


In [12]:
# tree
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
y_pred3 = tree.predict(x_test)
df = pd.DataFrame({"y_actual": y_test, "y_pred3": y_pred3})
df.head(5)

Unnamed: 0,y_actual,y_pred3
174,45000.0,40000.0
206,50000.0,50000.0
70,65000.0,90000.0
118,45000.0,40000.0
32,75000.0,75000.0


In [13]:
# compare 
# for linear
print(f"MAE : {metrics.mean_absolute_error(y_test, y_pred)}")
print(f"MSE : {metrics.mean_squared_error(y_test, y_pred)}")
print(f"RMSE : {np.sqrt(metrics.mean_squared_error(y_test, y_pred))}")

MAE : 11822.262549569183
MSE : 273119491.28782064
RMSE : 16526.32721713511


In [14]:
# compare 
# for forest
print(f"MAE : {metrics.mean_absolute_error(y_test, y_pred2)}")
print(f"MSE : {metrics.mean_squared_error(y_test, y_pred2)}")
print(f"RMSE : {np.sqrt(metrics.mean_squared_error(y_test, y_pred2))}")

MAE : 9862.0
MSE : 329008300.0
RMSE : 18138.585942680318


In [15]:
# compare 
# for forest
print(f"MAE : {metrics.mean_absolute_error(y_test, y_pred3)}")
print(f"MSE : {metrics.mean_squared_error(y_test, y_pred3)}")
print(f"RMSE : {np.sqrt(metrics.mean_squared_error(y_test, y_pred3))}")

MAE : 13995.333333333334
MSE : 561674966.6666666
RMSE : 23699.68283894674


In [16]:
# so the best model is the linear regression 
# now export
pk.dump(model, open('model.pkl', 'wb'))