In [34]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer



In [35]:
# Load data
# Read the cleaned CSV, then keep only the columns whose name does not start
# with "Unnamed".
ds = pd.read_csv("New_Cleaned_Data.csv")
ds = ds[[col for col in ds.columns if not col.startswith("Unnamed")]]



In [36]:
# Print a summary of the loaded frame: row count, column names, non-null
# counts, dtypes and memory usage.
ds.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  323 non-null    float64
 1   Gender               323 non-null    object 
 2   Education Level      323 non-null    object 
 3   Job Title            323 non-null    object 
 4   Years of Experience  323 non-null    float64
 5   Salary               323 non-null    float64
dtypes: float64(3), object(3)
memory usage: 15.3+ KB


In [37]:
# Features and Target
# x holds the five predictor columns; y keeps Salary as a one-column DataFrame
# (double brackets), so downstream predictions are 2-D.
x = ds.loc[:, ['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience']]
y = ds.loc[:, ['Salary']]


In [38]:
# Display the feature frame (a bare last expression is rendered as the cell output).
x

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience
0,32.0,Male,Bachelor's,Software Engineer,5.0
1,28.0,Female,Master's,Data Analyst,3.0
2,45.0,Male,PhD,Senior Manager,15.0
3,36.0,Female,Bachelor's,Sales Associate,7.0
4,52.0,Male,Master's,Director,20.0
...,...,...,...,...,...
318,28.0,Female,Bachelor's,Junior Operations Manager,1.0
319,36.0,Male,Bachelor's,Senior Business Development Manager,8.0
320,44.0,Female,PhD,Senior Data Scientist,16.0
321,31.0,Male,Bachelor's,Junior Marketing Coordinator,3.0


In [39]:
# Display the target frame (a bare last expression is rendered as the cell output).
y

Unnamed: 0,Salary
0,90000.0
1,65000.0
2,150000.0
3,60000.0
4,200000.0
...,...
318,35000.0
319,110000.0
320,160000.0
321,55000.0


In [40]:
# Splitting data into Independent and dependent variables
# NOTE(review): this rebinds x and y to the same column selection already made
# in the "Features and Target" cell; the result is unchanged.
x = ds[
    [
        'Age',
        'Gender',
        'Education Level',
        'Job Title',
        'Years of Experience',
    ]
]
y = ds[
    ['Salary']
]



In [41]:
# Number of distinct values in each categorical column, printed one per line
# in the order Gender, Education Level, Job Title.
print(
    *(len(ds[name].unique()) for name in ('Gender', 'Education Level', 'Job Title')),
    sep="\n",
)


2
3
174


In [42]:
# Implement OneHotEncoder
# OneHotEncoder is already imported in the notebook's first cell, so it is not
# re-imported here.
#
# Fit an encoder on the categorical columns of the full feature frame. The
# fitted `ohe` is read back through `ohe.categories_`, which lists every
# category seen per column in the order Gender, Education Level, Job Title.
ohe = OneHotEncoder()
ohe.fit(x[["Gender", "Education Level", "Job Title"]])


In [43]:
# Show the categories learned by the fitted encoder: one array per encoded
# column, in the order the columns were passed to fit().
ohe.categories_

[array(['Female', 'Male'], dtype=object),
 array(["Bachelor's", "Master's", 'PhD'], dtype=object),
 array(['Account Manager', 'Accountant', 'Administrative Assistant',
        'Business Analyst', 'Business Development Manager',
        'Business Intelligence Analyst', 'CEO', 'Chief Data Officer',
        'Chief Technology Officer', 'Content Marketing Manager',
        'Copywriter', 'Creative Director', 'Customer Service Manager',
        'Customer Service Rep', 'Customer Service Representative',
        'Customer Success Manager', 'Customer Success Rep', 'Data Analyst',
        'Data Entry Clerk', 'Data Scientist', 'Digital Content Producer',
        'Digital Marketing Manager', 'Director',
        'Director of Business Development', 'Director of Engineering',
        'Director of Finance', 'Director of HR',
        'Director of Human Capital', 'Director of Human Resources',
        'Director of Marketing', 'Director of Operations',
        'Director of Product Management', 'Director o

In [44]:
# Preprocessing step for the pipeline:
#   * the three categorical columns are one-hot encoded with the category
#     lists taken from the already-fitted `ohe`; values outside those lists
#     are ignored rather than raising (handle_unknown='ignore');
#   * every other column is passed through unchanged (remainder='passthrough');
#   * sparse_threshold=0 makes the transformer return a dense array.
ct = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_, handle_unknown='ignore'),
        ["Gender", "Education Level", "Job Title"],
    ),
    sparse_threshold=0,
    force_int_remainder_cols=False,
    remainder='passthrough',
)
ct


In [45]:
# Making pipeline
# Final estimator of the pipeline: a plain linear regression.
# LinearRegression is already imported in the notebook's first cell, so it is
# not re-imported here.
reg = LinearRegression()



In [46]:
# Chain the column transformer (ct) and the regressor (reg) into a single
# estimator, so raw feature frames can be passed straight to fit/predict.
# make_pipeline is already imported in the notebook's first cell, so it is not
# re-imported here.
pipe = make_pipeline(ct, reg)
pipe


In [48]:
# Splitting dataset into training and testing, once per candidate random_state.
# train_test_split and r2_score are already imported in the notebook's first
# cell, so they are not re-imported here.
#
# For every seed 0..100: draw a 90/10 split, refit the pipeline on the training
# part and record the R2 obtained on the held-out part. scores[i] is therefore
# the test R2 for random_state=i.
# NOTE(review): these are test-set scores, so choosing a seed by the best one
# gives an optimistic figure rather than an unbiased estimate of performance.
scores = []
for i in range(101):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=i)
    pipe.fit(x_train, y_train)
    result = pipe.predict(x_test)
    score = r2_score(y_test, result)
    scores.append(score)


In [49]:
# Finding best value
# scores[i] was recorded for random_state=i, so the position of the maximum is
# also the seed that produced it.
# NOTE(review): this is the highest of the per-split test scores, i.e. a
# best-case number, not an average.
bestindex = np.asarray(scores).argmax()
print(f"Best R2 Score: {scores[bestindex]!s}")




Best R2 Score: 0.9326896521945274


In [50]:
# Recreate the 90/10 split that belongs to the best-scoring seed and refit the
# pipeline on its training part; this fitted pipeline is what gets used below.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=bestindex,
    test_size=0.10,
)
pipe.fit(x_train, y_train)


In [51]:
# Predict for user_input
# Reads one person's details from the keyboard, builds a one-row frame with
# the same columns the pipeline was trained on, and prints the predicted salary.

# Strip surrounding whitespace from the categorical answers: the encoder
# matches categories by exact string, and with handle_unknown='ignore' a value
# it does not recognise is silently encoded as all zeros.
gender = input("Enter Gender (Male/Female): ").strip()
education = input("Enter Education Level (Bachelor's/Master's/PhD): ").strip()
job_title = input("Enter Job Title: ").strip()
# float() raises ValueError on non-numeric input.
age = float(input("Enter Age: "))
experience = float(input("Enter Years of Experience: "))

# Tell the user when a categorical answer was never seen by the encoder instead
# of predicting without comment. ohe.categories_ is ordered Gender,
# Education Level, Job Title (the order used when ohe was fitted).
for label, value, known in zip(
    ("Gender", "Education Level", "Job Title"),
    (gender, education, job_title),
    ohe.categories_,
):
    if value not in known:
        print(f"Warning: unknown {label} {value!r}; it will be ignored by the model.")

columns = ["Age", "Gender", "Education Level", "Job Title", "Years of Experience"]
myinput = pd.DataFrame(columns=columns, data=[[age, gender, education, job_title, experience]])

# The target was a one-column DataFrame, so the prediction is indexed as [row, column].
result = pipe.predict(myinput)
# Clamp at zero rather than taking abs(): a negative prediction is not a valid
# salary, and abs() would turn it into a plausible-looking positive one.
print("Predicted Salary is ~:", max(0, round(result[0, 0])))


Enter Gender (Male/Female):  Female
Enter Education Level (Bachelor's/Master's/PhD):  Bachelor's
Enter Job Title:  Account Manager
Enter Age:  40
Enter Years of Experience:  5


Predicted Salary is ~: 97580


In [52]:
import pickle as pkl

In [53]:
# Persist the fitted pipeline to disk. The context manager closes the file
# (flushing buffered bytes) even if pickling raises; the original left the
# handle from open() unclosed.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open("SalaryProject.pkl", "wb") as model_file:
    pkl.dump(pipe, model_file)