**Use-case:**
Salary Prediction

**Description:**
We have been hired by a hiring agency. The goal is to create a model that predicts an employee's salary from the employee's years of experience.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the salary dataset from the CSV file (expected in the working directory) into a DataFrame
data = pd.read_csv('Salary_Data.csv')

In [3]:
# Show column names, non-null counts and dtypes to check for missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 608.0 bytes


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [5]:
# Rules for Regression Implementation using the scikit-learn package
# 1. Feature and Label must be in the form of numpy array
# 2. Feature must be a 2d array
# 3. Label may be a 1d or 2d array (this notebook uses a 2d label, so coefficients and predictions come back 2d)

In [10]:
# Separate the data into features and label.
# A list selector ([0] / [1]) keeps each slice two-dimensional.
features = data.iloc[:, [0]].to_numpy()   # first column  -> feature array
label = data.iloc[:, [1]].to_numpy()      # second column -> label array

In [13]:
# ML Coding Begins
# 1. Create Train Test Split
# 2. Build the model using train split
# 3. Check the quality of the model
# 4. Deploy model (Optional stage for ML engg)

In [16]:
# 1. Create Train Test Split
#
# test_size is the fraction of rows held out for testing: 0.2 gives an
# 80% train / 20% test split. random_state pins the shuffle so the same
# split is produced on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=10
)

In [17]:
# 2. Build the model, then train it on the training split only
from sklearn.linear_model import LinearRegression

model = LinearRegression()

# fit(featureArray, labelArray) takes the training data and returns the trained estimator
model.fit(X_train, y_train)

In [18]:
# =================================================================================================================
# Check the quality of the model
#
#
# SL (Significance Level) or Alpha Value  -------- It's all about defining the error tolerance in the project
#
# SL = 0.05
#
# Quality of the model
#
# As ML engineers, our goal is to produce a GENERALIZED model.
#
# Generalized Model means the model must work best with both known and unknown data/features.
#
#=================================================================================================================

In [20]:
#===================================================================================================================
#
# A generalized model is something that satisfies the following logic
#
# testScore > trainScore and testScore >= CL
#
#
# testScore is the evaluation score evaluated using the testing dataset
# trainScore is the evaluation score evaluated using the training dataset
# CL is confidence level (1-SL)
#
#===================================================================================================================

In [24]:
# Evaluate the model on both splits.
# score(feature, label) returns the model's evaluation score
# (R^2 for scikit-learn regressors).
trainScore = model.score(X_train, y_train)
testScore = model.score(X_test, y_test)

# Acceptance threshold (confidence level = 1 - SL)
CL = 0.95

# Approve only when the test score reaches CL and also beats the train score
print("Approve Model" if (testScore >= CL and testScore > trainScore) else "Discard Model")

print("TestScore is {} and trainScore is {}".format(testScore, trainScore))

Approve Model
TestScore is 0.9816423482070253 and trainScore is 0.9494673013344646


In [26]:
# Equation of the fitted line:
#
#   y = mx + c      i.e.      salary = b0 + b1(yExp)
#
#   b1 (slope)     -> model.coef_
#   b0 (intercept) -> model.intercept_

print(model.coef_, model.intercept_, sep="\n")

# Substituting the values printed by this cell:
# salary = 26089.09663242 + (9356.86299354 * yExp)

[[9356.86299354]]
[26089.09663242]


In [29]:
# The model is acceptable.
#
# Deploy... This cell shows app developers how the model is used from an app:
# read the years of experience, run the prediction, display the salary.

yExp = float(input("Enter years of Experience: "))

# predict() expects a 2-D feature array: one row (sample) with one column (feature)
predictedSalary = model.predict(np.array([[yExp]]))

# predict() returns an array, which previously printed as "[[...]]".
# .item() extracts the single predicted value so a plain amount is shown.
print("Predicted Salary is $ {:.2f}".format(predictedSalary.item()))

Enter years of Experience: 5
Predicted Salary is $ [[72873.41160011]]


In [23]:
import pickle

# Persist the trained model to disk. The with-block guarantees the file handle
# is flushed and closed even if pickling raises; the bare open() used before
# left the handle open.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
with open("SalaryPredictor.mdl", 'wb') as model_file:
    pickle.dump(model, model_file)