# Feature Engineering: Feature Scaling, Data Cleaning, Train test split
Problem Statement: Build a model that can predict the employee salaries on basis of their experience


Step 1  : Data Gathering


In [1]:
import pandas as pd
# path = r"https://raw.githubusercontent.com/khushi-2003/Datasets/refs/heads/main/Salary_dataset.csv"
path = r"C:\Users\Khushi Kanade\OneDrive\Desktop\ML\Salary_dataset.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,YearsExperience,Salary
0,0,1.2,39344.0
1,1,1.4,46206.0
2,2,1.6,37732.0
3,3,2.1,43526.0
4,4,2.3,39892.0


Step 2 : Perform basic data quality checks

In [2]:
df.shape

(30, 3)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       30 non-null     int64  
 1   YearsExperience  30 non-null     float64
 2   Salary           30 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 848.0 bytes


In [None]:
# check for duplicate rows
df.duplicated().sum()

np.int64(0)

In [6]:
# handle future duplicate cases
df = df.drop_duplicates()

Step 3 : Seperate X and Y features
    
X  : YearsExperience  
Y  : Salary

In [7]:
df.columns

Index(['Unnamed: 0', 'YearsExperience', 'Salary'], dtype='object')

In [8]:
X = df[['YearsExperience']]
Y = df[['Salary']]

In [9]:
X.head()

Unnamed: 0,YearsExperience
0,1.2
1,1.4
2,1.6
3,2.1
4,2.3


In [10]:
Y.head()

Unnamed: 0,Salary
0,39344.0
1,46206.0
2,37732.0
3,43526.0
4,39892.0


Step 4: Feature Engineering

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [14]:
num_pipeline = make_pipeline(
    #data cleaning
    SimpleImputer(strategy='mean'),
    # feature scaling
    StandardScaler()
).set_output(transform='pandas')

In [15]:
num_pipeline

0,1,2
,steps,"[('simpleimputer', ...), ('standardscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [16]:
X_pre = num_pipeline.fit_transform(X)
X_pre.head()

Unnamed: 0,YearsExperience
0,-1.510053
1,-1.438373
2,-1.366693
3,-1.187494
4,-1.115814


Step 5 :  Split data into training and testing

In [None]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,train_size=0.8,random_state=20)

In [18]:
xtrain.head()

Unnamed: 0,YearsExperience
10,-0.506537
8,-0.757416
27,1.536336
20,0.532819
16,-0.076458


In [21]:
xtest.head()

Unnamed: 0,YearsExperience
21,0.640339
17,-0.004779
4,-1.115814
1,-1.438373
2,-1.366693


In [22]:
ytrain.head()

Unnamed: 0,Salary
10,63219.0
8,64446.0
27,112636.0
20,91739.0
16,66030.0


In [24]:
ytest.head()

Unnamed: 0,Salary
21,98274.0
17,83089.0
4,39892.0
1,46206.0
2,37732.0


Step 6 :  Model Building 

In [26]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
# r2 score for training data
model.score(xtrain,ytrain)

0.9522511614008535

Step 7 : Model Evaluation

In [30]:
ypred = model.predict(xtest)

In [31]:
ypred[:5]

array([[92420.45023985],
       [75417.85462327],
       [46135.60661695],
       [37634.30880866],
       [39523.48609939]])

In [32]:
ytest[:5]

Unnamed: 0,Salary
21,98274.0
17,83089.0
4,39892.0
1,46206.0
2,37732.0


Step 8 : Evaluation Metrics

In [42]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

mse  = mean_absolute_error(ytest,ypred)
rmse = mse**0.5
mae = mean_absolute_error(ytest,ypred)
r2_score = r2_score(ytest, ypred)

print(f""" 
MSE : {mse} 
RMSE : {rmse} 
MAE  : {mae} 
R2 Score : {r2_score:.2f}%""")

 
MSE : 5064.460989349328 
RMSE : 71.16502644803364 
MAE  : 5064.460989349328 
R2 Score : 0.97%


In [None]:
xtest['predicted_salary']=ypred
xtest

Unnamed: 0,YearsExperience,predicted_salary
21,0.640339,92420.45024
17,-0.004779,75417.854623
4,-1.115814,46135.606617
1,-1.438373,37634.308809
2,-1.366693,39523.486099
28,1.787215,122647.286892


In [48]:
# saving results to csv
xtest.to_csv("RegressionModel_salaryprediction",index = False)