In [1]:
import numpy as np
import pandas as pd

**Importing Dataset**

In [2]:
df=pd.read_csv('Salary Data.csv')
df.shape

(375, 6)

In [3]:
df.sample(5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
279,49.0,Female,Master's,Director of Operations,21.0,180000.0
103,33.0,Male,Bachelor's,Junior Web Developer,5.0,50000.0
160,39.0,Female,Bachelor's,Senior Sales Representative,12.0,90000.0
95,39.0,Female,Bachelor's,Training Specialist,12.0,65000.0
214,44.0,Male,Bachelor's,Senior Software Engineer,14.0,130000.0


**Information of the data**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


**Null Values**

In [5]:
df.isnull().sum()

Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df.isnull().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

**Duplicate values**

In [8]:
df.duplicated().sum()

49

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.duplicated().sum()

0

**Description of the data**

In [11]:
df.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,324.0,324.0,324.0
mean,37.382716,10.058642,99985.648148
std,7.185844,6.65047,48652.27144
min,23.0,0.0,350.0
25%,31.0,4.0,55000.0
50%,36.5,9.0,95000.0
75%,44.0,16.0,140000.0
max,53.0,25.0,250000.0


**Droping usless columns**

In [12]:
df=df.drop('Job Title',axis=1)

In [13]:
df.shape

(324, 5)

**Label Encoder**

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
le=LabelEncoder()

In [16]:
df['Gender']=le.fit_transform(df['Gender'])

In [17]:
df['Education Level']=le.fit_transform(df['Education Level'])

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 0 to 371
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  324 non-null    float64
 1   Gender               324 non-null    int32  
 2   Education Level      324 non-null    int32  
 3   Years of Experience  324 non-null    float64
 4   Salary               324 non-null    float64
dtypes: float64(3), int32(2)
memory usage: 12.7 KB


**Train test split**

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X=df.drop('Salary',axis=1)
y=df['Salary']

In [21]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)

In [22]:
X_train.shape,X_test.shape

((226, 4), (98, 4))

**Model**

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
lr=LinearRegression()

In [25]:
lr.fit(X_train,y_train)

In [26]:
y_pred=lr.predict(X_test)

**R2 score**

In [27]:
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score

In [28]:
r2 = r2_score(y_test, y_pred)

In [29]:
print('R2 Score: ',r2)

R2 Score:  0.903963549145731


**Pickle file**

In [30]:
import pickle

In [31]:
with open('model.pkl','wb') as file:
    pickle.dump(lr,file)

**To see the prediction for custom input**

In [32]:
X_train.iloc[0]

Age                    30.0
Gender                  1.0
Education Level         0.0
Years of Experience     3.0
Name: 42, dtype: float64

In [42]:
y = lr.predict([[43.0, 0, 1, 14]])



In [43]:
y

array([128431.55487562])

In [45]:
prediction = lr.predict([[43.0, 0, 1, 14]])[0]



In [46]:
y

array([128431.55487562])

In [33]:
X_train

Unnamed: 0,Age,Gender,Education Level,Years of Experience
42,30.0,1,0,3.0
132,40.0,0,1,12.0
173,43.0,1,0,16.0
162,30.0,0,0,3.0
178,38.0,0,0,10.0
...,...,...,...,...
205,41.0,0,2,17.0
268,33.0,1,0,4.0
72,45.0,1,2,16.0
237,45.0,1,1,16.0
