In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score , mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

In [4]:
Salary_DataSet = pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset3/main/Salaries.csv')

In [5]:
Salary_DataSet.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


In [6]:
Salary_DataSet.keys()

Index(['rank', 'discipline', 'yrs.since.phd', 'yrs.service', 'sex', 'salary'], dtype='object')

In [7]:
Salary_DataSet.shape

(397, 6)

In [8]:
Salary_DataSet.isnull().sum()

rank             0
discipline       0
yrs.since.phd    0
yrs.service      0
sex              0
salary           0
dtype: int64

In [9]:
Salary_DataSet.describe()

Unnamed: 0,yrs.since.phd,yrs.service,salary
count,397.0,397.0,397.0
mean,22.314861,17.61461,113706.458438
std,12.887003,13.006024,30289.038695
min,1.0,0.0,57800.0
25%,12.0,7.0,91000.0
50%,21.0,16.0,107300.0
75%,32.0,27.0,134185.0
max,56.0,60.0,231545.0


In [10]:
rank = Salary_DataSet.groupby('rank').mean()
rank

Unnamed: 0_level_0,yrs.since.phd,yrs.service,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AssocProf,15.453125,11.953125,93876.4375
AsstProf,5.104478,2.373134,80775.985075
Prof,28.300752,22.815789,126772.109023


In [11]:
rank_dict = {'Prof':3,'AssocProf':2,'AsstProf':1}
updated_Salary_DataSet = Salary_DataSet.copy()
updated_Salary_DataSet['rank'] = updated_Salary_DataSet['rank'].map(rank_dict)
updated_rank = updated_Salary_DataSet.groupby('rank').mean()
updated_rank

Unnamed: 0_level_0,yrs.since.phd,yrs.service,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5.104478,2.373134,80775.985075
2,15.453125,11.953125,93876.4375
3,28.300752,22.815789,126772.109023


In [12]:
discipline = Salary_DataSet.groupby('discipline').mean()
discipline

Unnamed: 0_level_0,yrs.since.phd,yrs.service,salary
discipline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,25.381215,19.950276,108548.430939
B,19.74537,15.657407,118028.694444


In [13]:
discipline_dict = {"A": 1, "B" : 2}
updated_Salary_DataSet["discipline"] = updated_Salary_DataSet["discipline"].map(discipline_dict)
updated_discipline = updated_Salary_DataSet.groupby("discipline").mean()
updated_discipline

Unnamed: 0_level_0,rank,yrs.since.phd,yrs.service,salary
discipline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.59116,25.381215,19.950276,108548.430939
2,2.425926,19.74537,15.657407,118028.694444


In [14]:
sex = Salary_DataSet.groupby('sex').mean()
sex

Unnamed: 0_level_0,yrs.since.phd,yrs.service,salary
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,16.512821,11.564103,101002.410256
Male,22.946927,18.273743,115090.418994


In [15]:
sex_dict = {"Female": 1, "Male" : 2}
updated_Salary_DataSet["sex"] = updated_Salary_DataSet["sex"].map(sex_dict)
updated_sex = updated_Salary_DataSet.groupby("sex").mean()
updated_sex

Unnamed: 0_level_0,rank,discipline,yrs.since.phd,yrs.service,salary
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.179487,1.538462,16.512821,11.564103,101002.410256
2,2.536313,1.544693,22.946927,18.273743,115090.418994


In [16]:
updated_Salary_DataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   rank           397 non-null    int64
 1   discipline     397 non-null    int64
 2   yrs.since.phd  397 non-null    int64
 3   yrs.service    397 non-null    int64
 4   sex            397 non-null    int64
 5   salary         397 non-null    int64
dtypes: int64(6)
memory usage: 18.7 KB


In [17]:
updated_Salary_DataSet.columns

Index(['rank', 'discipline', 'yrs.since.phd', 'yrs.service', 'sex', 'salary'], dtype='object')

In [31]:
x = updated_Salary_DataSet.drop('salary', axis=1)
x

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex
0,3,2,19,18,2
1,3,2,20,16,2
2,1,2,4,3,2
3,3,2,45,39,2
4,3,2,40,41,2
...,...,...,...,...,...
392,3,1,33,30,2
393,3,1,31,19,2
394,3,1,42,25,2
395,3,1,25,15,2


In [19]:
y = updated_Salary_DataSet[["salary"]]
y

Unnamed: 0,salary
0,139750
1,173200
2,79750
3,115000
4,141500
...,...
392,103106
393,150564
394,101738
395,95329


In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 80)

In [35]:
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

In [36]:
y_prediction = lr.predict(x_test)
y_prediction[:5]

array([[119557.19169095],
       [131552.01588163],
       [117754.70335036],
       [118799.10433149],
       [120435.08429404]])

In [37]:
y_test.describe()

Unnamed: 0,salary
count,132.0
mean,111641.757576
std,25679.556041
min,62884.0
25%,94264.0
50%,106823.5
75%,128719.0
max,193000.0


In [38]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))
RMSE

17769.543834201893

In [39]:
r_squared = lr.score(x_test,y_test)
r_squared

0.5175190491801952

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 124)


In [47]:
regressor = DecisionTreeRegressor(max_depth=20)
regressor.fit(x_train, y_train)

DecisionTreeRegressor(max_depth=20)

In [48]:
y_prediction = regressor.predict(x_test)
y_prediction[:5]

array([ 93000.,  74500., 166800.,  95408.,  72300.])

In [118]:
y_test.describe()

Unnamed: 0,salary
count,132.0
mean,113298.984848
std,31315.805765
min,63100.0
25%,89616.5
50%,105289.0
75%,131382.0
max,205500.0


In [49]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))
RMSE

29118.47829929184

In [52]:
r_squared = dtr.score(x_test,y_test)
r_squared

0.2910706799165753

In [53]:
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)

  rfr.fit(x_train, y_train)


RandomForestRegressor()

In [55]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 384)

In [57]:
y_prediction = rfr.predict(x_test)
y_prediction[:5]

array([ 97067.95      , 143342.35666667, 123454.94041667, 102076.39      ,
        68693.95      ])

In [58]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))
RMSE

14256.327790351208

In [59]:
r_squared = rfr.score(x_test,y_test)
r_squared

0.7919667307509266