In [40]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import warnings
import hvplot.pandas
from sklearn.linear_model import LinearRegression
# Suppress every warning for the rest of the session (no category, message or module filter is given).
# NOTE(review): this also hides warnings raised by the libraries imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [41]:
# Location of the compensation/experience dataset, relative to the working directory
data = Path("compensation_experience")

# Parse the file as CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer=data)

# Show the loaded frame
df

Unnamed: 0,totalyearlycompensation,basesalary,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,yearsofexperience,yearsatcompany
0,127000,107000.0,0,0,0,0,0,1.5,1.5
1,100000,0.0,0,0,0,0,0,5.0,3.0
2,310000,155000.0,0,0,0,0,0,8.0,0.0
3,372000,157000.0,0,0,0,0,0,7.0,5.0
4,157000,0.0,0,0,0,0,0,5.0,3.0
...,...,...,...,...,...,...,...,...,...
62637,327000,155000.0,0,0,0,0,0,10.0,1.0
62638,237000,146900.0,0,0,0,0,0,2.0,2.0
62639,220000,157000.0,0,0,0,0,0,14.0,12.0
62640,280000,194688.0,0,0,0,0,0,8.0,4.0


In [42]:
# First pass: regress total yearly compensation on years of experience only.
# The education columns and base salary will be dropped to start; list the available columns first.
df.columns

Index(['totalyearlycompensation', 'basesalary', 'Masters_Degree',
       'Bachelors_Degree', 'Doctorate_Degree', 'Highschool', 'Some_College',
       'yearsofexperience', 'yearsatcompany'],
      dtype='object')

In [43]:
# Keep only the target and the single predictor used in this first regression.
# The wanted columns are selected (rather than dropping the unwanted ones) so the
# cell can be re-run safely: a second drop() of already-removed columns raises KeyError.
df = df[["totalyearlycompensation", "yearsofexperience"]]
df

Unnamed: 0,totalyearlycompensation,yearsofexperience
0,127000,1.5
1,100000,5.0
2,310000,8.0
3,372000,7.0
4,157000,5.0
...,...,...
62637,327000,10.0
62638,237000,2.0
62639,220000,14.0
62640,280000,8.0


In [44]:
# Pull the target (total yearly compensation) out of the frame
target_column = "totalyearlycompensation"
y = df[target_column]


In [45]:
# Scatter of observed compensation against experience; the plot object is kept
# in a variable so it can be combined with the fitted line further down
scatter_options = {
    "x": "yearsofexperience",
    "y": "totalyearlycompensation",
    "title": "Expected Compensation Based on Years of Experience",
}
compensation_plot = df.hvplot.scatter(**scatter_options)
compensation_plot

In [46]:
# Turn the experience column into a single-column 2-D array, shape (n_rows, 1),
# which is the layout handed to the model below
experience = df["yearsofexperience"].to_numpy()
X = experience.reshape(-1, 1)

# Peek at the first five rows
X[:5]

array([[1.5],
       [5. ],
       [8. ],
       [7. ],
       [5. ]])

In [47]:
# Confirm X is two-dimensional: (number of rows, 1 feature column)
X.shape

(62642, 1)

In [48]:
# Dependent variable: the compensation column selected by label.
# This is a pandas Series taken from the same frame as X, so rows line up one-to-one.
y = df.loc[:, "totalyearlycompensation"]

In [49]:
# Create an unfitted scikit-learn LinearRegression estimator (no arguments, so library defaults apply)
model = LinearRegression()

In [50]:
# Fit the regression: X is the single-column experience matrix, y the total yearly compensation
model.fit(X, y)

In [51]:
# Display the slope. coef_ is an array holding one coefficient per feature,
# so the single slope is printed inside brackets.
print(f"Model's slope: {model.coef_}")

Model's slope: [9994.49319576]


In [52]:
# Display the y-intercept: the constant term of the fitted line,
# i.e. the value predicted at zero years of experience
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 144298.69455784984


In [53]:
# Years of experience to evaluate the fitted line at.
# Defined once so the formula, the computation and the message cannot drift apart;
# change this single value to predict for a different experience level.
years_to_predict = 7

# Display the formula used for the prediction
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * {years_to_predict}")

# Evaluate the line by hand: intercept + slope * years
# (the name y_7 is kept from the original cell, where the value was fixed at 7)
y_7 = model.intercept_ + model.coef_[0] * years_to_predict

# Display the prediction, rounded to cents
print(f"Predicted salary for a person with {years_to_predict} years of experience: ${y_7:.2f}")

Model's formula: y = 144298.69455784984 + 9994.49319575851 * 7
Predicted salary for a person with 7 years of experience: $214260.15


In [54]:
# Predict compensation for every row of X — the same data the model was fitted on,
# so these are in-sample predictions
predicted_y_values = model.predict(X)

In [55]:
# Build a new frame with the observed values plus a column of model predictions;
# assign() returns a new DataFrame, so df itself is left untouched
df_totalyearlycompensation_predicted = df.assign(
    totalyearlycompensation_predicted=predicted_y_values
)

# Show the first rows to compare observed and predicted compensation side by side
df_totalyearlycompensation_predicted.head()

Unnamed: 0,totalyearlycompensation,yearsofexperience,totalyearlycompensation_predicted
0,127000,1.5,159290.434351
1,100000,5.0,194271.160537
2,310000,8.0,224254.640124
3,372000,7.0,214260.146928
4,157000,5.0,194271.160537


In [57]:
# Line plot of yearsofexperience versus the predicted compensation values.
# Rows are sorted by the x column first: a line plot joins points in row order,
# so unsorted data would make the path run back and forth over itself.
best_fit_line = (
    df_totalyearlycompensation_predicted
    .sort_values("yearsofexperience")
    .hvplot.line(
        x="yearsofexperience",
        y="totalyearlycompensation_predicted",
        color="red",
    )
)
best_fit_line

In [59]:
# Overlay the fitted line on the scatter of the original data in a single figure
compensation_plot * best_fit_line

In [60]:
#Model Assessment
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [61]:
# Assess the fitted line against the same observations it was trained on.
# r2 and score are both kept: the printout shows that they report the same value.
r2 = r2_score(y_true=y, y_pred=predicted_y_values)
score = model.score(X, y)

# Error magnitude: MSE, and its square root in the units of the target
mse = mean_squared_error(y_true=y, y_pred=predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the observed target, printed next to the RMSE for comparison
std = np.std(y)

# Report all metrics, one per line
print(
    f"The score is {score}.",
    f"The r2 is {r2}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The score is 0.1788267653287875.
The r2 is 0.1788267653287875.
The mean squared error is 15645822654.19684.
The root mean squared error is 125083.26288595465.
The standard deviation is 138032.64460621306.
