In [60]:
#Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
from sklearn.linear_model import LinearRegression

# Loading and Preprocessing Data

In [78]:
# Load data
# Read the NFWBS_PUF_2016 CSV from the resources/ folder into a DataFrame.
# The path is relative, so it resolves against the current working directory.
file_path = Path("resources/NFWBS_PUF_2016_data.csv")
df = pd.read_csv(file_path)
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,PUF_ID,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FWB1_1,FWB1_2,FWB1_3,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
0,10350,2,3,5,5,6,55,3,3,3,...,1,4,8,0,0,0,0,1,0,0.367292
1,7740,1,3,6,6,6,51,2,2,3,...,1,2,3,0,0,0,0,2,0,1.327561
2,13699,1,3,4,3,4,49,3,3,3,...,1,4,9,0,0,0,1,2,1,0.835156
3,7267,1,3,6,6,6,49,3,3,3,...,1,3,7,0,0,0,0,1,0,1.410871
4,7375,1,3,4,4,4,49,3,3,3,...,1,2,4,0,0,1,0,4,1,4.260668


In [62]:
# Keep only the columns used below: the FWB score, the ten FWB1_*/FWB2_* items and
# the finalwt column. Per the original note, the other columns are left out because
# they contain the -2 response value (question not asked because respondent not in
# item base).
#
# The selection reads from the freshly loaded `df`; reading `data_df` here would
# raise NameError on a fresh kernel because it has not been assigned yet.
# .copy() makes data_df an independent frame so later column assignments are safe.
#
# NOTE(review): the describe() output further down shows a minimum of -4 in every
# selected survey column - presumably a non-response code. Confirm whether those
# rows should be filtered out before modelling.
data_df = df[['FWBscore','FWB1_1','FWB1_2','FWB1_3','FWB1_4','FWB1_5','FWB1_6',
              'FWB2_1','FWB2_2','FWB2_3','FWB2_4','finalwt']].copy()
data_df

Unnamed: 0,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,FWB1_5,FWB1_6,FWB2_1,FWB2_2,FWB2_3,FWB2_4,finalwt
0,55,3,3,3,3,2,3,2,3,2,4,0.367292
1,51,2,2,3,3,3,4,2,2,2,3,1.327561
2,49,3,3,3,3,3,3,3,3,3,3,0.835156
3,49,3,3,3,3,3,3,3,3,3,3,1.410871
4,49,3,3,3,3,3,3,3,3,3,3,4.260668
...,...,...,...,...,...,...,...,...,...,...,...,...
6389,61,3,3,1,3,3,2,2,3,2,2,0.522504
6390,59,3,4,2,4,5,2,2,3,2,2,1.015219
6391,59,3,4,3,3,3,2,2,4,1,2,1.136270
6392,46,2,2,3,2,5,3,3,3,1,5,1.224941


In [63]:
# Weighted score per row: FWBscore multiplied by the finalwt column.
# assign() returns a new frame rather than inserting a column into a frame that was
# produced by column selection, which avoids pandas' SettingWithCopyWarning.
data_df = data_df.assign(wt_score_df=data_df['FWBscore'] * data_df['finalwt'])
data_df

Unnamed: 0,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,FWB1_5,FWB1_6,FWB2_1,FWB2_2,FWB2_3,FWB2_4,finalwt,wt_score_df
0,55,3,3,3,3,2,3,2,3,2,4,0.367292,20.201053
1,51,2,2,3,3,3,4,2,2,2,3,1.327561,67.705596
2,49,3,3,3,3,3,3,3,3,3,3,0.835156,40.922635
3,49,3,3,3,3,3,3,3,3,3,3,1.410871,69.132679
4,49,3,3,3,3,3,3,3,3,3,3,4.260668,208.772739
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6389,61,3,3,1,3,3,2,2,3,2,2,0.522504,31.872719
6390,59,3,4,2,4,5,2,2,3,2,2,1.015219,59.897899
6391,59,3,4,3,3,3,2,2,4,1,2,1.136270,67.039953
6392,46,2,2,3,2,5,3,3,3,1,5,1.224941,56.347287


In [64]:
# Summary statistics for every column; the min row is a quick check for
# out-of-range codes (e.g. negative values) in the survey items.
data_df.describe()

Unnamed: 0,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,FWB1_5,FWB1_6,FWB2_1,FWB2_2,FWB2_3,FWB2_4,finalwt,wt_score_df
count,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0
mean,56.034094,3.048014,3.191899,2.531279,3.285737,2.771505,3.071942,2.33594,3.388176,2.029246,2.687989,1.0,54.202316
std,14.154676,1.235221,1.11413,1.196235,1.052283,1.269624,1.177299,1.178593,1.267983,1.105532,1.135752,0.585406,32.287129
min,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,0.165567,-5.015855
25%,48.0,2.0,3.0,2.0,3.0,2.0,2.0,1.0,3.0,1.0,2.0,0.600582,32.990588
50%,56.0,3.0,3.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,3.0,0.845213,46.171876
75%,65.0,4.0,4.0,3.0,4.0,4.0,4.0,3.0,4.0,3.0,3.0,1.251415,66.921936
max,95.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.638674,411.597797


In [65]:
# Feature matrix: every column of data_df except the target (FWBscore), the
# finalwt column and the derived wt_score_df column.
# drop() returns a new frame, so data_df itself is left unchanged.
X = data_df.drop(columns=['FWBscore', 'finalwt', 'wt_score_df'])
X.head()

Unnamed: 0,FWB1_1,FWB1_2,FWB1_3,FWB1_4,FWB1_5,FWB1_6,FWB2_1,FWB2_2,FWB2_3,FWB2_4
0,3,3,3,3,2,3,2,3,2,4
1,2,2,3,3,3,4,2,2,2,3
2,3,3,3,3,3,3,3,3,3,3
3,3,3,3,3,3,3,3,3,3,3
4,3,3,3,3,3,3,3,3,3,3


In [66]:
# Dependent variable y as a 2-D (n_rows, 1) NumPy array: selecting the column with
# a one-element list keeps it two-dimensional, so no reshape is needed.
y = data_df[['FWBscore']].to_numpy()
y[:5]

array([[55],
       [51],
       [49],
       [49],
       [49]], dtype=int64)

# Building the Linear Regression Model

In [67]:
# Create a model with scikit-learn
# LinearRegression is an ordinary least-squares regressor; default settings are used.
model = LinearRegression()

In [68]:
# Fit the model on the full feature matrix X and target y.
# No sample_weight is passed, so every row counts equally, and no rows are held
# out for testing.
model.fit(X, y)

In [69]:
# Display the fitted coefficients. Because y is 2-D, coef_ is a 2-D array whose
# single row holds one coefficient per feature column of X (not a single slope).
print(f"Model's slope: {model.coef_}")

Model's slope: [[ 0.84329595  0.82262305 -2.0189136   1.2688397  -1.33365171 -1.60626036
  -2.5349672   1.28308009 -1.79639979 -1.86589867]]


In [70]:
# Display the y-intercept
# intercept_ is a one-element array here because y was passed as a 2-D column.
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: [70.64499112]


In [71]:
# Display the model's fitted formula: the intercept plus the coefficient vector
# applied to X. coef_[0] selects the single row of the 2-D coefficient array.
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = [70.64499112] + [ 0.84329595  0.82262305 -2.0189136   1.2688397  -1.33365171 -1.60626036
 -2.5349672   1.28308009 -1.79639979 -1.86589867]X


In [72]:
# Make predictions using the X set
# These are in-sample predictions: X is the same data the model was fitted on.
predicted_y_values = model.predict(X)

In [73]:
# Build a copy of the working frame with the model's predictions attached as a
# new predicted_FWBscore column. assign() returns a fresh DataFrame, so data_df
# itself is left untouched.
predicted_df = data_df.assign(predicted_FWBscore=predicted_y_values)

# Display the frame with the predicted FWB score next to the actual values
predicted_df

Unnamed: 0,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,FWB1_5,FWB1_6,FWB2_1,FWB2_2,FWB2_3,FWB2_4,finalwt,wt_score_df,predicted_FWBscore
0,55,3,3,3,3,2,3,2,3,2,4,0.367292,20.201053,53.629354
1,51,2,2,3,3,3,4,2,2,2,3,1.327561,67.705596,49.606341
2,49,3,3,3,3,3,3,3,3,3,3,0.835156,40.922635,49.830234
3,49,3,3,3,3,3,3,3,3,3,3,1.410871,69.132679,49.830234
4,49,3,3,3,3,3,3,3,3,3,3,4.260668,208.772739,49.830234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6389,61,3,3,1,3,3,2,2,3,2,2,0.522504,31.872719,61.671587
6390,59,3,4,2,4,5,2,2,3,2,2,1.015219,59.897899,59.076832
6391,59,3,4,3,3,3,2,2,4,1,2,1.136270,67.039953,61.535862
6392,46,2,2,3,2,5,3,3,3,1,5,1.224941,56.347287,44.089174


## Linear Regression Model Assessment

In [74]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [75]:
# Assess the fit on the same data the model was trained on (in-sample metrics).

# Error of the predictions, in squared units (mse) and in target units (rmse).
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)

# Spread of the observed target, for comparison against the rmse.
std = y.std()

# Coefficient of determination, computed two ways: r2_score() on the stored
# predictions, and model.score(), which reports R^2 for a regressor.
r2 = r2_score(y, predicted_y_values)
score = model.score(X, y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.9253742601327382.
The r2 is 0.9253742601327382.
The mean squared error is 14.949290665540028.
The root mean squared error is 3.86643125705605.
The standard deviation is 14.15356905358079.
