# Assignment 04

Regression Analysis in R

Julio Vargas (Boston University)  
September 21, 2025

In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

np.random.seed(42)

pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")

# Show Schema and Sample Data
print("---This is Diagnostic check, No need to print it in the final doc---")

# df.printSchema() # comment this line when rendering the submission
df.show(5)
print(df.count())
#pd.set_option("display.max_rows", None)  
#pd.DataFrame(df.columns, columns=["Column Names"])


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/05 21:51:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

---This is Diagnostic check, No need to print it in the final doc---


25/10/05 21:51:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+------

[Stage 3:>                                                          (0 + 1) / 1]

72498


                                                                                

## Missing Value Treatment

Replace the missing values in *Salary* by the median of *Salary* based on the *EMPLOYMENT_TYPE* or *EMPLOYMENT_TYPE_NAME*.  
If both are missing, then replace them with the overall median of *Salary*.


In [2]:
from pyspark.sql.functions import col, when, isnan, count
# 1. Replace the missing values in Salary by median of Salary based on the EMPLOYMENT_TYPE or EMPLOYMENT_TYPE_NAME. If both are missing, then replace

from pyspark.sql import Window
from pyspark.sql.functions import col, when, isnan, count, expr, median
from pyspark.sql import functions as F

# Calculate overall median salary
overall_median_salary = df.approxQuantile("SALARY", [0.5], 0.01)[0]

# Calculate median_salary by EMPLOYMENT_TYPE
median_by_employment_type = df.groupBy("EMPLOYMENT_TYPE").agg(
    expr("percentile_approx(SALARY, 0.5)").alias("median_salary_emp_type")
)

# Calculate median salary by EMPLOYMENT_TYPE_NAME
median_by_employment_type_name = df.groupBy("EMPLOYMENT_TYPE_NAME").agg(
    expr("percentile_approx(SALARY, 0.5)").alias("median_salary_emp_type_name")
)

# Join median values back to the original dataframe
df_salary_imputed = df.join(median_by_employment_type, on="EMPLOYMENT_TYPE", how="left").join(median_by_employment_type_name, on="EMPLOYMENT_TYPE_NAME", how="left")
df_salary_imputed.show(5)

# Replace missing SALARY values
df_salary_imputed = df_salary_imputed.withColumn("SALARY", when(col("SALARY").isNull(),
    when(col("median_salary_emp_type").isNotNull(), col("median_salary_emp_type"))
    .when(col("median_salary_emp_type_name").isNotNull(), col("median_salary_emp_type_name"))
    .otherwise(overall_median_salary)
).otherwise(col("SALARY")))


print(df_salary_imputed.count())


                                                                                

+--------------------+---------------+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+------

[Stage 14:>                                                         (0 + 1) / 1]

72498


                                                                                

# Feature Engineering

Feature Engineering is a crucial step in preparing your data for machine learning. In this lab, we will focus on the following tasks:

1. Drop rows with missing values in the target variable and key features.
2. By now you are already familiar with the code and the data. Based on your understanding please choose any 3 (my code output has 10) variables as:
   1. three continuous variables and, `MIN_YEARS_EXPERIENCE` (total 4, use your best judgment!)
   2. two categorical.
   3. Your dependent variable (y) is `SALARY`.

3. Convert categorical variables into numerical representations using StringIndexer and OneHotEncoder.
4. Assemble features into a single vector using VectorAssembler.
5. Split the data into training and testing sets.
6. You can use pipeline to do the above steps in one go.
7. Create a new column `MIN_YEARS_EXPERIENCE_SQ` by squaring the `MIN_YEARS_EXPERIENCE` column.
8. Assemble the polynomial features into a new vector column `features_poly` using VectorAssembler.
9. Show the final structure of the DataFrame with the new features.


In [3]:
#| eval: true
#| echo: false
#| fig-align: center

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, regexp_replace, trim
import ast
from pyspark.sql.types import BooleanType, StringType, IntegerType
from pyspark.sql.types import IntegerType,DoubleType,DecimalType

# Drop rows with NA values in relevant columns
regression_df = df_salary_imputed.dropna(subset=[
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE",
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ]).select(
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE",
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING", "median_salary_emp_type", "median_salary_emp_type_name")



# Categorical columns
categorical_cols = [
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
]

regression_df=regression_df.withColumn("IS_INTERNSHIP", col("IS_INTERNSHIP").cast(IntegerType()))
regression_df=regression_df.withColumn("COMPANY_IS_STAFFING", col("COMPANY_IS_STAFFING").cast(IntegerType()))

#convert duration to numeric (ind days)
regression_df = regression_df.withColumn("DURATION", col("DURATION").cast(IntegerType()))

regression_df.show(5, truncate=False)

print(regression_df.count())



#regression_df.show(5, truncate=False)
# regression_df.count()   2243





                                                                                

+--------+--------------------+--------------------+-----------------------------+----------------------+----------------+--------+-------------+-------------------+----------------------+---------------------------+
|SALARY  |MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|EDUCATION_LEVELS_NAME        |EMPLOYMENT_TYPE_NAME  |REMOTE_TYPE_NAME|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|median_salary_emp_type|median_salary_emp_type_name|
+--------+--------------------+--------------------+-----------------------------+----------------------+----------------+--------+-------------+-------------------+----------------------+---------------------------+
|116500.0|2                   |2                   |[\n  "Bachelor's degree"\n]  |Full-time (> 32 hours)|[None]          |6       |0            |0                  |116500                |116500                     |
|116500.0|7                   |7                   |[\n  "No Education Listed"\n]|Full-time (> 32 hours)|[None]          |18      |0

[Stage 30:>                                                         (0 + 1) / 1]

5039


                                                                                

In [4]:
regression_df.select("DURATION").schema

StructType([StructField('DURATION', IntegerType(), True)])

# Linear Regression Model (OLS)

In [5]:
# Clean Education Levels by cleaning \n and array brackets
from pyspark.sql.functions import regexp_replace, trim
regression_df = regression_df.withColumn("EDUCATION_LEVELS_NAME",trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\[\]\n]", "")))


# Index and One-Hot Encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid='skip') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]


# Assemble base features (for GLR and Random Forest)
assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "DURATION"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features"
)

pipeline = Pipeline(stages=indexers + encoders + [assembler])
regression_data = pipeline.fit(regression_df).transform(regression_df)

regression_data.show(5, truncate=False)

regression_data.select("SALARY", "features").show(5, truncate=False)

                                                                                

+--------+--------------------+--------------------+---------------------+----------------------+----------------+--------+-------------+-------------------+----------------------+---------------------------+-------------------------+------------------------+--------------------+-----------------+-----------------------+-------------------------+------------------------+--------------------+-----------------+-----------------------+-------------------------------------------------------------+
|SALARY  |MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|EDUCATION_LEVELS_NAME|EMPLOYMENT_TYPE_NAME  |REMOTE_TYPE_NAME|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|median_salary_emp_type|median_salary_emp_type_name|EDUCATION_LEVELS_NAME_idx|EMPLOYMENT_TYPE_NAME_idx|REMOTE_TYPE_NAME_idx|IS_INTERNSHIP_idx|COMPANY_IS_STAFFING_idx|EDUCATION_LEVELS_NAME_vec|EMPLOYMENT_TYPE_NAME_vec|REMOTE_TYPE_NAME_vec|IS_INTERNSHIP_vec|COMPANY_IS_STAFFING_vec|features                                                     |
+-

                                                                                

+--------+-------------------------------------------------------------+
|SALARY  |features                                                     |
+--------+-------------------------------------------------------------+
|116500.0|(28,[0,1,2,3,21,23,26,27],[2.0,2.0,6.0,1.0,1.0,1.0,1.0,1.0]) |
|116500.0|(28,[0,1,2,4,21,23,26],[7.0,7.0,18.0,1.0,1.0,1.0,1.0])       |
|116500.0|(28,[0,1,2,4,21,23,26],[1.0,1.0,8.0,1.0,1.0,1.0,1.0])        |
|116500.0|(28,[0,1,2,3,21,23,26,27],[1.0,1.0,32.0,1.0,1.0,1.0,1.0,1.0])|
|131100.0|(28,[0,1,2,3,21,23,26,27],[2.0,2.0,11.0,1.0,1.0,1.0,1.0,1.0])|
+--------+-------------------------------------------------------------+
only showing top 5 rows


# Train/Test Split

- Perform a random split of the data into training and testing sets.
- Set a random seed for reproducibility.
- You can choose a number for splitting to your liking, justify your choice.


In [6]:
regression_data.select("DURATION").schema

StructType([StructField('DURATION', IntegerType(), True)])

In [7]:
#| eval: true
#| echo: false
#| fig-align: center

# Split Data
regression_train, regression_test = regression_data.randomSplit([0.8, 0.2], seed=42)
print((regression_data.count(), len(regression_data.columns)))
print((regression_train.count(), len(regression_train.columns)))
print((regression_test.count(), len(regression_test.columns)))


                                                                                

(5039, 22)


                                                                                

(4070, 22)


[Stage 131:>                                                        (0 + 1) / 1]

(969, 22)


                                                                                

## Linear Regression (Get this model from Lab 5.2)

- Train a **Linear Regression** model using the training data.  
  This model is from Lab 5.2 with three more added features.  
- Make sure to use the **features** column from the assembled DataFrame to fit the model.  
- You will run into an important issue here. Please make an effort in figuring it out by yourself.  
  This is one of the most asked interview questions in CapitalOne's management recruiting program.  
- Evaluate the model on the test data.  
- Print the coefficients, intercept, **R²**, **RMSE**, and **MAE**.  
- Use the **summary** object to extract the coefficients and their standard errors, t-values, and p-values.  
- Create a DataFrame to display the coefficients, standard errors, t-values, p-values, and confidence intervals.  
- Interpret the coefficients and their significance, and explain the model performance metrics.


In [8]:
from pyspark.ml.regression import GeneralizedLinearRegression

feature_names = assembler.getInputCols()

glr = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="SALARY",
    family="gaussian", # normal distribution
    link="identity", # standard linear regression
    maxIter=10, # number of iterations for least squares
    regParam=0.3 # regularization parameter (L2 regularization by default)
)

glr_model = glr.fit(regression_train)
summary = glr_model.summary


25/10/05 21:55:06 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/10/05 21:55:06 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [9]:
regression_train.select("DURATION").schema

StructType([StructField('DURATION', IntegerType(), True)])

In [10]:
# Coefficients and Intercept
print("Intercept: {:.4f}".format(glr_model.intercept))
print("Coefficients:")
for i, coef in enumerate(glr_model.coefficients):
    print(f" Feature {i + 1}: {coef:.4f}")


Intercept: 86495.3805
Coefficients:
 Feature 1: 1611.1414
 Feature 2: 1611.1414
 Feature 3: 28.0650
 Feature 4: 1247.8481
 Feature 5: 4389.5997
 Feature 6: 9616.4514
 Feature 7: -25792.4349
 Feature 8: 14128.7132
 Feature 9: -10895.2795
 Feature 10: -1698.1255
 Feature 11: 9142.9649
 Feature 12: -12473.3292
 Feature 13: 1370.8466
 Feature 14: -16521.6858
 Feature 15: 33095.9081
 Feature 16: -9798.3842
 Feature 17: -5106.5752
 Feature 18: 3395.8151
 Feature 19: 5000.3347
 Feature 20: -27518.5037
 Feature 21: 2139.5134
 Feature 22: 6781.0001
 Feature 23: -6406.0896
 Feature 24: 3691.5593
 Feature 25: 7833.5350
 Feature 26: 2120.1061
 Feature 27: -2233.4015
 Feature 28: -38.2065


In [11]:
# Summary stats
print("\n--- Regression Summary ---")
print("Coefficient Standard Errors:", [f"{val:.4f}" for val in summary.coefficientStandardErrors])
print("T Values:", [f"{val:.4f}" for val in summary.tValues])
print("P Values:", [f"{val:.4f}" for val in summary.pValues])



--- Regression Summary ---


[Stage 151:>                                                        (0 + 1) / 1]

Coefficient Standard Errors: ['25276.5887', '25276.5887', '23.0766', '20919.0316', '20935.4220', '20938.8229', '21078.7774', '21081.4962', '21097.5086', '21109.1394', '21269.3946', '21346.9665', '21574.6969', '21648.7352', '21749.9860', '22066.3414', '22082.7237', '22639.3900', '22401.1834', '25723.4373', '25718.2598', '3157.3427', '3732.8539', '3335.2674', '3406.5553', '3789.6546', '5036.6098', '917.1197', '22023.8482']
T Values: ['0.0637', '0.0637', '1.2162', '0.0597', '0.2097', '0.4593', '-1.2236', '0.6702', '-0.5164', '-0.0804', '0.4299', '-0.5843', '0.0635', '-0.7632', '1.5217', '-0.4440', '-0.2312', '0.1500', '0.2232', '-1.0698', '0.0832', '2.1477', '-1.7161', '1.1068', '2.2995', '0.5594', '-0.4434', '-0.0417', '3.9274']
P Values: ['0.9492', '0.9492', '0.2240', '0.9524', '0.8339', '0.6461', '0.2212', '0.5028', '0.6056', '0.9359', '0.6673', '0.5590', '0.9493', '0.4454', '0.1282', '0.6570', '0.8171', '0.8808', '0.8234', '0.2848', '0.9337', '0.0318', '0.0862', '0.2684', '0.0215', '0

                                                                                

In [12]:
# print(f"\nDispersion: {summary.dispersion:.4f}")
print(f"Null Deviance: {summary.nullDeviance:.4f}")
print(f"Residual DF Null: {summary.residualDegreeOfFreedomNull}")
print(f"Deviance: {summary.deviance:.4f}")
print(f"Residual DF: {summary.residualDegreeOfFreedom}")
print(f"AIC: {summary.aic:.4f}")


                                                                                

Null Deviance: 2321265663379.3027
Residual DF Null: 4069
Deviance: 1808433477867.9951
Residual DF: 4041


[Stage 172:>                                                        (0 + 1) / 1]

AIC: 92652.3408


                                                                                

In [13]:
# 1. Pull feature names directly from Java backend
feature_names = summary._call_java("featureNames")

# 2. Construct full table including intercept
features = ["Intercept"] + feature_names
coefs = [glr_model.intercept] + list(glr_model.coefficients)
se = list(summary.coefficientStandardErrors)
tvals = list(summary.tValues)
pvals = list(summary.pValues)

#This block ensures all regression output values (coefficients, errors, t-values, p-values) align correctly before building a summary table.
print("---This is Diagnostic check, No need to print it in the final doc---")
print("Length of features:", len(features))
print("Length of coefs:", len(coefs))
print("Length of se:", len(se))
print("Length of tvals:", len(tvals))
print("Length of pvals:", len(pvals))


---This is Diagnostic check, No need to print it in the final doc---
Length of features: 29
Length of coefs: 29
Length of se: 29
Length of tvals: 29
Length of pvals: 29


In [14]:
import pandas as pd
from tabulate import tabulate
from IPython.display import HTML

coef_table = pd.DataFrame({
    "Feature": features,
    "Estimate": [f"{v:.4f}" if v is not None else None for v in coefs],
    "Std Error": [f"{v:.4f}" if v is not None else None for v in se],
    "t-stat": [f"{v:.4f}" if v is not None else None for v in tvals],
    "p-Value": [f"{v:.4f}" if v is not None else None for v in pvals]
})

# 4. Save for report
coef_table.to_csv("output/glr_summary.csv", index=False)

# 5. Optional pretty print
HTML(coef_table.to_html())


Unnamed: 0,Feature,Estimate,Std Error,t-stat,p-Value
0,Intercept,86495.3805,25276.5887,0.0637,0.9492
1,MIN_YEARS_EXPERIENCE,1611.1414,25276.5887,0.0637,0.9492
2,MAX_YEARS_EXPERIENCE,1611.1414,23.0766,1.2162,0.224
3,DURATION,28.065,20919.0316,0.0597,0.9524
4,"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree""",1247.8481,20935.422,0.2097,0.8339
5,"EDUCATION_LEVELS_NAME_vec_""No Education Listed""",4389.5997,20938.8229,0.4593,0.6461
6,"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree"", ""Master's degree""",9616.4514,21078.7774,-1.2236,0.2212
7,"EDUCATION_LEVELS_NAME_vec_""High school or GED"", ""Bachelor's degree""",-25792.4349,21081.4962,0.6702,0.5028
8,"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree"", ""Master's degree"", ""Ph.D. or professional degree""",14128.7132,21097.5086,-0.5164,0.6056
9,"EDUCATION_LEVELS_NAME_vec_""High school or GED""",-10895.2795,21109.1394,-0.0804,0.9359


In [15]:
print(1617.0 - 25155.0, 1617.0 + 25155.0)


-23538.0 26772.0


In [18]:
#| eval: true
#| echo: false
#| fig-align: center

# Index and One-Hot Encode
# Create squared term for Polynomial Regression
poly_data = regression_data.withColumn("MAX_YEARS_EXPERIENCE_SQ", pow(col("MAX_YEARS_EXPERIENCE"), 2))

# Assemble polynomial features
assembler_poly = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE_SQ", "MAX_YEARS_EXPERIENCE",
        "DURATION"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features_poly"
)

poly_data = assembler_poly.transform(poly_data)

# Show final structure
poly_data.select("SALARY",  "features_poly").show(5, truncate=False) #"SALARY", "features", "features_poly"


                                                                                

+--------+-------------------------------------------------------------------+
|SALARY  |features_poly                                                      |
+--------+-------------------------------------------------------------------+
|116500.0|(29,[0,1,2,3,4,22,24,27,28],[2.0,4.0,2.0,6.0,1.0,1.0,1.0,1.0,1.0]) |
|116500.0|(29,[0,1,2,3,5,22,24,27],[7.0,49.0,7.0,18.0,1.0,1.0,1.0,1.0])      |
|116500.0|(29,[0,1,2,3,5,22,24,27],[1.0,1.0,1.0,8.0,1.0,1.0,1.0,1.0])        |
|116500.0|(29,[0,1,2,3,4,22,24,27,28],[1.0,1.0,1.0,32.0,1.0,1.0,1.0,1.0,1.0])|
|131100.0|(29,[0,1,2,3,4,22,24,27,28],[2.0,4.0,2.0,11.0,1.0,1.0,1.0,1.0,1.0])|
+--------+-------------------------------------------------------------------+
only showing top 5 rows


In [19]:
# Split Data
polyreg_train, polyreg_test = poly_data.randomSplit([0.8, 0.2], seed=42)
print((poly_data.count(), len(poly_data.columns)))
print((polyreg_train.count(), len(polyreg_train.columns)))
print((polyreg_test.count(), len(polyreg_test.columns)))


                                                                                

(5039, 24)


                                                                                

(4070, 24)


[Stage 218:>                                                        (0 + 1) / 1]

(969, 24)


                                                                                

In [21]:
from pyspark.ml.regression import GeneralizedLinearRegression

feature_names = assembler.getInputCols()

poly_glr_max_years = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="SALARY",
    family="gaussian", # normal distribution
    link="identity", # standard linear regression
    maxIter=10, # number of iterations for least squares
    regParam=0.3 # regularization parameter (L2 regularization by default)
)

poly_glr_max_years_model = poly_glr_max_years.fit(poly_data)
poly_summary = poly_glr_max_years_model.summary




                                                                                

In [22]:
# Coefficients and Intercept
print("Intercept: {:.4f}".format(poly_glr_max_years_model.intercept))
print("Coefficients:")
for i, coef in enumerate(poly_glr_max_years_model.coefficients):
    print(f" Feature {i + 1}: {coef:.4f}")

# Summary stats
print("\n--- Regression Summary ---")
print("Coefficient Standard Errors:", [f"{val:.4f}" for val in poly_summary.coefficientStandardErrors])
print("T Values:", [f"{val:.4f}" for val in poly_summary.tValues])
print("P Values:", [f"{val:.4f}" for val in poly_summary.pValues])


Intercept: 83684.2103
Coefficients:
 Feature 1: 1596.5795
 Feature 2: 1596.5795
 Feature 3: 31.1901
 Feature 4: 2943.3549
 Feature 5: 5624.8675
 Feature 6: 11303.5572
 Feature 7: -25678.5645
 Feature 8: 14668.1325
 Feature 9: -9752.7189
 Feature 10: -118.7467
 Feature 11: 13480.7278
 Feature 12: -9277.3038
 Feature 13: 3310.0724
 Feature 14: -14782.2689
 Feature 15: 35678.4989
 Feature 16: -8064.0197
 Feature 17: -2793.9409
 Feature 18: 17915.8367
 Feature 19: 8358.0949
 Feature 20: -35699.8382
 Feature 21: 32498.9245
 Feature 22: 6376.6043
 Feature 23: -5663.4155
 Feature 24: 4176.7678
 Feature 25: 8503.4618
 Feature 26: 2741.9488
 Feature 27: -765.8130
 Feature 28: -111.3548

--- Regression Summary ---


[Stage 253:>                                                        (0 + 1) / 1]

Coefficient Standard Errors: ['22889.3361', '22889.3361', '20.7234', '20967.0009', '20980.9421', '20982.8316', '21094.7761', '21104.6267', '21115.7024', '21143.7509', '21221.3547', '21327.2560', '21500.7656', '21553.1712', '21698.8688', '21921.5499', '22028.5324', '22141.1180', '22282.5601', '24311.4028', '24305.5662', '3075.8265', '3587.0214', '2925.8458', '2988.0317', '3375.9809', '4590.5476', '825.5726', '21910.9169']
T Values: ['0.0698', '0.0698', '1.5051', '0.1404', '0.2681', '0.5387', '-1.2173', '0.6950', '-0.4619', '-0.0056', '0.6352', '-0.4350', '0.1540', '-0.6859', '1.6443', '-0.3679', '-0.1268', '0.8092', '0.3751', '-1.4684', '1.3371', '2.0731', '-1.5789', '1.4275', '2.8458', '0.8122', '-0.1668', '-0.1349', '3.8193']
P Values: ['0.9444', '0.9444', '0.1324', '0.8884', '0.7886', '0.5901', '0.2235', '0.4871', '0.6442', '0.9955', '0.5253', '0.6636', '0.8777', '0.4928', '0.1002', '0.7130', '0.8991', '0.4185', '0.7076', '0.1420', '0.1813', '0.0382', '0.1144', '0.1535', '0.0044', '0

                                                                                

In [23]:
# print(f"\nDispersion: {summary.dispersion:.4f}")
print(f"Null Deviance: {poly_summary.nullDeviance:.4f}")
print(f"Residual DF Null: {poly_summary.residualDegreeOfFreedomNull}")
print(f"Deviance: {poly_summary.deviance:.4f}")
print(f"Residual DF: {poly_summary.residualDegreeOfFreedom}")
print(f"AIC: {poly_summary.aic:.4f}")


                                                                                

Null Deviance: 2909123659016.3213
Residual DF Null: 5038
Deviance: 2272959595356.6333
Residual DF: 5010


[Stage 271:>                                                        (0 + 1) / 1]

AIC: 114772.9258


                                                                                

In [24]:
# 1. Pull feature names directly from Java backend
feature_names = poly_summary._call_java("featureNames")

# 2. Construct full table including intercept
poly_features = ["Intercept"] + feature_names
poly_coefs = [poly_glr_max_years_model.intercept] + list(poly_glr_max_years_model.coefficients)
poly_se = list(poly_summary.coefficientStandardErrors)
poly_tvals = list(poly_summary.tValues)
poly_pvals = list(poly_summary.pValues)

print("---This is Diagnostic check, No need to print it in the final doc---")
print("Length of features:", len(poly_features))
print("Length of coefs:", len(poly_coefs))
print("Length of se:", len(poly_se))
print("Length of tvals:", len(poly_tvals))
print("Length of pvals:", len(poly_pvals))


---This is Diagnostic check, No need to print it in the final doc---
Length of features: 29
Length of coefs: 29
Length of se: 29
Length of tvals: 29
Length of pvals: 29


In [25]:
import pandas as pd
from tabulate import tabulate
from IPython.display import HTML

poly_coef_table = pd.DataFrame({
    "Feature": poly_features,
    "Estimate": [f"{v:.4f}" if v is not None else None for v in poly_coefs],
    "Std Error": [f"{v:.4f}" if v is not None else None for v in poly_se],
    "t-stat": [f"{v:.4f}" if v is not None else None for v in poly_tvals],
    "p-value": [f"{v:.4f}" if v is not None else None for v in poly_pvals]
})

# 4. Save for report
poly_coef_table.to_csv("output/poly_summary.csv", index=False)

# 5. Optional pretty print
HTML(poly_coef_table.to_html(index=False))


Feature,Estimate,Std Error,t-stat,p-value
Intercept,83684.2103,22889.3361,0.0698,0.9444
MIN_YEARS_EXPERIENCE,1596.5795,22889.3361,0.0698,0.9444
MAX_YEARS_EXPERIENCE,1596.5795,20.7234,1.5051,0.1324
DURATION,31.1901,20967.0009,0.1404,0.8884
"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree""",2943.3549,20980.9421,0.2681,0.7886
"EDUCATION_LEVELS_NAME_vec_""No Education Listed""",5624.8675,20982.8316,0.5387,0.5901
"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree"", ""Master's degree""",11303.5572,21094.7761,-1.2173,0.2235
"EDUCATION_LEVELS_NAME_vec_""High school or GED"", ""Bachelor's degree""",-25678.5645,21104.6267,0.695,0.4871
"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree"", ""Master's degree"", ""Ph.D. or professional degree""",14668.1325,21115.7024,-0.4619,0.6442
"EDUCATION_LEVELS_NAME_vec_""High school or GED""",-9752.7189,21143.7509,-0.0056,0.9955
