<a href="https://colab.research.google.com/github/hbisgin/BigDatav1/blob/main/Lecture15_LinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# ------------------------------------------
# 1. Import libraries
# ------------------------------------------
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

# ------------------------------------------
# 2. Start Spark session
# ------------------------------------------
# getOrCreate() means re-running this cell picks up the session that is
# already active rather than trying to build a second one.
spark = (
    SparkSession.builder
    .appName("LinearRegressionExample")
    .getOrCreate()
)

# ------------------------------------------
# 3. Create sample dataset
# ------------------------------------------
# Six (feature, label) pairs; the label grows by roughly one per unit of
# the feature.
data = [
    (1.0, 2.0), (2.0, 2.8), (3.0, 4.2),
    (4.0, 4.9), (5.0, 6.1), (6.0, 6.8),
]
columns = ["feature", "label"]
df = spark.createDataFrame(data, schema=columns)

# The regressor reads its inputs from a single vector-typed column, so the
# scalar "feature" column is packed into a vector column called "features".
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["feature"], outputCol="features")

# Assemble first, then keep only the two columns the model needs.
train_df = assembler.transform(df)
train_df = train_df.select("features", "label")

train_df.show()


In [0]:
# ------------------------------------------
# 4. Fit Linear Regression model
# ------------------------------------------
# Configure the estimator (which columns hold the inputs and the target),
# then fit it on the assembled training DataFrame.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
)
lr_model = lr.fit(train_df)

# ------------------------------------------
# 5. Print model coefficients
# ------------------------------------------
# Index 0 of the coefficient vector is reported as the slope of the fitted
# line; both values are rounded to three decimals for display.
print(f"Intercept: {lr_model.intercept:.3f}")
print(f"Slope (Coefficient): {lr_model.coefficients[0]:.3f}")


## Predictions

In [0]:
# ------------------------------------------
# 6. Make predictions
# ------------------------------------------
# Score the same rows the model was fitted on. The summary object is
# fetched here as well (it produces no output) so the reporting below is
# purely display code.
predictions = lr_model.transform(train_df)
trainingSummary = lr_model.summary

predictions.show()

# ------------------------------------------
# 7. Evaluate model performance
# ------------------------------------------
# Both metrics are read from the model's own summary rather than computed
# from `predictions`.
# NOTE(review): these describe fit on the training rows only — no held-out
# data is evaluated in this notebook section.
print(f"RMSE: {trainingSummary.rootMeanSquaredError:.3f}")
print(f"R²: {trainingSummary.r2:.3f}")
