In [None]:
pip install pyspark


In [None]:
from pyspark import SparkFiles
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


In [None]:
# Build the SparkSession used by every later cell (getOrCreate returns the
# active session if one already exists, otherwise it starts a new one).
spark = (
    SparkSession.builder
    .appName("Scalable Machine Learning with Spark")
    .getOrCreate()
)

print("SparkSession Initialized")


In [None]:
# Load dataset
data_url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv"
columns = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]

# Spark's DataFrame readers resolve paths through Hadoop filesystems and cannot
# open an http(s):// URL directly. Let the SparkContext download the file and
# then read the local copy it stored.
spark.sparkContext.addFile(data_url)
local_path = SparkFiles.get(data_url.rsplit("/", 1)[-1])

# Read each record as a raw text line and split it ourselves:
#  * the file has no header row, so names come from `columns`;
#  * the upstream file appears to be whitespace-delimited rather than
#    comma-delimited, so split on runs of whitespace and/or commas to handle both;
#  * every field is cast to double explicitly (a bare "CRIM,ZN,..." string is not
#    a valid schema because it carries no column types).
fields = F.split(F.trim(F.col("value")), r"[\s,]+")
data = spark.read.text(local_path).select(
    *[fields.getItem(i).cast("double").alias(name) for i, name in enumerate(columns)]
)

# Display dataset schema and first few rows
data.printSchema()
data.show(5)


In [None]:
# Prepare the modelling frame: MEDV becomes the "label" column and every
# column before it is packed into a single "features" vector column.
data = data.withColumnRenamed("MEDV", "label")
feature_columns = data.columns[:-1]  # all columns except the trailing label

assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
vectorized_data = (
    assembler
    .transform(data)
    .select("features", "label")
)

# 80/20 train/test split; the fixed seed keeps the split repeatable across runs
train_data, test_data = vectorized_data.randomSplit(weights=[0.8, 0.2], seed=42)

print(f"Training Data Count: {train_data.count()}")
print(f"Testing Data Count: {test_data.count()}")


In [None]:
# Fit a linear regression on the training split.
# `lr` is the unfitted estimator; `lr_model` is the fitted model returned by fit().
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
)
lr_model = lr.fit(train_data)

# Report the learned parameters (one coefficient per assembled feature, plus the intercept)
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)


In [None]:
# Score the held-out split; transform() appends a "prediction" column
predictions = lr_model.transform(test_data)

# Compare predictions against the true labels using root mean squared error
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


In [None]:
# Save the model
# overwrite() replaces any model already stored at this path, so the cell can be re-run.
model_path = "spark_lr_model"
lr_model.write().overwrite().save(model_path)
print(f"Model saved to {model_path}")

# Load the model
# What was saved is the *fitted* model, so it must be read back with
# LinearRegressionModel.load. LinearRegression.load is the loader for the
# unfitted estimator and does not accept a saved model directory.
loaded_model = LinearRegressionModel.load(model_path)
print("Model reloaded successfully!")


In [None]:
# List each input column next to its fitted coefficient.
# NOTE(review): raw coefficient magnitudes are only comparable across features
# if the features share a scale — confirm before reading these as "importance".
coefficient_values = lr_model.coefficients.toArray()
for feature, coef in zip(feature_columns, coefficient_values):
    print(f"Feature: {feature}, Coefficient: {coef:.4f}")
