In [0]:
import numpy as np

# 1. Coefficient matrix 'a': one row per equation, one column per unknown.
#      2*x1 + 3*x2 = 12
#      1*x1 + 4*x2 = 11
a = np.array([[2, 3], [1, 4]])

# 2. Right-hand-side vector 'b' (one entry per equation)
b = np.array([12, 11])

# 3. Solve a @ x = b for the unknown vector x = [x1, x2]
x = np.linalg.solve(a, b)

# Report the inputs and the solution
print("--- Solving the System a*x = b ---")
print(f"Coefficient Matrix (a):\n{a}")
print(f"\nResult Vector (b):\n{b}")
print("-" * 30)
print(f"Solution Vector (x1, x2):\n{x}")

# 4. Sanity check: multiplying a by the solution should reproduce b
b_check = a @ x
print(f"\nVerification (a * x):\n{b_check}")

In [0]:
import numpy as np

def als_numpy(R, rank=2, max_iters=10, alpha=0.01, lambda_reg=0.1, seed=None, verbose=False):
    """
    Alternating Least Squares (ALS) matrix factorization using NumPy.

    Factorizes R ~= U @ V.T by alternately holding one factor matrix fixed and
    solving a regularized least-squares problem for each row of the other,
    using only the observed (strictly positive) entries of R.

    Parameters
    ----------
    R : array-like, shape (num_users, num_items)
        Rating matrix. Entries <= 0 are treated as missing.
    rank : int
        Number of latent factors (k).
    max_iters : int
        Number of full ALS sweeps (user update + item update). 0 returns the
        random initialization.
    alpha : float
        Unused. ALS has no learning rate; the parameter is kept only so that
        existing callers passing it keep working.
    lambda_reg : float
        Regularization strength (lambda) added to the diagonal of each
        normal-equation matrix.
    seed : int or None
        If given, the factors are initialized from a private
        np.random.RandomState(seed). If None, the global NumPy random state
        is used, exactly as before.
    verbose : bool
        If True, print the regularized squared error after every sweep.

    Returns
    -------
    U : ndarray, shape (num_users, rank)
        User factors.
    V : ndarray, shape (num_items, rank)
        Item factors.
    R_pred : ndarray, shape (num_users, num_items)
        Dense prediction matrix U @ V.T.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a normal-equation matrix is singular (e.g. lambda_reg == 0 and a
        user or item has fewer than `rank` independent observations).
    """
    R = np.asarray(R)
    num_users, num_items = R.shape

    # 1. Initialize factor matrices (U: Users x Rank, V: Items x Rank).
    #    U is drawn before V so the unseeded stream matches earlier behavior.
    rng = np.random if seed is None else np.random.RandomState(seed)
    U = rng.rand(num_users, rank)
    V = rng.rand(num_items, rank)

    # Observed ratings are the strictly positive entries of R.
    observed_mask = R > 0

    # lambda * I is the same for every row solve, so build it once.
    reg_term = lambda_reg * np.eye(rank)

    # 2. ALS optimization loop
    for iteration in range(max_iters):

        # --- A. Fix V, solve for each user's factors ---
        # U_i = (V_i^T V_i + lambda I)^-1 V_i^T R_i
        for i in range(num_users):
            rated_by_user = observed_mask[i, :]
            V_i = V[rated_by_user]
            R_i = R[i, rated_by_user]
            U[i, :] = np.linalg.solve(V_i.T @ V_i + reg_term, V_i.T @ R_i)

        # --- B. Fix U, solve for each item's factors ---
        # V_j = (U_j^T U_j + lambda I)^-1 U_j^T R_j
        for j in range(num_items):
            raters_of_item = observed_mask[:, j]
            U_j = U[raters_of_item]
            R_j = R[raters_of_item, j]
            V[j, :] = np.linalg.solve(U_j.T @ U_j + reg_term, U_j.T @ R_j)

        # --- C. Optional monitoring: regularized squared error on observed entries ---
        if verbose:
            error = (U @ V.T)[observed_mask] - R[observed_mask]
            loss = np.sum(error**2) + lambda_reg * (np.sum(U**2) + np.sum(V**2))
            print(f"Iteration {iteration+1}: Loss = {loss:.4f}")

    # Computed after the loop so it is defined even when max_iters == 0.
    R_pred = U @ V.T

    return U, V, R_pred

# --- RUN THE EXAMPLE ---

# The toy rating matrix R (Users x Items); 0 marks a missing rating
R = np.array([
    [5., 3., 0., 1.],
    [4., 0., 0., 1.],
    [1., 1., 0., 5.],
    [0., 0., 5., 4.],
    [0., 1., 5., 4.]
])

# als_numpy starts from random factors, so fix the global NumPy seed first;
# otherwise the printed factors and predictions change on every run.
np.random.seed(42)

# Run the ALS algorithm
U_factors, V_factors, R_predicted = als_numpy(R, rank=2, max_iters=20, lambda_reg=0.1)

print("--- Original Rating Matrix (R) ---")
print(R)
print("\n--- Predicted Rating Matrix (R_pred) ---")
# Round the prediction for readability
print(np.round(R_predicted, 2))
print("\n--- Extracted User Factors (U) ---")
print(np.round(U_factors, 4))

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql.functions import col

In [0]:
# Explicit schema for the ratings files: four nullable columns, in file order.
movie_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user_id", IntegerType()),
        ("item_id", IntegerType()),
        ("rating", FloatType()),
        ("timestamp", IntegerType()),
    )
])

In [0]:
# Read the headerless, '::'-delimited ratings file using the explicit schema.
training_raw = (
    spark.read
    .schema(movie_schema)
    .options(header=False, sep="::")  # no header row; custom delimiter
    .csv("dbfs:/FileStore/BDA_Datasets/sample_movielens_ratings.txt")  # Often a .dat extension for this format
)

# Keep only the three columns ALS needs, and cache them for the repeated fits below
training_data = training_raw.select("user_id", "item_id", "rating")
training_data.cache()



In [0]:
# Show the selected training columns (user_id, item_id, rating).
# NOTE(review): `display` is not defined in this notebook; presumably it is the
# Databricks notebook builtin — confirm before running elsewhere.
display(training_data)

In [0]:
# List the dataset directory to check which files are available.
# NOTE(review): `dbutils` is not defined in this notebook; presumably it is the
# Databricks utilities object — confirm before running elsewhere.
dbutils.fs.ls("dbfs:/FileStore/BDA_Datasets/")

In [0]:
# 3. Load the smaller testing data with the same schema and no header row.
# NOTE(review): this file is read with sep="," while the training file uses "::" —
# confirm the delimiter matches the actual test file.
test_raw = (
    spark.read
    .schema(movie_schema)
    .options(header=False, sep=",")
    .csv("dbfs:/FileStore/BDA_Datasets/movielens_test.data")
)

# Keep the same three columns as the training data
test_data = test_raw.select("user_id", "item_id", "rating")

In [0]:
# Show the selected test columns (user_id, item_id, rating).
# NOTE(review): `display` is presumably the Databricks notebook builtin — confirm.
display(test_data)

In [0]:
from pyspark.ml.recommendation import ALS

# --- TRAINING THE MODEL ---

# Baseline ALS estimator: column bindings first, then the hyperparameters.
als = ALS(
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rating",
    rank=10,
    maxIter=10,
    regParam=0.01,
    coldStartStrategy="drop",
    seed=42,
)

# Fit the baseline model on the full training data
model = als.fit(training_data)





In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Score the training data with the fitted baseline model
predictions_train = model.transform(training_data)

# 2. RMSE evaluator comparing the "prediction" column against the "rating" label
train_evaluator = RegressionEvaluator(
    labelCol="rating", predictionCol="prediction", metricName="rmse"
)

# 3. Drop any rows containing nulls, then compute the RMSE
rmse_train = train_evaluator.evaluate(
    predictions_train.na.drop()
)

print(f"Root Mean Squared Error (RMSE) on TRAINING data = {rmse_train}")

In [0]:
# Show the training rows together with the model's "prediction" column.
# NOTE(review): `.display()` on a DataFrame is presumably a Databricks extension — confirm.
predictions_train.display()

In [0]:
# --- EVALUATION ---

# Score the held-out test data with the baseline model
predictions_test = model.transform(test_data)

# RMSE evaluator comparing the "prediction" column against the "rating" label
test_evaluator = RegressionEvaluator(
    labelCol="rating", predictionCol="prediction", metricName="rmse"
)

# Drop any rows containing nulls, then compute the RMSE
rmse_test = test_evaluator.evaluate(
    predictions_test.na.drop()
)
print(f"Root Mean Squared Error (RMSE) on TEST data = {rmse_test}")

In [0]:
# Show the test rows together with the baseline model's "prediction" column.
# NOTE(review): `.display()` on a DataFrame is presumably a Databricks extension — confirm.
predictions_test.display()

Let's do a parameter search. First, I split the training data into a training set and a validation set, so that the test set is not touched until the parameters have been chosen.

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Hold out 20% of the training data for validation (80/20 split, fixed seed)
train_set, validation_set = training_data.randomSplit([0.8, 0.2], seed=42)

# Both splits are reused for every parameter combination, so cache them
train_set.cache()
validation_set.cache()

# One shared RMSE evaluator ("prediction" vs. "rating") for the parameter search
evaluator = RegressionEvaluator(
    labelCol="rating", predictionCol="prediction", metricName="rmse"
)

Since cross-validation can be costly, this time we keep it simple: we loop over a few parameter combinations, train each one on the 80% training split, and evaluate it only on the 20% validation split.

In [0]:
from pyspark.ml.recommendation import ALS #I keep adding those in case you want to run this independently

# --- HOLD-OUT PARAMETER SEARCH ---
# Candidate (rank, regParam) pairs to evaluate
param_combinations = [
    (5, 0.1),
    (10, 0.1),
    (10, 0.01),
    (20, 0.1),
    (20, 0.001),
]

# Running best and the full history of results
best_rmse = float("inf")
best_params = {}
results = []

print("Starting Hold-Out Validation...")

for current_rank, current_regParam in param_combinations:

    # 1. Build an ALS estimator for this candidate (everything else held fixed)
    als = ALS(
        userCol="user_id",
        itemCol="item_id",
        ratingCol="rating",
        rank=current_rank,
        regParam=current_regParam,
        maxIter=10,
        coldStartStrategy="drop",
        seed=42,
    )

    # 2. Fit on the training split only
    model = als.fit(train_set)

    # 3. Score the validation split
    predictions = model.transform(validation_set)

    # 4. Validation RMSE, computed after dropping rows that contain nulls
    rmse = evaluator.evaluate(predictions.na.drop())

    results.append(
        {"rank": current_rank, "regParam": current_regParam, "validation_rmse": rmse}
    )

    print(f"Tested Rank={current_rank}, RegParam={current_regParam}: RMSE={rmse:.4f}")

    # 5. Keep this candidate only if it strictly improves on the best so far
    if not rmse < best_rmse:
        continue
    best_rmse = rmse
    best_params = {
        "rank": current_rank,
        "regParam": current_regParam,
        "maxIter": 10,  # Fixed for this example
    }

print("-" * 40)
print(f"Validation Complete. Best RMSE: {best_rmse:.4f}")
print(f"Best Parameters: Rank={best_params['rank']}, RegParam={best_params['regParam']}")

In [0]:
# Rebuild the estimator with the winning rank / regParam from the hold-out search
final_als = ALS(
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rating",
    rank=best_params['rank'],
    regParam=best_params['regParam'],
    maxIter=10,
    coldStartStrategy="drop",
    seed=42,
)

# Fit on the full training data (training + validation splits together)
final_model = final_als.fit(training_data)

print("\nFinal ALS Model trained on all available training data using optimal parameters.")

Now evaluate the final model on the test data again.

In [0]:
# Score the held-out test data with the final (tuned) model
predictions_test = final_model.transform(test_data)

# RMSE evaluator comparing the "prediction" column against the "rating" label
regressor_evaluator = RegressionEvaluator(
    labelCol="rating", predictionCol="prediction", metricName="rmse"
)

# Drop any rows containing nulls, then compute the RMSE
rmse_test = regressor_evaluator.evaluate(
    predictions_test.na.drop()
)
print(f"Root Mean Squared Error (RMSE) on TEST data = {rmse_test}")

In [0]:
# Show the test rows together with the final model's "prediction" column.
# NOTE(review): `.display()` on a DataFrame is presumably a Databricks extension — confirm.
predictions_test.display()

# ParamGrid & CrossValidation (for future reference; I'm not going to cover this part in class)

In [0]:
# Imported in this cell so it can be run on its own
from pyspark.ml.tuning import ParamGridBuilder

# 1. Define the search space for hyperparameters
# Note: You can expand these lists to search more combinations!
# The chain is wrapped in parentheses instead of using backslash continuations:
# a stray space after a trailing backslash is a SyntaxError.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 10, 15])
    .addGrid(als.regParam, [0.01, 0.1, 1.0])
    .addGrid(als.maxIter, [10])
    .build()
)

# The grid above will test 3 ranks * 3 regParams * 1 maxIter = 9 total models.

In [0]:
# Imported in this cell so it can be run on its own
from pyspark.ml.tuning import CrossValidator

# Cross-validator over the ALS estimator and the parameter grid, scored with
# the shared RMSE evaluator
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,    # K-Fold Cross-Validation (commonly 3 or 5)
    seed=42
)

print("Starting 5-Fold Cross-Validation...")

# 2. Run Cross-Validation (This is the time-consuming step!)
# The CrossValidator will fit the total of 9 models * 5 folds = 45 ALS models.
cv_model = cv.fit(training_data)

print("Cross-Validation complete.")