In [None]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import sagemaker.amazon.common as smac
from sklearn.model_selection import train_test_split
from sagemaker import Session
from sagemaker.estimator import Estimator

In [None]:
# One boto3 session pins the region; every AWS client below is derived from it
# so that S3 and SageMaker calls share the same region and credentials.
session = boto3.Session(region_name="us-east-2")
sagemaker_session = sagemaker.Session(boto_session=session)

# Define AWS resources
s3_bucket = "recommender-system-jcontreras"
s3_prefix = "training-data"
# Placeholder ARN: the <YOUR__ACCOUNT_ID> token must be replaced before running.
sagemaker_role = "arn:aws:iam::<YOUR__ACCOUNT_ID>:role/AmazonSageMaker-ExecutionRole"

# Initialize boto3 clients (from the pinned session, not the process default)
s3_client = session.client("s3")

In [None]:
# Step 1: Load and preprocess data
def _write_recordio(features, labels, file_path):
    """Write a dense feature matrix and its labels to a RecordIO-protobuf file."""
    with open(file_path, "wb") as f:
        smac.write_numpy_to_dense_tensor(f, features, labels)


def load_and_preprocess_data():
    """Build the train/test RecordIO files from the raw CSVs and upload them to S3.

    Reads ``movies.csv`` and ``ratings.csv`` from ``s3_bucket``, inner-joins them
    on ``movieId``, replaces ``userId`` and ``movieId`` with integer category
    codes, splits 80/20 with a fixed seed, writes ``train.recordio`` and
    ``test.recordio`` in the working directory, and uploads them under
    ``{s3_prefix}/train/`` and ``{s3_prefix}/test/``.

    Returns:
        None. The results are the two local files and the two S3 objects.
    """
    feature_columns = ["userId", "movieId"]
    label_column = "rating"

    # Load dataset
    movies_df = pd.read_csv(f"s3://{s3_bucket}/movies.csv")
    ratings_df = pd.read_csv(f"s3://{s3_bucket}/ratings.csv")

    # Print column information for debugging
    print("Movies columns:", movies_df.columns)
    print("Ratings columns:", ratings_df.columns)

    # Inner join: ratings whose movieId is absent from movies.csv are dropped.
    merged = pd.merge(ratings_df, movies_df, on="movieId")

    # Take an explicit copy so the column assignments below modify an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    data = merged[feature_columns + [label_column]].copy()

    # Encode user_id and movie_id as integers
    for column in feature_columns:
        data[column] = data[column].astype("category").cat.codes

    # NOTE(review): the two ids are written as two dense numeric features.
    # Factorization Machines are usually fed one-hot sparse vectors; confirm
    # this encoding is intended before relying on the trained model.
    feature_dim = len(feature_columns)
    print("Feature Dimension:", feature_dim)

    # Split data into train and test sets
    train, test = train_test_split(data, test_size=0.2, random_state=42)

    # Separate features (X) and labels (y), serialize, and upload each split.
    for split_name, split in (("train", train), ("test", test)):
        features = split[feature_columns].values.astype("float32")
        labels = split[label_column].values.astype("float32")
        file_name = f"{split_name}.recordio"

        _write_recordio(features, labels, file_name)
        # Reuse the module-level S3 client rather than creating a second one.
        s3_client.upload_file(file_name, s3_bucket, f"{s3_prefix}/{split_name}/{file_name}")

    print("Data preprocessing completed and uploaded to S3.")

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Step 2: Train model
def train_model(feature_dim=2, num_factors=64, predictor_type="regressor"):
    """Train a SageMaker Factorization Machines model on the uploaded RecordIO data.

    Args:
        feature_dim: Width of the feature vectors in the training data. Must
            match what preprocessing wrote (two columns: userId, movieId).
        num_factors: Dimensionality of the factorization.
        predictor_type: ``"regressor"`` or ``"binary_classifier"``. Defaults to
            ``"regressor"`` because the labels are the raw ``rating`` values,
            not 0/1 classes.

    Returns:
        The fitted ``Estimator``.
    """
    # Resolve the algorithm image for the region the notebook's session is
    # pinned to, rather than whatever the process-default region happens to be
    # (which can be None when no default is configured).
    container = sagemaker.image_uris.retrieve(
        framework="factorization-machines",
        region=sagemaker_session.boto_region_name,
        version="1",
    )

    # Define estimator
    estimator = Estimator(
        image_uri=container,
        role=sagemaker_role,
        instance_count=1,
        instance_type="ml.m5.large",
        output_path=f"s3://{s3_bucket}/output",
        sagemaker_session=sagemaker_session,
    )

    estimator.set_hyperparameters(
        feature_dim=feature_dim,
        num_factors=num_factors,
        predictor_type=predictor_type,
    )

    # Specify input data
    recordio_content_type = "application/x-recordio-protobuf"
    train_input = sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{s3_bucket}/{s3_prefix}/train/",
        content_type=recordio_content_type,
    )
    test_input = sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{s3_bucket}/{s3_prefix}/test/",
        content_type=recordio_content_type,
    )

    # Factorization Machines accepts a "train" channel and an optional "test"
    # channel; the held-out split therefore goes to "test".
    estimator.fit({
        "train": train_input,
        "test": test_input,
    })

    print("Model training completed.")

    return estimator


In [None]:
# Step 3: Deploy model
def deploy_model(
    estimator,
    endpoint_name="recommender-endpoint",
    instance_type="ml.m5.large",
    initial_instance_count=1,
):
    """Deploy a trained estimator to a real-time endpoint.

    Args:
        estimator: Object exposing ``deploy(initial_instance_count=...,
            instance_type=..., endpoint_name=...)``.
        endpoint_name: Name of the endpoint to create.
        instance_type: Instance type hosting the endpoint.
        initial_instance_count: Number of instances behind the endpoint.

    Returns:
        Whatever ``estimator.deploy`` returns (the predictor handle).
    """
    # Deploy the model
    predictor = estimator.deploy(
        initial_instance_count=initial_instance_count,
        instance_type=instance_type,
        endpoint_name=endpoint_name,
    )

    # Report the name actually used instead of repeating a literal.
    print(f"Model deployed at endpoint: {endpoint_name}")
    return predictor

# Execute the pipeline: preprocess -> train -> deploy.
if __name__ == "__main__":
    load_and_preprocess_data()
    trained_estimator = train_model()
    # Keep the value returned by deploy_model so the endpoint can still be
    # invoked or cleaned up from later cells instead of being orphaned.
    predictor = deploy_model(trained_estimator)