# AWS SageMaker PySpark Titanic Classification Model
This notebook demonstrates how to:
- Load the Titanic dataset from S3
- Preprocess the data using PySpark
- Train a Logistic Regression model
- Save the trained model to S3
- Load the model from S3 and perform manual predictions


In [None]:
# Install dependencies (if not installed)
!pip install pyspark

In [None]:
# Import Required Libraries
import boto3
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.pipeline import Pipeline

In [None]:
# Initialize Spark Session
# Build (or reuse) the session for this notebook and map the s3a:// scheme
# to Hadoop's S3A filesystem implementation.
# NOTE(review): reading s3a:// paths presumably also requires the hadoop-aws
# jars on the classpath — confirm for the target environment.
spark = (
    SparkSession.builder
    .appName("Titanic_Classification")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [None]:
# Load Titanic Dataset from S3
# Replace the placeholder bucket name with the bucket that holds titanic.csv.
s3_bucket = "your-s3-bucket-name"
s3_path = f"s3a://{s3_bucket}/titanic.csv"

# First row is the header; column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(s3_path)

# Preview the first five rows.
df.show(5)

In [None]:
# Data Preprocessing
# Keep only the label column and the six columns used as model inputs.
df = df.select("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare")

# Encode Sex as 1 for "male" and 0 for everything else.
# The encoding is guarded on the column type so that re-running this cell does
# not compare the already-encoded integers against "male" (which never matches).
if dict(df.dtypes)["Sex"] == "string":
    df = df.withColumn("Sex", when(col("Sex") == "male", 1).otherwise(0))

# Fill missing ages with the mean age computed over the loaded dataset.
mean_age = df.selectExpr("avg(Age)").collect()[0][0]
df = df.fillna({"Age": mean_age})

# Index the label in alphabetical order so that Survived=0 always maps to 0.0
# and Survived=1 to 1.0. The default ordering is by label frequency, which makes
# the meaning of the predicted index depend on the class balance of the data.
label_indexer = StringIndexer(
    inputCol="Survived",
    outputCol="label",
    stringOrderType="alphabetAsc",
)

# Pack the numeric input columns into the single vector column the classifier reads.
feature_assembler = VectorAssembler(
    inputCols=["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"],
    outputCol="features"
)

In [None]:
# Train and Evaluate the Model
# Fixed seed keeps the 80/20 split reproducible across runs.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Label indexing, feature assembly and the classifier are fitted as one pipeline.
pipeline = Pipeline(stages=[label_indexer, feature_assembler, lr])
model = pipeline.fit(train_df)

# Score the held-out split so the quality of the fitted model is actually reported;
# the split was previously created but never used.
test_predictions = model.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
test_auc = evaluator.evaluate(test_predictions)
print(f"Test AUC: {test_auc:.4f}")

In [None]:
# Save Model to S3
# Destination prefix for the fitted model inside the same bucket as the dataset.
s3_output_path = f"s3a://{s3_bucket}/titanic_model"

# overwrite() replaces anything already stored under the destination path.
model_writer = model.write().overwrite()
model_writer.save(s3_output_path)

print(f"Model saved to {s3_output_path}")

In [None]:
# Load Model from S3 and Perform Predictions
from pyspark.ml import PipelineModel

# The artifact at s3_output_path was written from a fitted Pipeline, so it must be
# read back as a PipelineModel; LogisticRegressionModel.load cannot read it.
model = PipelineModel.load(s3_output_path)

# Use the persisted stages rather than in-memory objects from earlier cells:
# the second-to-last stage is the feature assembler, the last is the classifier.
loaded_assembler = model.stages[-2]
loaded_lr = model.stages[-1]

# Hand-written passengers, with Sex already encoded as 1 = male, 0 = otherwise.
manual_data = spark.createDataFrame([
    (3, 1, 22.0, 1, 0, 7.25),
    (1, 0, 38.0, 1, 0, 71.28)
], ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"])

# These rows have no Survived column, so the label-indexing stage is bypassed and
# only feature assembly and the classifier are applied.
manual_data = loaded_assembler.transform(manual_data)
manual_predictions = loaded_lr.transform(manual_data)
manual_predictions.select("features", "prediction").show()