# Churn Prediction using Amazon SageMaker (XGBoost – Script Mode)

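This notebook contains only the steps that worked end-to-end: load and clean the dataset, upload it to S3, write the training script, train on SageMaker, deploy an endpoint, run a sample prediction, and clean up the endpoint.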

## 1. Load & Clean Dataset

In [None]:

import pandas as pd

# Load the raw sheet and drop every row that has at least one missing value.
raw_df = pd.read_excel("E Commerce Dataset.xlsx", sheet_name="E Comm")
clean_df = raw_df.dropna()
assert not clean_df.empty, "dropna() removed every row; nothing left to train on"

# train.py feeds the CSV straight into xgb.DMatrix, which rejects object-dtype
# (text) columns, so one-hot encode any non-numeric features as 0/1 integers.
# A frame whose features are already numeric passes through unchanged.
# NOTE(review): identifier-like columns (if any) are kept as features - confirm
# that is intended.
features = pd.get_dummies(clean_df.drop(columns=["Churn"]), dtype=int)

# train.py treats the LAST column as the label, so append Churn after the
# (possibly expanded) feature columns.
df = features.assign(Churn=clean_df["Churn"])

# No header and no index: train.py reads this file with header=None.
df.to_csv("xgb_train.csv", header=False, index=False)
print("Saved cleaned dataset")


## 2. Upload Dataset to S3

In [None]:

import boto3

# Destination of the training CSV. `bucket` and `key` are reused later when the
# training job's "train" input channel is configured.
bucket = "sparkify-churn-mlproject-apse2"
key = "xgb_train/xgb_train.csv"

# Push the local CSV written by the cleaning step to s3://<bucket>/<key>.
s3 = boto3.client("s3")
s3.upload_file(Filename="xgb_train.csv", Bucket=bucket, Key=key)

print(f"Uploaded to s3://{bucket}/{key}")


## 3. Training Script

In [None]:

%%writefile train.py
import argparse
import pandas as pd
import xgboost as xgb

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, default="/opt/ml/model")
    parser.add_argument("--train", type=str, default="/opt/ml/input/data/train")
    args = parser.parse_args()

    df = pd.read_csv(f"{args.train}/xgb_train.csv", header=None)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    dtrain = xgb.DMatrix(X, label=y)
    params = {
        "objective": "binary:logistic",
        "max_depth": 5,
        "eta": 0.2,
        "eval_metric": "logloss"
    }

    model = xgb.train(params, dtrain, num_boost_round=200)
    model.save_model(f"{args.model_dir}/xgboost-model")


## 4. Train on SageMaker

In [None]:

import sagemaker
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.inputs import TrainingInput

# IAM role of the notebook's execution environment; the training job runs under it.
role = sagemaker.get_execution_role()

# Script-mode estimator that runs train.py.
# instance_count is set explicitly: the training job's resource configuration
# needs both an instance count and an instance type.
# NOTE(review): source_dir="." packages the current directory together with
# train.py - confirm that shipping the local dataset files is acceptable.
xgb = XGBoost(
    entry_point="train.py",
    source_dir=".",
    framework_version="1.3-1",
    py_version="py3",
    instance_count=1,
    instance_type="ml.m5.large",
    role=role,
)

# The "train" channel points at the CSV uploaded to S3 in the previous step.
xgb.fit({
    "train": TrainingInput(
        f"s3://{bucket}/{key}",
        content_type="text/csv"
    )
})


## 5. Deploy Endpoint

In [None]:

from sagemaker.serializers import CSVSerializer

# The prediction step sends CSV text. The XGBoost framework predictor otherwise
# defaults to a LibSVM serializer, which would label that CSV payload as
# text/libsvm, so request CSV serialization explicitly.
predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    serializer=CSVSerializer(),
)
print(predictor.endpoint_name)


## 6. Predict

In [None]:

# Use the first cleaned row, minus its label, as a one-record smoke-test input.
sample = df.drop(columns=["Churn"]).iloc[[0]]

# Same layout as the training file: no header row and no index column.
payload = sample.to_csv(header=False, index=False)

prediction = predictor.predict(payload)
print(prediction)


## 7. Cleanup

In [None]:

# Tear down the endpoint so it stops running once the demo is over.
# delete_endpoint_config=True spells out the SDK default: the endpoint
# configuration is removed together with the endpoint.
predictor.delete_endpoint(delete_endpoint_config=True)
print("Endpoint deleted")
