## 1. Initialization

In [None]:
import numpy as np
import pandas as pd
import sagemaker
from sklearn.model_selection import train_test_split
from sagemaker.pytorch import PyTorch

# S3 key prefix under which this notebook's dataset files are stored.
prefix = "sagemaker/human_or_ai_classification"

# One SageMaker session is shared by every later cell; the default bucket and
# the AWS region are both derived from it.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_session.region_name

# Execution role that is handed to the training job below.
role = sagemaker.get_execution_role()

## 2. Dataset Preparation

In [None]:
# Read only the first two columns of the raw CSV. `header=0` treats the file's
# first row as a header, and `names` replaces it with "text" / "label".
df = pd.read_csv("dataset/raw.csv", header=0, usecols=[0, 1], names=["text", "label"])

# Preview the first rows as the cell output.
df.head()

### Split the Dataset into Training and Test Sets

In [None]:
# 80/20 train/test split. A fixed `random_state` makes the shuffle
# deterministic, so re-running the notebook regenerates the same train.csv /
# test.csv (and therefore uploads the same data) instead of a new random split.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Persist both partitions without the pandas index so the CSVs contain only
# the "text" and "label" columns.
train.to_csv("dataset/train.csv", index=False)
test.to_csv("dataset/test.csv", index=False)

### Upload to S3 Bucket

In [None]:
# Upload the train and test CSVs (in that order) to the session bucket under
# `prefix`; each call returns a value that is later passed to `estimator.fit`.
inputs_train, inputs_test = (
    sagemaker_session.upload_data(local_path, bucket=bucket, key_prefix=prefix)
    for local_path in ("dataset/train.csv", "dataset/test.csv")
)

## 3. Training

#### Model Script

In [None]:
# Display code/model.py (the script used as the estimator's entry point) with
# syntax highlighting; this shell command only prints, it does not run the script.
!pygmentize code/model.py

#### Training Script

Using a pre-built PyTorch container

In [None]:
# Hyperparameters passed through to the entry-point script.
hyperparameters = {"epochs": 1, "num_labels": 2}

# PyTorch estimator: runs code/model.py on a single ml.m5.xlarge instance
# using spot capacity, with the run/wait limits set below.
estimator = PyTorch(
    entry_point="model.py",
    source_dir="code",
    role=role,
    framework_version="1.9",
    py_version="py38",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    hyperparameters=hyperparameters,
    use_spot_instances=True,
    max_run=4000,
    max_wait=5000,
)

# Launch the training job with two named input channels that point at the
# uploaded train and test CSVs.
data_channels = {"training": inputs_train, "testing": inputs_test}
estimator.fit(data_channels)

## 4. Hosting the Model Endpoint

In [None]:
# Deploy the trained model behind an endpoint backed by two ml.m5.xlarge instances.
predictor = estimator.deploy(
    initial_instance_count=2,
    instance_type="ml.m5.xlarge",
)

# Exchange JSON with the endpoint in both directions.
predictor.serializer, predictor.deserializer = (
    sagemaker.serializers.JSONSerializer(),
    sagemaker.deserializers.JSONDeserializer(),
)

## 5. Inference

In [None]:
# Maps a predicted class index to its human-readable label.
class_label = {
    0: "HUMAN",
    1: "AI",
}

In [None]:
# A single-element batch of text to classify; kept as a list so more
# sentences can be appended.
test_example = ["This is an example text written by a human"]

In [None]:
# Send the example batch to the endpoint. The original cell passed the
# undefined name `test_sentences`; the input defined above is `test_example`.
raw_output = predictor.predict(test_example)

# Pick the highest-scoring class index for each input row.
# NOTE(review): assumes the endpoint returns one row of per-class scores per
# input text — confirm against code/model.py.
result = np.argmax(raw_output, axis=1).tolist()

# Translate the first (and here only) prediction into its label.
predicted_label = class_label[result[0]]

# Display the label as the cell output (the name is lower-case; the original
# `Predicted_label` raised NameError).
predicted_label

## 6. Clean Up

In [None]:
# Tear down the hosted endpoint created by `estimator.deploy` once inference
# is finished, so it is not left running.
predictor.delete_endpoint()

## 7. References

- https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html
- https://medium.com/analytics-vidhya/aws-sagemaker-train-deploy-and-update-a-hugging-face-bert-model-eeefc8211368