This code was borrowed from AAI 540 labs

https://github.com/mechristenson/aai-540-labs.git

## Create S3 Bucket

In [1]:
#!pip install --upgrade boto3 botocore awscli

In [2]:
import boto3
import sagemaker

session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

s3 = boto3.Session().client(service_name="s3", region_name=region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
setup_s3_bucket_passed = False

In [4]:
print("Default bucket: {}".format(bucket))

Default bucket: sagemaker-us-east-1-590183687297


Verify S3_BUCKET Bucket Creation

In [5]:
from botocore.client import ClientError

response = None

try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': '22A44Q29S48394A3', 'HostId': 'DilCGa8B3Vbn0czvhWbKof5c+yoq/EqUvkAnya5cC+rSEJISj6CEjbf5Oy/CGGsmRYaR2EKrhFJCStbALHMtUg==', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'DilCGa8B3Vbn0czvhWbKof5c+yoq/EqUvkAnya5cC+rSEJISj6CEjbf5Oy/CGGsmRYaR2EKrhFJCStbALHMtUg==', 'x-amz-request-id': '22A44Q29S48394A3', 'date': 'Wed, 24 Sep 2025 02:27:27 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'BucketRegion': 'us-east-1', 'AccessPointAlias': False}


In [6]:
%store setup_s3_bucket_passed


Stored 'setup_s3_bucket_passed' (bool)


In [7]:
%store


Stored variables and their in-db values:
ingest_create_athena_db_passed             -> True
s3_private_path_csv                        -> 's3://sagemaker-us-east-1-590183687297/spotify_tra
s3_private_path_parquet                    -> 's3://sagemaker-us-east-1-590183687297/toxicity_pd
setup_s3_bucket_passed                     -> True


## Set up Data lake

In [8]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

#### Set S3 Destination Folder

In [9]:
s3_private_path_parquet = "s3://{}/toxicity_pds/parquet".format(bucket)
print(s3_private_path_parquet)

s3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet


In [10]:
%store s3_private_path_parquet

Stored 's3_private_path_parquet' (str)


In [11]:
%store -r setup_s3_bucket_passed

In [12]:
try:
    setup_s3_bucket_passed
except NameError:
    print("+++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.")
    print("+++++++++++++++++++++++++++++++")


In [13]:
if not setup_s3_bucket_passed:
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup S3 Bucket.")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

### Copy data from github to S3 bucket

In [14]:
!aws s3 cp --recursive /home/sagemaker-user/aai540_toxicity_classification/civil $s3_private_path_parquet/ --include "*"


upload: civil/validation-00000-of-00001.parquet to s3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet/validation-00000-of-00001.parquet
upload: civil/test-00000-of-00001.parquet to s3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet/test-00000-of-00001.parquet
upload: civil/train-00000-of-00001.parquet to s3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet/train-00000-of-00001.parquet


### List the files

In [15]:
print(s3_private_path_parquet)

s3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet


In [16]:
!aws s3 ls $s3_private_path_parquet/

2025-09-24 02:27:35   34099179 test-00000-of-00001.parquet
2025-09-24 02:27:35   68844404 train-00000-of-00001.parquet
2025-09-24 02:27:35   11697541 validation-00000-of-00001.parquet


In [17]:
%store

Stored variables and their in-db values:
ingest_create_athena_db_passed             -> True
s3_private_path_csv                        -> 's3://sagemaker-us-east-1-590183687297/spotify_tra
s3_private_path_parquet                    -> 's3://sagemaker-us-east-1-590183687297/toxicity_pd
setup_s3_bucket_passed                     -> True


## Create Athena Database Schema

In [17]:
ingest_create_athena_db_passed = False

In [18]:
%store -r s3_private_path_parquet

In [19]:
try:
    s3_private_path_parquet
except NameError:
    print("*****************************************************************************")
    print("[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************")
    print("[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************")
    print("*****************************************************************************")

In [20]:
print(s3_private_path_parquet)

s3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet


### Import PyAthena

In [21]:
from pyathena import connect

### Create Athena Database

In [22]:
database_name = "aai540_toxicity_aws"

In [23]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [24]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [25]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS aai540_toxicity_aws


In [26]:
import pandas as pd

pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


### Verify The Database Has Been Created Succesfully

In [27]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,aai540_spotify_aws
1,aai540_toxicity_aws
2,default
3,sagemaker_featurestore


In [28]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [29]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


In [30]:
%store

Stored variables and their in-db values:
ingest_create_athena_db_passed             -> True
s3_private_path_csv                        -> 's3://sagemaker-us-east-1-590183687297/spotify_tra
s3_private_path_parquet                    -> 's3://sagemaker-us-east-1-590183687297/toxicity_pd
setup_s3_bucket_passed                     -> True


### Create Tables

In [31]:
ingest_create_athena_table_pqt_passed = False

In [32]:
table_name = 'toxicity_pqt'
train = '%/train-00000-of-00001.parquet'
validation = '%/validation-00000-of-00001.parquet'
test = '%/test-00000-of-00001.parquet'

In [33]:
s3_private_path_parquet

's3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet'

In [34]:
drop_stmt = """DROP TABLE IF EXISTS {}.{}""".format(database_name, table_name)
pd.read_sql(drop_stmt, conn)

  pd.read_sql(drop_stmt, conn)


In [35]:
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
  uid               BIGINT,
  id                BIGINT,
  comment_text      STRING,
  toxicity          BIGINT,
  has_active_attrs  BOOLEAN,
  active_attrs      ARRAY<string>,
  male              BIGINT,
  female            BIGINT,
  lgbtq             BIGINT,
  christian         BIGINT,
  muslim            BIGINT,
  other_religions   BIGINT,
  black             BIGINT,
  white             BIGINT,
  identity_any      BIGINT,
  severe_toxicity   BIGINT,
  obscene           BIGINT,
  threat            BIGINT,
  insult            BIGINT,
  identity_attack   BIGINT,
  sexual_explicit   BIGINT
)
STORED AS PARQUET
LOCATION '{}/'""".format(database_name, table_name, s3_private_path_parquet)

print(statement)
pd.read_sql(statement, conn)

CREATE EXTERNAL TABLE IF NOT EXISTS aai540_toxicity_aws.toxicity_pqt (
  uid               BIGINT,
  id                BIGINT,
  comment_text      STRING,
  toxicity          BIGINT,
  has_active_attrs  BOOLEAN,
  active_attrs      ARRAY<string>,
  male              BIGINT,
  female            BIGINT,
  lgbtq             BIGINT,
  christian         BIGINT,
  muslim            BIGINT,
  other_religions   BIGINT,
  black             BIGINT,
  white             BIGINT,
  identity_any      BIGINT,
  severe_toxicity   BIGINT,
  obscene           BIGINT,
  threat            BIGINT,
  insult            BIGINT,
  identity_attack   BIGINT,
  sexual_explicit   BIGINT
)
STORED AS PARQUET
LOCATION 's3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet/'


  pd.read_sql(statement, conn)


In [36]:
train_view = 'toxicity_train'
statement_view = """CREATE OR REPLACE VIEW {}.{} AS SELECT * FROM {}.{} WHERE "$path" LIKE '{}'""".format(
    database_name,
    train_view,
    database_name,
    table_name,
    train
)
print(statement_view)
pd.read_sql(statement_view, conn)

CREATE OR REPLACE VIEW aai540_toxicity_aws.toxicity_train AS SELECT * FROM aai540_toxicity_aws.toxicity_pqt WHERE "$path" LIKE '%/train-00000-of-00001.parquet'


  pd.read_sql(statement_view, conn)


In [37]:
validation_view = 'toxicity_val'
statement_view = """CREATE OR REPLACE VIEW {}.{} AS SELECT * FROM {}.{} WHERE "$path" LIKE '{}'""".format(
    database_name,
    validation_view,
    database_name,
    table_name,
    validation
)
print(statement_view)
pd.read_sql(statement_view, conn)

CREATE OR REPLACE VIEW aai540_toxicity_aws.toxicity_val AS SELECT * FROM aai540_toxicity_aws.toxicity_pqt WHERE "$path" LIKE '%/validation-00000-of-00001.parquet'


  pd.read_sql(statement_view, conn)


In [38]:
test_view = 'toxicity_test'
statement_view = """CREATE OR REPLACE VIEW {}.{} AS SELECT * FROM {}.{} WHERE "$path" LIKE '{}'""".format(
    database_name,
    test_view,
    database_name,
    table_name,
    test
)
print(statement_view)
pd.read_sql(statement_view, conn)


CREATE OR REPLACE VIEW aai540_toxicity_aws.toxicity_test AS SELECT * FROM aai540_toxicity_aws.toxicity_pqt WHERE "$path" LIKE '%/test-00000-of-00001.parquet'


  pd.read_sql(statement_view, conn)


### Verify The Table Has Been Created Succesfully

In [39]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,toxicity_pqt
1,toxicity_test
2,toxicity_train
3,toxicity_val


In [40]:
if table_name in df_show.values:
    ingest_create_athena_table_pqt_passed = True

print(ingest_create_athena_table_pqt_passed)

True


### Run A Sample SQL Query

In [41]:
statement = """SELECT * FROM {}.{}""".format(
    database_name, table_name
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

SELECT * FROM aai540_toxicity_aws.toxicity_pqt


  df = pd.read_sql(statement, conn)


Unnamed: 0,uid,id,comment_text,toxicity,has_active_attrs,active_attrs,male,female,lgbtq,christian,...,other_religions,black,white,identity_any,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,18,5711489,𝑯𝒆 𝒕𝒐𝒍𝒅 𝒑𝒐𝒍𝒊𝒄𝒆 𝒕𝒉𝒂𝒕 𝒉𝒆 𝒃𝒖𝒚𝒔 𝒄𝒐𝒄𝒂𝒊𝒏𝒆 “𝒊𝒏 𝒍𝒂𝒓𝒈𝒆 ...,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,5295059,为加拿大和中国友谊祝贺。与其想象中国人对加拿大人的看法不如直接去问，不要用自己的想法去猜。本...,0,True,[christian],0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,26,5753025,♫ Mister we could use a man like Neville Chamb...,0,True,[male],1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,47,6100226,…so much for your opinion.,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,54,974066,…as most everyone knows both parties completel...,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
statement = "SHOW VIEWS in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,views
0,toxicity_test
1,toxicity_train
2,toxicity_val


In [45]:
df['toxicity'].value_counts()

toxicity
0    394821
1     50472
Name: count, dtype: int64

## Set up Sagemaker Feature store

In [18]:
from sagemaker.session import Session

region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)

featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

### S3 Bucket set up for offline store

In [19]:
# You can modify the following to use a bucket of your choosing
default_s3_bucket_name = feature_store_session.default_bucket()
print(default_s3_bucket_name)

sagemaker-us-east-1-590183687297


In [20]:
# set up IAM role
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
role = get_execution_role()
print(role)

arn:aws:iam::590183687297:role/LabRole


In [21]:
s3_private_path_parquet

's3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet'

In [22]:
s3_private_path_parquet.replace("s3://", "").split("/", 1)

['sagemaker-us-east-1-590183687297', 'toxicity_pds/parquet']

In [23]:
!aws s3 ls $s3_private_path_parquet/

2025-09-24 02:27:35   34099179 test-00000-of-00001.parquet
2025-09-24 02:27:35   68844404 train-00000-of-00001.parquet
2025-09-24 02:27:35   11697541 validation-00000-of-00001.parquet


In [9]:
import pandas as pd
train = "civil/train-00000-of-00001.parquet"
validation = "civil/validation-00000-of-00001.parquet"
test = "civil/test-00000-of-00001.parquet"

train_data = pd.read_parquet(train)
validation_data = pd.read_parquet(validation)
test_data = pd.read_parquet(test)

print(train_data.shape)
print(validation_data.shape)
print(test_data.shape)


(267516, 21)
(45047, 21)
(132730, 21)


In [24]:
import boto3
import pandas as pd
import io

# Your region & private bucket
s3_client = boto3.client("s3", region_name=region)

# Split the s3 path into bucket and prefix
# Example: if $s3_private_path_csv="my-private-bucket/housing-data"
bucket_name, prefix = s3_private_path_parquet.replace("s3://", "").split("/", 1)

# Define the file keys (relative to prefix)
train_file_key = f"{prefix}/train-00000-of-00001.parquet"
validation_file_key = f"{prefix}/validation-00000-of-00001.parquet"
test_file_key = f"{prefix}/test-00000-of-00001.parquet"


# Load the files
train_data_object = s3_client.get_object(Bucket=bucket_name, Key=train_file_key)
validation_data_object = s3_client.get_object(Bucket=bucket_name, Key=validation_file_key)
test_data_object = s3_client.get_object(Bucket=bucket_name, Key=test_file_key)

# Convert to pandas DataFrames
train_data = pd.read_parquet(io.BytesIO(train_data_object["Body"].read()))
validation_data = pd.read_parquet(io.BytesIO(validation_data_object["Body"].read()))
test_data = pd.read_parquet(io.BytesIO(test_data_object["Body"].read()))


print("Train Data Shape:", train_data.shape)
print("Validation Data Shape:", validation_data.shape)
print("Test Data Shape:", test_data.shape)



Train Data Shape: (267516, 21)
Validation Data Shape: (45047, 21)
Test Data Shape: (132730, 21)


In [11]:
train_data['toxicity'].value_counts()

toxicity
0    237173
1     30343
Name: count, dtype: int64

In [25]:
civil = pd.concat([train_data, validation_data, test_data], axis=0)
civil.shape

(445293, 21)

In [12]:
# Sample 60,000 records where toxicity == 0
civil_tox0 = train_data[train_data['toxicity'] == 0].sample(n=35000, random_state=42)
# Get all records where toxicity == 1
civil_tox1 = train_data[train_data['toxicity'] == 1]
# Concatenate to create a balanced dataframe
train_balanced = pd.concat([civil_tox0, civil_tox1], ignore_index=True)
# Shuffle the resulting dataframe
civil_df = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
civil_df['toxicity'].value_counts()

toxicity
0    35000
1    30343
Name: count, dtype: int64

### Feature Engineering

In [13]:
import re

TAG_RE = re.compile(r'<[^>]+>')

def re_tags(text_list): #define remove tag funtion
    return [TAG_RE.sub('', str(word)).lower() for word in text_list]

In [14]:
# Remove Accented Characters

import unidecode
def re_accented_char(text_list):
    return [unidecode.unidecode(word.encode().decode('utf-8')) for word in text_list]

In [15]:
# Extended contractions
def ex_contractions(text_list):
    result=[]
    for word in text_list:
        # replace contracting withoutsignal
        word = word.replace("wont","won't")
        word = word.replace("cant","can't")
        word = word.replace("its","it's")
        word = word.replace("youre","you're")
        word = word.replace("hes","he's")
        word = word.replace("shes","she's")
        word = word.replace("its","it's")
        word = word.replace("weare","we're")
        word = word.replace("theyre","they're")

        # specific
        word = re.sub(r"won\'t", "will not", str(word))
        word = re.sub(r"can\'t", "can not", str(word))

        # general
        word = re.sub(r"n\'t", " not", str(word))
        word = re.sub(r"\'re", " are", str(word))
        word = re.sub(r"\'s", " is", str(word))
        word = re.sub(r"\'d", " would", str(word))
        word = re.sub(r"\'ll", " will", str(word))
        word = re.sub(r"\'t", " not", str(word))
        word = re.sub(r"\'ve", " have", str(word))
        word = re.sub(r"\'m", " am", str(word))
        result.append(word)
    return result

In [16]:
# Removing Special Characters
def re_special_chars(text_list):
    return [re.sub("[^a-zA-Z0-9]"," ",word) for word in text_list]

#### Lemmatization

In [24]:
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text_list):
    wnl = WordNetLemmatizer()
    lemmatizer_sentence = []  
    tokenizer=nltk.tokenize.WhitespaceTokenizer()
    for word in tokenizer.tokenize(text_list):
        lemmatizer_sentence.append(wnl.lemmatize(word,'v'))
        lemmatizer_sentence.append(" ")
    
    return("".join(lemmatizer_sentence))

#### Removing Stop Words

In [25]:
import nltk
from nltk.corpus import stopwords

def stopwords_text(text_list):
    stop = stopwords.words('english')
    sentence_without = []
    tokenizer=nltk.tokenize.WhitespaceTokenizer()
    for word in tokenizer.tokenize(text_list):
        if word not in stop:
            sentence_without.append(word)
            sentence_without.append(" ")
            
    return("".join(sentence_without))

In [26]:
def re_whitespaces(text_list): 
    result=[]
    for word in text_list:
        word=(re.sub(r'\d','',str(word))) #remove numbers 
        word = (re.sub(r'\s+',' ', str(word))) #remove duplicates white spacces
        result.append(word)
    return result

In [27]:
# Removing blank comments
civil_df = civil_df[civil_df['comment_text']!='']

### Pipeline

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [29]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/karthikvishwanathraghavan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/karthikvishwanathraghavan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:
# Class for regular expressions application
class ApplyRegex(BaseEstimator, TransformerMixin):
    
    def __init__(self, regex_transformers):
        self.regex_transformers = regex_transformers
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        # Applying all regex functions in the regex_transformers dictionary
        for regex_name, regex_function in self.regex_transformers.items():
            X = regex_function(X)
            
        return X

In [31]:
class StopWordsRemoval(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_stopwords):
        self.text_stopwords = text_stopwords
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return [self.text_stopwords(comment) for comment in X]

In [32]:
class LemmatizeProcess(BaseEstimator, TransformerMixin):
    
    def __init__(self, Lemmatize):
        self.Lemmatizer = Lemmatize
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return [self.Lemmatizer(comment) for comment in X]

In [33]:
# Defining regex transformers to be applied
regex_transformers = {
    'remove_tags': re_tags,
    'remove_accents': re_accented_char,
    'decontracted': ex_contractions,
    're_sc': re_special_chars,
    'whitespaces': re_whitespaces
}

# Building a text prep pipeline
text_prep_pipeline = Pipeline([
    ('regex', ApplyRegex(regex_transformers)),
    ('stopwords', StopWordsRemoval(stopwords_text)),
    ('lemmatize', LemmatizeProcess(lemmatize_text)),
])

In [34]:
civil_df

Unnamed: 0,uid,id,comment_text,toxicity,has_active_attrs,active_attrs,male,female,LGBTQ,christian,...,other_religions,black,white,identity_any,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,277992,865178,I'm guessing your a woman and you appear to be...,1,True,"[female, christian]",0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
1,22783,5648086,"Your ""you can't even give us an exact date for...",1,True,"[male, other_religions]",1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,20244,5287400,Yup. Once Alaskans saw what the republican al...,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,315203,5946831,Huh? is that stop and go for an hour on the fr...,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,70481,546025,Well said Sir!,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65338,399335,825293,And how do you propose to do that? Treat Trum...,1,False,[],0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
65339,35674,5969113,"Yet they're all breaking the law, so what's yo...",1,False,[],0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
65340,231173,601614,"Lol, well by your own reasoning, somewhere in ...",0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65341,181638,901829,Pope Francis is an authoritarian and a tyrant....,0,True,"[male, christian]",1,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [47]:
civil_df[civil_df.columns[2:3]]

Unnamed: 0,comment_text
0,"While praising a Jesuit pope and the Jesuit ""t..."
1,That simply proves that lib trolls are on this...
2,The paranoia of the belief Americans are all r...
3,David Brooks the original Israel firster and h...
4,"Iditarod dogs are loved, honored and even give..."
...,...
110467,Like father like son. Trudeau oh so eager to ...
110468,"""I get it.\nTrump is a bad guy.""\n\nNo, you cl..."
110469,"Mike objects to gay sex because it's ""icky"". H..."
110470,Sounds so joyous!\nGreat Mother's Day story.


In [None]:
civil_df['comment_text'] = text_prep_pipeline.fit_transform(civil_df[civil_df.columns[2:3]].values)

In [37]:
civil_val_df = validation_data.copy()
civil_test_df = test_data.copy()

In [38]:
civil_val_df['comment_text'] = text_prep_pipeline.transform(validation_data[validation_data.columns[2:3]].values)
civil_test_df['comment_text'] = text_prep_pipeline.transform(test_data[test_data.columns[2:3]].values)



In [39]:
civil_df

Unnamed: 0,uid,id,comment_text,toxicity,has_active_attrs,active_attrs,male,female,LGBTQ,christian,...,other_religions,black,white,identity_any,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,277992,865178,guess woman appear run away foolish garble pos...,1,True,"[female, christian]",0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
1,22783,5648086,even give us exact date st cyril birth simply ...,1,True,"[male, other_religions]",1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,20244,5287400,yup alaskans saw republican alternative aca lo...,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,315203,5946831,huh stop go hour freeway car stop power get go...,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,70481,546025,well say sir,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65338,399335,825293,propose treat trump like control year old call...,1,False,[],0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
65339,35674,5969113,yet break law point cato globalist whore,1,False,[],0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
65340,231173,601614,lol well reason somewhere penultimate paragrap...,0,False,[],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65341,181638,901829,pope francis authoritarian tyrant could imagin...,0,True,"[male, christian]",1,0,0,1,...,0,0,0,1,0,0,0,0,0,0


#### Define FeatureGroups¶

In [None]:
train_feature_group_name = "train-feature-group"
validation_feature_group_name = "validation-feature-group"
test_feature_group_name = "test-feature-group"

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

train_feature_group = FeatureGroup(name=train_feature_group_name, sagemaker_session=feature_store_session)
validation_feature_group = FeatureGroup(name=validation_feature_group_name, sagemaker_session=feature_store_session)
test_feature_group = FeatureGroup(name=test_feature_group_name, sagemaker_session=feature_store_session)


In [40]:
train_df = civil_df[['id','comment_text','toxicity']]
validation_df = civil_val_df[['id','comment_text','toxicity']]
test_df = civil_test_df[['id','comment_text','toxicity']]

In [56]:
import time

current_time_sec = int(round(time.time()))
record_identifier_feature_name = "id"
event_time_feature_name = "EventTime"

def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")
    return data_frame


def appendEventTime(df):
    # append EventTime feature
    df[event_time_feature_name] = pd.Series(
        [current_time_sec] * len(df), dtype="float64"
    )
    return df

In [59]:
train_df.drop(["event_time_feature_name"], axis=1, inplace=True)
validation_df.drop(["event_time_feature_name"], axis=1, inplace=True)
test_df.drop(["event_time_feature_name"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(["event_time_feature_name"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_df.drop(["event_time_feature_name"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(["event_time_feature_name"], axis=1, inplace=True)


In [57]:
# cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
train_df = cast_object_to_string(train_df)
validation_df = cast_object_to_string(validation_df)
test_df = cast_object_to_string(test_df)

# append event time

train_df = appendEventTime(train_df)
validation_df = appendEventTime(validation_df)
test_df = appendEventTime(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[event_time_feature_name] = pd.Series(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[event_time_feature_name] = pd.Series(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[event_time_feature_name] = pd.Series(


In [None]:
# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
train_feature_group.load_feature_definitions(data_frame=train_df)
validation_feature_group.load_feature_definitions(data_frame=validation_df)
test_feature_group.load_feature_definitions(data_frame=test_df)

[FeatureDefinition(feature_name='id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='toxicity', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='severe_toxicity', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='obscene', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='threat', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='insult', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='identity_attack', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='sexual_explicit', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_nam

In [None]:
def featureGroupExists(feature_group_name):
    print(feature_group_name)
    # SageMaker client (not session)
    sm_client = boto3.client("sagemaker", region_name=region)
    featureGroupExists = False
    try:
        response = sm_client.describe_feature_group(FeatureGroupName=feature_group_name)
        featureGroupExists = True
        print(f"✅ Feature group exists. Status: {response['FeatureGroupStatus']}")
    except sm_client.exceptions.ResourceNotFound:
        featureGroupExists=False
        print(f"❌ {feature_group_name} does not exist.")
    return featureGroupExists

text_feature_group_exists = featureGroupExists(train_feature_group_name)
identity_feature_group_exists = featureGroupExists(validation_feature_group_name)
label_feature_group_exists = featureGroupExists(test_feature_group_name)

text-feature-group
✅ Feature group exists. Status: Created
identity-feature-group
❌ identity-feature-group does not exist.
label-feature-group
❌ label-feature-group does not exist.


In [None]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

def create_feature_group(feature_group, fg_Exists):
    fg_Exists = featureGroupExists(feature_group.name)
    if not fg_Exists:
        feature_group.create(
            s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True,
        )
        wait_for_feature_group_creation_complete(feature_group)
    else:
        print(f"{feature_group.name} already exists")
    

create_feature_group(train_feature_group,text_feature_group_exists)
create_feature_group(validation_feature_group, identity_feature_group_exists)
create_feature_group(test_feature_group,label_feature_group_exists)



text-feature-group
✅ Feature group exists. Status: Created
text-feature-group already exists
identity-feature-group
❌ identity-feature-group does not exist.
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup identity-feature-group successfully created.
label-feature-group
❌ label-feature-group does not exist.
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup label-feature-group successfully created.


In [None]:
train_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:590183687297:feature-group/text-feature-group',
 'FeatureGroupName': 'text-feature-group',
 'RecordIdentifierFeatureName': 'id',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'id', 'FeatureType': 'Integral'},
  {'FeatureName': 'comment_text', 'FeatureType': 'String'},
  {'FeatureName': 'toxicity', 'FeatureType': 'Integral'},
  {'FeatureName': 'EventTime', 'FeatureType': 'Fractional'}],
 'CreationTime': datetime.datetime(2025, 9, 24, 4, 17, 43, 61000, tzinfo=tzlocal()),
 'OnlineStoreConfig': {'EnableOnlineStore': True},
 'OfflineStoreConfig': {'S3StorageConfig': {'S3Uri': 's3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet',
   'ResolvedOutputS3Uri': 's3://sagemaker-us-east-1-590183687297/toxicity_pds/parquet/590183687297/sagemaker/us-east-1/offline-store/text-feature-group-1758687463/data'},
  'DisableGlueTableCreation': False,
  'DataCatalogConfig': {'TableName': 'text_feature_group_1758687463',
 

In [69]:
X_train, y_train = train_data["comment_text"], train_data["toxicity"]
X_val, y_val = validation_data["comment_text"], validation_data["toxicity"]
X_test, y_test = test_data["comment_text"], test_data["toxicity"]

In [78]:
import os, re, sys, json, warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Build pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC())
])

# Train
pipeline.fit(train_data['comment_text'], train_data['toxicity'])

0,1,2
,steps,"[('tfidf', ...), ('svm', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [67]:
# Predict and evaluate
y_pred = pipeline.predict(test_data['comment_text'])
print('Accuracy:', accuracy_score(test_data['toxicity'], y_pred))

Accuracy: 0.9180064793189181


In [74]:
# Build pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Train
pipeline.fit(X_train, y_train)
# Predict and evaluate
y_pred = pipeline.predict(test_data['comment_text'])
print('Accuracy:', accuracy_score(test_data['toxicity'], y_pred))

Accuracy: 0.8872673849167483


In [75]:
# Build pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)
# Predict and evaluate
y_pred = pipeline.predict(test_data['comment_text'])
print('Accuracy:', accuracy_score(test_data['toxicity'], y_pred))

Accuracy: 0.9009643637459505


In [80]:
# Build pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', xgb.XGBClassifier(objective='multi:softmax', num_class=y_train.nunique(), use_label_encoder=False, eval_metric='mlogloss'))
])

# Train
pipeline.fit(X_train, y_train)
# Predict and evaluate
y_pred = pipeline.predict(test_data['comment_text'])
print('Accuracy:', accuracy_score(test_data['toxicity'], y_pred))

Accuracy: 0.9136743765539064


### Release Resources

In [1]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [1]:

%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>