### Prepare dataset

In [None]:
from datasets import load_dataset
import numpy as np
# Seed NumPy's global RNG so later np.random calls are repeatable across runs.
np.random.seed(100)

# Load the "google/civil_comments" dataset through the `datasets` library.
ds = load_dataset("google/civil_comments")

In [4]:
# Show the dataset's split/feature summary, then the first training record.
print(ds)
print(ds['train'][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 1804874
    })
    validation: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 97320
    })
    test: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 97320
    })
})
{'text': "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!", 'toxicity': 0.0, 'severe_toxicity': 0.0, 'obscene': 0.0, 'threat': 0.0, 'insult': 0.0, 'identity_attack': 0.0, 'sexual_explicit': 0.0}


In [5]:
# Convert each dataset split into its own pandas DataFrame.
train_df, valid_df, test_df = (
    ds[split_name].to_pandas() for split_name in ('train', 'validation', 'test')
)

# Report (rows, columns) for every split, in train/validation/test order.
for split_df in (train_df, valid_df, test_df):
    print(split_df.shape)

(1804874, 8)
(97320, 8)
(97320, 8)


In [12]:
def random_sample_n(df, n):
    """Return up to ``n`` rows of ``df`` chosen uniformly at random.

    The rows are drawn without replacement using NumPy's global RNG, so a
    prior ``np.random.seed(...)`` call makes the selection reproducible.

    Args:
        df: Source DataFrame. It is not modified.
        n: Number of rows to keep. If ``n`` exceeds ``len(df)`` every row is
            returned (in shuffled order).

    Returns:
        A new DataFrame with a fresh 0..k-1 index and the same columns as
        ``df``.
    """
    # One random position per row: a scalar draw (e.g. ``np.random.randint(0, 30)``)
    # would be broadcast to every row and leave the order unchanged.
    shuffled_positions = np.random.permutation(len(df))

    # Positional indexing with an integer array yields an independent frame,
    # so callers can add columns to the result without touching ``df``.
    return df.iloc[shuffled_positions[:n]].reset_index(drop=True)

# Take a 1000-row subset of the training split and a 300-row subset of the test split.
train_df_sample = random_sample_n(train_df, n=1000)
test_df_sample = random_sample_n(test_df, n=300)

print(train_df_sample.columns)

print("## Row counts")
for sample_df in (train_df_sample, test_df_sample):
    print(sample_df.shape)

Index(['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult',
       'identity_attack', 'sexual_explicit'],
      dtype='object')
## Row counts
(1000, 8)
(300, 8)


In [14]:
# The seven score columns of the dataset (every column except 'text').
label_columns = [
    'toxicity', 'severe_toxicity', 'obscene', 'threat',
    'insult', 'identity_attack', 'sexual_explicit',
]

# Round every score to three decimal places in both samples.
for sample_df in (train_df_sample, test_df_sample):
    sample_df[label_columns] = sample_df[label_columns].round(3)

In [15]:
def row_to_formatted_json(row):
    """Render one row as a fenced, pretty-printed JSON object.

    Args:
        row: A pandas Series; its index labels become the JSON keys.

    Returns:
        The row serialized with a 2-space indent and floats limited to two
        decimal places, wrapped between a "```json" line and a closing "```".
    """
    payload = row.to_json(orient='index', indent=2, double_precision=2)
    return f"```json\n{payload}\n```"

# Build the target 'response' string for every row from its score columns.
for sample_df in (train_df_sample, test_df_sample):
    sample_df['response'] = sample_df[label_columns].apply(row_to_formatted_json, axis=1)

In [16]:
# Prompt template. `{text_input}` is the only placeholder filled by str.format();
# the doubled braces come out as literal `{` / `}`, and because the string is not
# raw, each `\n` escape becomes a real newline.
initial_prompt = '''
Analyze the following text for toxicity and provide scores for the specified categories. Each score must be a float between 0.0 and 1.0. Your response must be only a single, valid JSON object with the specified keys, formatted exactly as shown below, including the enclosing '```json\n...\n```' block.

Text: {text_input}

Output format:
'```json\n{{\n  "toxicity":<float_score>,\n  "severe_toxicity":<float_score>,\n  "obscene":<float_score>,\n  "threat":<float_score>,\n  "insult":<float_score>,\n  "identity_attack":<float_score>,\n  "sexual_explicit":<float_score>\n}}\n```'
'''

# Fill the template with each comment's text to create the per-row prompt.
for sample_df in (train_df_sample, test_df_sample):
    sample_df['prompt'] = sample_df['text'].map(
        lambda comment_text: initial_prompt.format(text_input=comment_text)
    )

In [17]:
from pathlib import Path

# Create the output folder first: to_csv does not create missing parent
# directories, so this cell would otherwise fail on a fresh checkout.
output_dir = Path('./sample_data')
output_dir.mkdir(parents=True, exist_ok=True)

# Full samples (text, scores, response, prompt).
train_df_sample.to_csv(output_dir / 'train_df_sampled.csv', index=False)
test_df_sample.to_csv(output_dir / 'test_df_sampled.csv', index=False)

# Reduced variants holding only the input text and the target response.
train_df_sample[['text', 'response']].to_csv(output_dir / 'train_df_sampled_vapo.csv', index=False)
test_df_sample[['text', 'response']].to_csv(output_dir / 'test_df_sampled_vapo.csv', index=False)

### Save datasets to a GCS bucket

In [22]:
from google.cloud import storage

In [21]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=BuQIh82reVjf7oKzKLYtzhxKZBwABO&access_type=offline&code_challenge=MMW3otBMJGOQJDXALJb0cQ4Zkko6k00rdlIGcWq4wgM&code_challenge_method=S256


Credentials saved to file: [/Users/lexha/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "my-argolis-prj" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


Updates are available for some Google Cloud CLI co

In [None]:
from google.cloud import storage
from pathlib import Path

def upload_jsonl_to_gcs(bucket_name: str, source_file_name: str, destination_blob_name: str) -> bool:
    """
    Uploads a local file (e.g., a .jsonl file) to a Google Cloud Storage bucket.

    Failures are reported with a printed message rather than raised, so the
    notebook keeps running; check the return value to know whether the upload
    actually happened.

    Args:
        bucket_name (str): The name of the GCS bucket (e.g., 'my-data-bucket').
        source_file_name (str): The path to the local file to upload (e.g., 'data/input.jsonl').
        destination_blob_name (str): The desired path/name of the file in the bucket
                                     (e.g., 'inputs/2025/input.jsonl').

    Returns:
        bool: True if the file was uploaded, False if the source is missing
        (or is not a regular file) or the upload raised an error.
    """
    # Require a regular file: a directory passes exists() but cannot be uploaded.
    source_path = Path(source_file_name)
    if not source_path.is_file():
        print(f"Error: Source file not found at {source_file_name}")
        return False

    try:
        # The client picks up Application Default Credentials (ADC) automatically.
        storage_client = storage.Client()

        # Resolve the target bucket and the blob (object) path inside it.
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)

        blob.upload_from_filename(source_file_name)
    except Exception as e:
        # Best-effort by design: report the problem and signal failure to the caller.
        print(f"An error occurred during upload: {e}")
        return False

    print(
        f"File {source_file_name} uploaded successfully to "
        f"gs://{bucket_name}/{destination_blob_name}"
    )
    return True

In [None]:
# Placeholder upload settings; edit these before running the upload.
GCS_BUCKET_NAME = "your-unique-bucket-name"  # <-- Replace with your bucket name
LOCAL_JSONL_PATH = "my_local_data.jsonl"    # <-- Ensure this file exists
GCS_DESTINATION_PATH = "data_uploads/daily_run_20251002.jsonl"  # object path inside the bucket


In [None]:
# 1. Upload parameters
GCS_BUCKET_NAME = "your-unique-bucket-name"  # <-- Replace with your bucket name
LOCAL_JSONL_PATH = "my_local_data.jsonl"    # <-- Ensure this file exists
GCS_DESTINATION_PATH = "data_uploads/daily_run_20251002.jsonl"

# 2. (Optional) Write a two-line dummy JSONL file so there is something to upload
dummy_records = (
    '{"id": 1, "text": "This is line one."}\n',
    '{"id": 2, "text": "This is line two."}\n',
)
try:
    with open(LOCAL_JSONL_PATH, 'w') as dummy_file:
        dummy_file.writelines(dummy_records)
    print(f"Created dummy file: {LOCAL_JSONL_PATH}")
except Exception as err:
    print(f"Could not create dummy file: {err}")


# 3. Push the local file to the bucket
upload_jsonl_to_gcs(
    bucket_name=GCS_BUCKET_NAME,
    source_file_name=LOCAL_JSONL_PATH,
    destination_blob_name=GCS_DESTINATION_PATH,
)