This notebook runs batch inference with a fine-tuned OpenAI gpt-4o-mini model.

In [None]:
!pip3 install openai

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
import numpy as np, pandas as pd
import json
from openai import OpenAI
from google.colab import userdata

In [4]:
class CONFIG:
    """Inference settings shared by the cells below."""
    # Identifier of the fine-tuned gpt-4o-mini model that requests are sent to.
    model = 'ft:gpt-4o-mini-2024-07-18:personal::A7TQ6aWo'
    # Number of input rows that split_text packs into a single batch task.
    lines_per_task = 1

In [5]:
class PATHS:
    """File locations under the mounted Google Drive folder."""
    # Base folder; every other path below is built from it, and generated
    # request/result files are written here as well.
    save = '/content/drive/MyDrive'
    # Labeled TSV files (presumably the few-shot examples and the fine-tuning
    # train/validation splits, judging by the file names -- confirm).
    few_shot = f'{save}/few_shot_labeled.tsv'
    train = f'{save}/ft-train_labeled.tsv'
    valid = f'{save}/ft-valid_labeled.tsv'
    # Unlabeled TSV inputs that are sent through inference further down.
    sw = f'{save}/subreddit_SuicideWatch_900_v2_trimmed.tsv'
    ar = f'{save}/subreddit_abusiverelationships_900_v2_trimmed.tsv'
    arc = f'{save}/subreddit_abusiverelationships_600_v2_comments_trimmed.tsv'
    toxic = f'{save}/jigsaw_toxic_2019_threshold_0.9.tsv'

In [6]:
# Load every TSV with 'id' forced to string so ids are never coerced to
# numbers; the labeling cells later merge predictions back on 'id'.
few_shot_df = pd.read_csv(PATHS.few_shot, sep='\t', dtype={'id': 'string'})
train_df = pd.read_csv(PATHS.train, sep='\t', dtype={'id': 'string'})
valid_df = pd.read_csv(PATHS.valid, sep='\t', dtype={'id': 'string'})
# Train and validation rows stacked into one frame (original indices kept).
all_train_df = pd.concat([train_df, valid_df])
sw_df = pd.read_csv(PATHS.sw, sep='\t', dtype={'id': 'string'})
ar_df = pd.read_csv(PATHS.ar, sep='\t', dtype={'id': 'string'})
arc_df = pd.read_csv(PATHS.arc, sep='\t', dtype={'id': 'string'})
# PATHS defines `toxic` (there is no `toxic_v2` attribute), so use that name;
# the previous `PATHS.toxic_v2` raised AttributeError on a fresh run.
toxic_df = pd.read_csv(PATHS.toxic, sep='\t', dtype={'id': 'string'})

In [7]:
# OpenAI client used by all upload/batch/download cells; the API key is read
# from the Colab userdata entry named "OPENAI_API_KEY" rather than hardcoded.
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

# Define Response Format

In [8]:
# Names of the four binary targets, in the order they appear in the schema.
_BINARY_TARGETS = (
    "self_harm",
    "harming_others",
    "harmed_by_others",
    "reference_to_harm",
)

# Structured-output schema sent with every request: the reply must be an object
# with a string 'id' and one 0/1 integer per target, and nothing else.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "strict": True,
        "name": "harmlabels",
        "schema": {
            "type": "object",
            "properties": {
                "id": {"type": "string"},
                # A fresh sub-schema dict per target (no shared objects).
                **{
                    target: {"type": "integer", "enum": [0, 1]}
                    for target in _BINARY_TARGETS
                },
            },
            "required": ["id", *_BINARY_TARGETS],
            "additionalProperties": False,
        },
    },
}

# Set Prompt

In [9]:
# System message sent with every request (see get_task). It defines the four
# targets by the direction of harm. The text is part of the request, so any
# edit changes what the model receives.
instruction = """You are a data labeler. The targets are 'self_harm', 'harming_others', 'harmed_by_others', and 'reference_to_harm', label each target as 0 or 1.
1. 'self_harm': harm directed from me to me
2. 'harming_others': harm directed from me to other people
3. 'harmed_by_others': harm directed from other people to me
4. 'reference_to_harm': harm directed from other people to other people
"""

In [10]:
# Prefix of the user message; get_task appends the tab-separated rows to label
# directly after it.
action = """Label the following texts. Return 'id', 'self_harm', 'harming_others', 'harmed_by_others', and 'reference_to_harm'. 'id' is the same as input.
"""

In [11]:
# Target column names; get_df uses them as result columns and casts each to int.
labels = ['self_harm', 'harming_others', 'harmed_by_others','reference_to_harm']

# Make Inference

In [12]:
def split_text(filename):
    """Split a TSV file into header-less TSV chunks, one chunk per batch task.

    Args:
        filename: Path of a tab-separated file with a header row.

    Returns:
        A list of ``(task_index, text)`` tuples. ``text`` holds up to
        ``CONFIG.lines_per_task`` consecutive rows serialized as TSV without
        header or index; ``task_index`` counts chunks from 0.
    """
    # Read 'id' as string, like every other load in this notebook. Without it
    # an all-numeric id column is re-typed (e.g. '007' becomes 7), the model
    # echoes the altered id, and the later inner merge on 'id' drops the row.
    df = pd.read_csv(filename, sep='\t', dtype={'id': 'string'})
    step = CONFIG.lines_per_task
    return [
        (
            start // step,
            df.iloc[start:start + step].to_csv(sep='\t', index=False, header=False),
        )
        for start in range(0, len(df), step)
    ]

In [13]:
def get_task(index, text, model):
    """Build one Batch API request line for a chunk of input rows.

    Args:
        index: Task number; becomes the ``custom_id`` ``task_<index>``.
        text: Rows to label, appended to the ``action`` prompt.
        model: Model identifier placed in the request body.

    Returns:
        A dict describing a POST to ``/v1/chat/completions`` with the system
        ``instruction``, the user prompt and the shared ``response_format``.
    """
    system_message = {"role": "system", "content": instruction}
    user_message = {"role": "user", "content": action + text}
    request_body = {
        "model": model,
        "temperature": 0,
        "max_tokens": 256,
        "response_format": response_format,
        "messages": [system_message, user_message],
    }
    return {
        "custom_id": f"task_{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

In [14]:
def make_inference_jsonl(input, output, model):
    """Write the batch request file for one input TSV.

    Args:
        input: Path of the TSV file to split into tasks.
        output: Path of the JSONL file to create (overwritten if present).
        model: Model identifier put into every request.
    """
    # Split first, so a read failure never leaves a truncated output file.
    chunks = split_text(input)
    with open(output, 'w') as jsonl:
        # One JSON-encoded task per line.
        jsonl.writelines(
            json.dumps(get_task(task_index, chunk, model)) + '\n'
            for task_index, chunk in chunks
        )

In [15]:
def submit_batch(jsonl_file):
    """Upload a request file and start a batch job for it.

    Args:
        jsonl_file: Path of the JSONL request file to upload.

    Returns:
        The batch job object returned by ``client.batches.create``.
    """
    # Upload inside a context manager so the file handle is closed once the
    # upload call returns (it was previously left open).
    with open(jsonl_file, 'rb') as request_file:
        batch_file = client.files.create(
            file=request_file,
            purpose="batch",
        )
    # Start the batch job against the chat completions endpoint.
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )
    return batch_job

Submit suicide watch inference

In [None]:
# Build the batch request file for the SuicideWatch data and store it on Drive.
# `inference_jsonl` is reused by every section, so it always points at the most
# recently built request file.
inference_jsonl = f'{PATHS.save}/sw_inference.jsonl'
make_inference_jsonl(PATHS.sw, inference_jsonl, CONFIG.model)

In [None]:
# Upload the request file and start the batch job (re-running submits a new job).
batch_job = submit_batch(inference_jsonl)

In [None]:
# Refresh the job object to see its current state; re-run until it has finished.
# `batch_job` is shared by all sections, so this polls the last submitted job.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [None]:
# Download the batch output and save it as JSONL on Drive.
result_file_id = batch_job.output_file_id
# No output file id means the batch is not finished (or produced no successful
# responses); stop with a clear message instead of passing None to the Files API.
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run the retrieve cell until the batch has completed."
    )
result = client.files.content(result_file_id).content
sw_predicted_jsonl = f'{PATHS.save}/sw_predicted.jsonl'
with open(sw_predicted_jsonl, 'wb') as f:
    f.write(result)

Submit abusive relationship inference

In [None]:
# Build the batch request file for the abusive-relationships data and store it
# on Drive. `inference_jsonl` is reused by every section.
inference_jsonl = f'{PATHS.save}/ar_inference.jsonl'
make_inference_jsonl(PATHS.ar, inference_jsonl, CONFIG.model)

In [None]:
# Upload the request file and start the batch job (re-running submits a new job).
batch_job = submit_batch(inference_jsonl)

In [None]:
# Refresh the job object to see its current state; re-run until it has finished.
# `batch_job` is shared by all sections, so this polls the last submitted job.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [None]:
# Download the batch output and save it as JSONL on Drive.
result_file_id = batch_job.output_file_id
# No output file id means the batch is not finished (or produced no successful
# responses); stop with a clear message instead of passing None to the Files API.
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run the retrieve cell until the batch has completed."
    )
result = client.files.content(result_file_id).content
ar_predicted_jsonl = f'{PATHS.save}/ar_predicted.jsonl'
with open(ar_predicted_jsonl, 'wb') as f:
    f.write(result)

Submit abusive relationship comments inference

In [16]:
# Build the batch request file for the abusive-relationships comments data and
# store it on Drive. `inference_jsonl` is reused by every section.
inference_jsonl = f'{PATHS.save}/arc_inference.jsonl'
make_inference_jsonl(PATHS.arc, inference_jsonl, CONFIG.model)

In [17]:
# Upload the request file and start the batch job (re-running submits a new job).
batch_job = submit_batch(inference_jsonl)

In [None]:
# Refresh the job object to see its current state; re-run until it has finished.
# `batch_job` is shared by all sections, so this polls the last submitted job.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [34]:
# Download the batch output and save it as JSONL on Drive.
result_file_id = batch_job.output_file_id
# No output file id means the batch is not finished (or produced no successful
# responses); stop with a clear message instead of passing None to the Files API.
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run the retrieve cell until the batch has completed."
    )
result = client.files.content(result_file_id).content
arc_predicted_jsonl = f'{PATHS.save}/arc_predicted.jsonl'
with open(arc_predicted_jsonl, 'wb') as f:
    f.write(result)

Submit toxic inference

In [None]:
# Build the batch request file for the toxic-comments data and store it on
# Drive. `inference_jsonl` is reused by every section.
inference_jsonl = f'{PATHS.save}/toxic_inference.jsonl'
make_inference_jsonl(PATHS.toxic, inference_jsonl, CONFIG.model)

In [None]:
# Upload the request file and start the batch job (re-running submits a new job).
batch_job = submit_batch(inference_jsonl)

In [None]:
# Refresh the job object to see its current state; re-run until it has finished.
# `batch_job` is shared by all sections, so this polls the last submitted job.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [None]:
# Download the batch output and save it as JSONL on Drive.
result_file_id = batch_job.output_file_id
# No output file id means the batch is not finished (or produced no successful
# responses); stop with a clear message instead of passing None to the Files API.
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run the retrieve cell until the batch has completed."
    )

result = client.files.content(result_file_id).content
toxic_predicted_jsonl = f'{PATHS.save}/toxic_predicted.jsonl'
with open(toxic_predicted_jsonl, 'wb') as f:
    f.write(result)

# Process Inference Result

In [36]:
def get_df(jsonl):
    """Parse a batch output file into a DataFrame of predicted labels.

    Each non-blank line is a JSON object whose
    ``response.body.choices[0].message.content`` is itself a JSON string
    holding ``id`` and one value per entry of ``labels``.

    Args:
        jsonl: Path of the downloaded batch output file.

    Returns:
        A DataFrame with columns ``['id'] + labels``, one row per parsed line
        in file order, with every label column cast to int.

    Raises:
        KeyError, TypeError, json.JSONDecodeError: If a line does not carry a
            parsable response (not handled here).
        ValueError: If a label is missing from a response, since the missing
            value cannot be cast to int.
    """
    columns = ['id'] + labels
    records = []
    with open(jsonl) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            result = json.loads(line)
            content = result['response']['body']['choices'][0]['message']['content']
            records.append(json.loads(content))
    # Build the frame once from all records instead of appending row by row,
    # which re-allocates the frame on every insert.
    df = pd.DataFrame(records, columns=columns)
    for label in labels:
        df[label] = df[label].astype('int')
    return df

Label suicide watch data

In [None]:
# Parse the predictions, attach them to the source rows by 'id' and save the
# result. The inner join keeps only rows whose id has a prediction.
sw_predicted_df = get_df(sw_predicted_jsonl)
sw_labeled_df = sw_df.merge(sw_predicted_df, on='id', how='inner')
sw_labeled_df.to_csv(f'{PATHS.save}/sw_v2_labeled.tsv', index=False, sep='\t')

Label abusive relationship data

In [None]:
# Parse the predictions, attach them to the source rows by 'id' and save the
# result. The inner join keeps only rows whose id has a prediction.
ar_predicted_df = get_df(ar_predicted_jsonl)
ar_labeled_df = ar_df.merge(ar_predicted_df, on='id', how='inner')
ar_labeled_df.to_csv(f'{PATHS.save}/ar_v2_labeled.tsv', index=False, sep='\t')

Label abusive relationship comments data

In [38]:
# Parse the predictions, attach them to the source rows by 'id' and save the
# result. The inner join keeps only rows whose id has a prediction.
arc_predicted_df = get_df(arc_predicted_jsonl)
arc_labeled_df = arc_df.merge(arc_predicted_df, on='id', how='inner')
arc_labeled_df.to_csv(f'{PATHS.save}/arc_v2_labeled.tsv', index=False, sep='\t')

Label toxic data

In [None]:
# Parse the predictions, attach them to the source rows by 'id' and save the
# result. The inner join keeps only rows whose id has a prediction.
toxic_predicted_df = get_df(toxic_predicted_jsonl)
toxic_labeled_df = toxic_df.merge(toxic_predicted_df, on='id', how='inner')
toxic_labeled_df.to_csv(f'{PATHS.save}/toxic_v2_labeled.tsv', index=False, sep='\t')