In [10]:
import pandas as pd
import ast  # To safely evaluate the string representation of lists
import math
import os
import random
import nltk
import os
import sys
import datetime
import argparse
import json
import openai
from openai import OpenAI

from google.colab import drive, userdata
# Mount Google Drive into the Colab filesystem so the dataset paths below resolve.
drive.mount('/content/drive')
# Root folder of the project on the mounted drive. Adjust path as needed.
BASE_DIR = '/content/drive/MyDrive/SUNY_Poly_DSA598/'
# Folder holding the FEVER dataset files; later cells build their save paths from it.
DATA_DIR = os.path.join(BASE_DIR, 'datasets/FEVER/')

# --- Download NLTK sentence tokenizer if needed ---
# NOTE(review): nothing in the visible cells calls nltk tokenizers; confirm these downloads are still required.
nltk.download('punkt')
nltk.download('punkt_tab')

# --- Set up OpenAI API key ---
# The key is read from the Colab secret named 'openaikey' rather than being hardcoded.
api_key = userdata.get('openaikey')

# Module-level OpenAI client shared by the fine-tuning helper functions defined below.
client = OpenAI(api_key=api_key)
def create_ft_job(job_name, train_path, val_path):
    """
    Upload a training and a validation file, then start a supervised fine-tuning job.

    Args:
        job_name: Label chosen by the caller. It is accepted for interface
            compatibility but is not sent to the API by this function.
        train_path: Path to the training JSONL file to upload.
        val_path: Path to the validation JSONL file to upload.

    Returns:
        The ID of the created fine-tuning job.

    Raises:
        OSError: If either local file cannot be opened.
        Any error raised by the OpenAI client is propagated unchanged.
    """
    # Context managers guarantee the local file handles are closed once each
    # upload returns (or fails), instead of leaking them for the kernel's lifetime.
    with open(train_path, "rb") as train_fh:
        training_file = client.files.create(file=train_fh, purpose="fine-tune")

    with open(val_path, "rb") as val_fh:
        validation_file = client.files.create(file=val_fh, purpose="fine-tune")

    # Create the fine-tuning job with a fixed base model, seed and hyperparameters.
    fine_tuning_job = client.fine_tuning.jobs.create(
        training_file=training_file.id,
        validation_file=validation_file.id,
        model="gpt-4o-2024-08-06",
        seed=2025,
        method={
            "type": "supervised",
            "supervised": {
                "hyperparameters": {
                    "n_epochs": 4,
                    "batch_size": 4
                }
            }
        }
    )
    return fine_tuning_job.id

def get_ft_job_list():
    """
    Return the fine-tuning jobs listed by the OpenAI API for the shared client.

    The object returned by the client's list call is passed back untouched.
    """
    return client.fine_tuning.jobs.list()

def get_ft_job_status(job_id):
    """
    Retrieve one fine-tuning job by its ID through the OpenAI API.

    The full job object is returned as retrieved, not only a status field.
    """
    jobs_api = client.fine_tuning.jobs
    return jobs_api.retrieve(job_id)

def get_ft_job_results(job_id):
    """
    Retrieve one fine-tuning job by its ID through the OpenAI API.

    This performs the same retrieve call as get_ft_job_status and returns the
    full job object; no result files are downloaded here.
    """
    jobs_api = client.fine_tuning.jobs
    return jobs_api.retrieve(job_id)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
"""
{"messages": [{"role": "system", "content": ""}, {"role": "user", "content": "Extract sentences from the source text that are relevant (either supporting or refuting) to the preceding claim. Return a comma separated list of sentences.\n\nClaim: Fox 2000 Pictures released the film Soul Food.\n\nSource Text: Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall , Gina Ravera and Brandon Hammond . Written and directed by George Tillman , Jr. -- in his major studio debut -- the film centers on the trials of an extended African-American family , held together by longstanding family traditions which begin to fade as serious problems take center stage .   Tillman based the family in the film on his own and Soul Food was widely acclaimed for presenting a more positive image of African-Americans than is typically seen in Hollywood films . In 2000 , Showtime premiered a one-hour television series based upon the film . In 2015 , it was announced that 20th Century Fox is planning a sequel for film called More Soul Food , written by Tillman , Jr. . "}, {"role": "assistant", "content": "Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures ."}]}
"""
# Load the FEVER train/validation tables from CSV files under BASE_DIR.
# prepare_chat_finetuning_data (below) reads the 'id', 'label', 'claim' and
# 'evidence_items' columns of these frames.
train_data = pd.read_csv(os.path.join(BASE_DIR, "datasets/FEVER/tabular_sets/tabular_sentEx_paper_dev_train/v1_segmented_sentIDs_n3461_04-04_002.csv"))
valid_data = pd.read_csv(os.path.join(BASE_DIR, "datasets/FEVER/tabular_sets/tabular_sentEx_paper_dev_valid/v1_segmented_sentIDs_n1482_04-04_002.csv"))
# Format dataset for chat-based fine-tuning for page title prediction.
# Bracket placeholder tokens that are replaced by literal brackets in page titles.
_BRACKET_TOKENS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LSB-": "[",
    "-RSB-": "]",
    "-LCB-": "{",
    "-RCB-": "}"
}


def _titles_from_evidence(evidence_items):
    """Return the quoted, bracket-restored page titles of evidence_items, deduplicated in first-seen order."""
    quoted_titles = []
    for item in evidence_items:  # item is a list; its second element is the page title
        title = item[1]
        for token, bracket in _BRACKET_TOKENS.items():
            title = title.replace(token, bracket)
        quoted_titles.append(f"'{title}'")
    # dict.fromkeys drops repeats while keeping insertion order; a set would make
    # the order of titles in the training target vary between runs.
    return list(dict.fromkeys(quoted_titles))


def _make_query_example(claim, page_titles):
    """Build one chat-format fine-tuning record (system, user, assistant) for a claim."""
    # NOTE(review): the user prompt asks for lowercase titles, but the assistant
    # target keeps the titles' original casing. Confirm which one is intended.
    return {
        "messages": [
            {"role": "system", "content": "You are an assistant that identifies relevant Wikipedia page titles based on a claim."},
            {"role": "user", "content": f"Given the claim '{claim}', list the most relevant Wikipedia page titles likely to contain evidence. Consider key facts about the claim, such as the type of items mentioned. Respond with only a bracketed list of lowercase page titles with spaces as underscores, each title wrapped in single quotes and separated by a comma."},
            {"role": "assistant", "content": f'[{",".join(page_titles)}]'},
        ]
    }


def prepare_chat_finetuning_data(df, max_per_label=100, seed=None, verbose=True):
    """
    Convert claim rows into chat-format fine-tuning examples for page-title prediction.

    Rows labelled 'NOT ENOUGH INFO' are skipped. For 'SUPPORTS' and 'REFUTES'
    rows, the page titles found in the row's evidence items become the assistant
    target. At most max_per_label examples are kept per label, and iteration
    stops as soon as both labels have reached that cap.

    Args:
        df: DataFrame with 'label', 'claim' and 'evidence_items' columns (and
            'id' when verbose is True). 'evidence_items' holds the string
            representation of a list of lists whose second element is a page title.
        max_per_label: Maximum number of examples kept for each of SUPPORTS and
            REFUTES. Defaults to 100.
        seed: Optional seed for the final shuffle. When None, the global
            random state is used.
        verbose: When True, print the row id, label and evidence for every row
            visited, plus a message when both caps are reached.

    Returns:
        A shuffled list of dicts, each with a "messages" list of system, user
        and assistant entries.

    Raises:
        ValueError: If a row has a label other than SUPPORTS, REFUTES or
            NOT ENOUGH INFO.
    """
    formatted_data = []
    kept_per_label = {"SUPPORTS": 0, "REFUTES": 0}

    for _, row in df.iterrows():
        label = row['label']
        if verbose:
            print(f"On row {row['id']}")
            print(f"Label: {label}")
            print(row['evidence_items'])

        # NEI (negative) samples are not needed for page title relationship modeling.
        if label == 'NOT ENOUGH INFO':
            continue
        if label not in kept_per_label:
            raise ValueError(f"Unknown label: {label}")

        # literal_eval safely parses the stringified list without executing code.
        evidence_items = ast.literal_eval(row['evidence_items'])
        if verbose:
            print(evidence_items)

        if kept_per_label[label] >= max_per_label:
            continue

        page_titles = _titles_from_evidence(evidence_items)
        formatted_data.append(_make_query_example(row['claim'], page_titles))
        kept_per_label[label] += 1

        if all(kept >= max_per_label for kept in kept_per_label.values()):
            if verbose:
                print(f"Reached {max_per_label} supports and refutes. Stopping.")
            break

    # Shuffle the data; a dedicated generator is used when a seed is given so the
    # global random state is left untouched.
    if seed is None:
        random.shuffle(formatted_data)
    else:
        random.Random(seed).shuffle(formatted_data)
    return formatted_data


# Save the data in JSONL format
def jsonize(data, filename):
    """
    Write an iterable of JSON-serializable records to filename in JSON Lines format.

    Each record is serialized with json.dumps and written on its own line.
    The parent directory of filename is created if it does not exist, and the
    file is written as UTF-8 to match load_jsonl's default encoding.

    Args:
        data: Iterable of JSON-serializable objects.
        filename: Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when filename has no directory component
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        for record in data:
            f.write(json.dumps(record) + '\n')

# Prepare the training and validation data
# Shuffle the dataframe rows with a fixed seed (2025) so row order is repeatable.
train_data = train_data.sample(frac=1, random_state=2025).reset_index(drop=True)
valid_data = valid_data.sample(frac=1, random_state=2025).reset_index(drop=True)

# Convert both frames into chat-format examples.
# NOTE: from here on train_data / valid_data are lists of dicts, no longer DataFrames.
# The shuffle inside prepare_chat_finetuning_data uses the global random state,
# which this cell does not seed, so example order may differ between runs.
train_data = prepare_chat_finetuning_data(train_data)
valid_data = prepare_chat_finetuning_data(valid_data)

# Output folders for the page-title query (GPT_query) sets under DATA_DIR.
TRAIN_SAVE_DIR = os.path.join(DATA_DIR, 'GPT_sets/GPT_query_paper_dev_train/')
VALID_SAVE_DIR = os.path.join(DATA_DIR, 'GPT_sets/GPT_query_paper_dev_valid/')

# The same file name is used in both folders.
train_filename = os.path.join(TRAIN_SAVE_DIR, 'prompt_v1_segmented_n200_balanced_04-27_002.jsonl')
valid_filename = os.path.join(VALID_SAVE_DIR, 'prompt_v1_segmented_n200_balanced_04-27_002.jsonl')

# Save the training and validation data as JSONL (overwrites existing files)
jsonize(train_data, train_filename)
jsonize(valid_data, valid_filename)



On row 1751
Label: REFUTES
[["For its output , it might pass the message through unchanged for delivery to the user 's mailbox , redirect the message for delivery elsewhere , or even throw the message away .", 'Email_filtering', 5, []]]
[["For its output , it might pass the message through unchanged for delivery to the user 's mailbox , redirect the message for delivery elsewhere , or even throw the message away .", 'Email_filtering', 5, []]]
On row 708
Label: REFUTES
[["Winfield Scott `` Scotty '' Moore III -LRB- December 27 , 1931 -- June 28 , 2016 -RRB- was an American guitarist and recording engineer .", 'Scotty_Moore', 0, []]]
[["Winfield Scott `` Scotty '' Moore III -LRB- December 27 , 1931 -- June 28 , 2016 -RRB- was an American guitarist and recording engineer .", 'Scotty_Moore', 0, []]]
On row 2247
Label: SUPPORTS
[["Stanley `` Tookie '' Williams III -LRB- December 29 , 1953 -- December 13 , 2005 -RRB- was an American gang member and convicted murderer , who was part of the We

In [12]:
def load_jsonl(file_path, encoding='utf-8'):
    """
    Load a JSON Lines file into a list of Python objects.

    Blank or whitespace-only lines are skipped silently. Lines that are not
    valid JSON are skipped with a printed warning.

    Args:
        file_path: Path of the JSONL file to read.
        encoding: Text encoding used to open the file. Defaults to 'utf-8'.

    Returns:
        A list with one parsed object per valid line, or None if the file does
        not exist (an error message is printed in that case).
    """
    data = []
    try:
        with open(file_path, 'r', encoding=encoding) as f:
            for line in f:
                # Empty lines carry no record; skipping them avoids spurious warnings.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Warning: Skipping invalid JSON line in {file_path}: {line.strip()}")
    except FileNotFoundError:
        print(f"ERROR: File not found at {file_path}")
        return None  # Callers must check for None before iterating
    return data

In [13]:
# Create a fine-tuning job from the GPT_query (page-title prediction) JSONL files
# written by the preparation cell; the paths repeat that cell's file names.
# NOTE(review): query_job_name is passed to create_ft_job, which does not use it.
query_job_name = "GPT_query_paper_dev_train"
query_train_path = os.path.join(TRAIN_SAVE_DIR, 'prompt_v1_segmented_n200_balanced_04-27_002.jsonl')
query_val_path = os.path.join(VALID_SAVE_DIR, 'prompt_v1_segmented_n200_balanced_04-27_002.jsonl')
# This uploads both files and starts a real fine-tuning job each time the cell runs.
query_job_id = create_ft_job(query_job_name, query_train_path, query_val_path)
print(f"Fine-tuning job created with ID: {query_job_id}")

Fine-tuning job created with ID: ftjob-Z6TatIFHOaYBJHZhpDf2538i
