In [None]:

import pandas as pd
import ast  # To safely evaluate the string representation of lists
import math
import os
import random
import nltk
import os
import sys
import datetime
import argparse
import json
import openai
from openai import OpenAI


from google.colab import drive, userdata
# Mount Google Drive so the project folder is reachable from this runtime.
drive.mount('/content/drive')

# Project root on Drive (adjust as needed) and the FEVER dataset folder below it.
BASE_DIR = '/content/drive/MyDrive/SUNY_Poly_DSA598/'
DATA_DIR = os.path.join(BASE_DIR, 'datasets/FEVER/')

# --- NLTK sentence tokenizer resources ---
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# --- OpenAI client ---
# The key is read from Colab's user secrets instead of being hard-coded here.
api_key = userdata.get('openaikey')
client = OpenAI(api_key=api_key)
def create_ft_job(job_name, train_path, val_path):
    """
    Upload the training and validation files and start a supervised fine-tuning job.

    Args:
        job_name: Caller-side label for the job. It is accepted for interface
            compatibility but is not sent to the API by this function.
        train_path: Path of the training file to upload.
        val_path: Path of the validation file to upload.

    Returns:
        The ID of the newly created fine-tuning job.

    Raises:
        OSError: If either file cannot be opened. Both files are opened before
            anything is uploaded, so a bad path does not leave an orphaned upload.
    """
    # Context managers guarantee both handles are closed, even if an upload fails.
    with open(train_path, "rb") as train_handle, open(val_path, "rb") as val_handle:
        # Upload the training file
        training_file = client.files.create(
            file=train_handle,
            purpose="fine-tune",
        )

        # Upload the validation file
        validation_file = client.files.create(
            file=val_handle,
            purpose="fine-tune",
        )

    # Create the fine-tuning job (fixed seed and hyperparameters)
    fine_tuning_job = client.fine_tuning.jobs.create(
        training_file=training_file.id,
        validation_file=validation_file.id,
        model="gpt-4o-mini-2024-07-18",
        seed=2025,
        method={
          "type": "supervised",
          "supervised": {
            "hyperparameters": {
              "n_epochs": 2,
              "batch_size": 2
            }
          }
        }
    )
    return fine_tuning_job.id

def get_ft_job_list():
    """
    Return the listing of fine-tuning jobs as provided by the OpenAI client.
    """
    return client.fine_tuning.jobs.list()

def get_ft_job_status(job_id):
    """
    Retrieve the fine-tuning job identified by ``job_id``.

    The whole job object returned by the API is handed back, not just a
    status field.
    """
    job = client.fine_tuning.jobs.retrieve(job_id)
    return job

def get_ft_job_results(job_id):
    """
    Retrieve the fine-tuning job identified by ``job_id``.

    This performs the same ``jobs.retrieve`` call as ``get_ft_job_status`` and
    returns the full job object; no separate results download happens here.
    """
    return client.fine_tuning.jobs.retrieve(job_id)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
"""
{"messages": [{"role": "system", "content": ""}, {"role": "user", "content": "Extract sentences from the source text that are relevant (either supporting or refuting) to the preceding claim. Return a comma separated list of sentences.\n\nClaim: Fox 2000 Pictures released the film Soul Food.\n\nSource Text: Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall , Gina Ravera and Brandon Hammond . Written and directed by George Tillman , Jr. -- in his major studio debut -- the film centers on the trials of an extended African-American family , held together by longstanding family traditions which begin to fade as serious problems take center stage .   Tillman based the family in the film on his own and Soul Food was widely acclaimed for presenting a more positive image of African-Americans than is typically seen in Hollywood films . In 2000 , Showtime premiered a one-hour television series based upon the film . In 2015 , it was announced that 20th Century Fox is planning a sequel for film called More Soul Food , written by Tillman , Jr. . "}, {"role": "assistant", "content": "Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures ."}]}
"""
# Locate the tabular train/validation splits under the project root, then load them.
train_csv_path = os.path.join(BASE_DIR, "datasets/FEVER/tabular_sets/tabular_sentEx_paper_dev_train/v1_segmented_sentIDs_n3461_04-04_002.csv")
valid_csv_path = os.path.join(BASE_DIR, "datasets/FEVER/tabular_sets/tabular_sentEx_paper_dev_valid/v1_segmented_sentIDs_n1482_04-04_002.csv")
train_data = pd.read_csv(train_csv_path)
valid_data = pd.read_csv(valid_csv_path)
# Format dataset for chat-based fine-tuning
def prepare_chat_finetuning_data(df, max_per_label=100, seed=None):
    """
    Build a label-balanced, shuffled list of chat-format fine-tuning examples.

    Rows are scanned in order. 'NOT ENOUGH INFO' rows are skipped; 'SUPPORTS'
    and 'REFUTES' rows are converted until ``max_per_label`` examples of that
    label have been collected. Scanning stops as soon as both quotas are full.

    Args:
        df: DataFrame with the columns 'label', 'claim', 'full_text' and
            'evidence_items'. 'evidence_items' is the string form of a list
            whose items carry the evidence sentence as their first element.
        max_per_label: Maximum number of examples kept per label (default 100).
        seed: Optional seed for the final shuffle. ``None`` (default) shuffles
            with the global ``random`` state.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts in shuffled
        order. The assistant message is the evidence sentences joined by newlines.

    Raises:
        ValueError: If a row has a label other than the three known ones.
    """
    # NOTE(review): the instruction asks for a comma separated list while the
    # target answer is newline-joined; left unchanged so existing prompts still match.
    instruction = (
        "Extract sentences from the source text that are relevant (either supporting or refuting) "
        "to the preceding claim. Return a comma separated list of sentences."
    )
    collected = {'SUPPORTS': 0, 'REFUTES': 0}
    formatted_data = []

    for _, row in df.iterrows():
        label = row['label']
        if label == 'NOT ENOUGH INFO':
            continue  # Not needed for training the sentence-extraction model
        if label not in collected:
            raise ValueError(f"Unknown label: {label}")
        if collected[label] >= max_per_label:
            continue  # Quota for this label is already full

        # Each evidence item is a list; its first element is the sentence text.
        evidence_sentences = [item[0] for item in ast.literal_eval(row['evidence_items'])]

        # One prompt layout for both labels, so the formatting cannot leak the label.
        formatted_data.append({
            "messages": [
                {"role": "system", "content": ""},
                {"role": "user", "content": instruction + "\n\nClaim: " + row['claim'] + "\n\nSource Text:\n" + row['full_text']},
                {"role": "assistant", "content": "\n".join(evidence_sentences)}
            ]
        })
        collected[label] += 1

        if all(count >= max_per_label for count in collected.values()):
            print(f"Reached {max_per_label} supports and {max_per_label} refutes. Stopping.")
            break

    # Shuffle the data so the two labels are interleaved
    if seed is None:
        random.shuffle(formatted_data)
    else:
        random.Random(seed).shuffle(formatted_data)

    return formatted_data


# Save the data in JSONL format
def jsonize(data, filename):
    """Write each item of ``data`` to ``filename`` as one JSON document per line."""
    with open(filename, 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in data)

# Prepare the training and validation data
# Shuffle the dataframes with a seed (2025) so the same rows are selected on every run
train_data = train_data.sample(frac=1, random_state=2025).reset_index(drop=True)
valid_data = valid_data.sample(frac=1, random_state=2025).reset_index(drop=True)

# prepare_chat_finetuning_data shuffles its result with the global `random`
# module; seed it so the JSONL files come out in the same order on every run.
random.seed(2025)

# Convert both splits to chat-format examples.
# NOTE: from here on train_data / valid_data are lists of dicts, not DataFrames.
train_data = prepare_chat_finetuning_data(train_data)
valid_data = prepare_chat_finetuning_data(valid_data)

TRAIN_SAVE_DIR = os.path.join(DATA_DIR, 'GPT_sets/GPT_sentEx_paper_dev_train/')
VALID_SAVE_DIR = os.path.join(DATA_DIR, 'GPT_sets/GPT_sentEx_paper_dev_valid/')

# Make sure the output folders exist; open(..., 'w') does not create them.
os.makedirs(TRAIN_SAVE_DIR, exist_ok=True)
os.makedirs(VALID_SAVE_DIR, exist_ok=True)

# NOTE(review): the file names advertise n200 / n60, but both splits go through the
# same per-label cap, so the validation file may hold more than 60 examples - confirm.
train_filename = os.path.join(TRAIN_SAVE_DIR, 'prompt_v1_segmented_n200_balanced_04-20_002.jsonl')
valid_filename = os.path.join(VALID_SAVE_DIR, 'prompt_v1_segmented_n60_balanced_04-20_002.jsonl')

# Save the training and validation data
jsonize(train_data, train_filename)
jsonize(valid_data, valid_filename)



["Stanley `` Tookie '' Williams III -LRB- December 29 , 1953 -- December 13 , 2005 -RRB- was an American gang member and convicted murderer , who was part of the West Side Crips , a street gang which has its roots in South Central Los Angeles in 1969 ."]
['Carlos Santana -LRB- born July 20 , 1947 -RRB- is a Mexican and American musician who first became famous in the late 1960s and early 1970s with his band , Santana , which pioneered a fusion of rock and Latin American music .']
['In 2013 she was ranked as the second highest earning actress/models in Israel , behind Bar Refaeli , but ahead of Esti Ginzburg and Shlomit Malka , with the majority of her income coming from acting .']
['Marnie is a 1964 American psychological thriller film directed by Alfred Hitchcock .']
["Introduced in 1996 's A Game of Thrones , Sam is the eldest son of Randyll Tarly , from the fictional kingdom of Westeros ."]
["Introduced in 1996 's A Game of Thrones , Sam is the eldest son of Randyll Tarly , from the

In [None]:
def load_jsonl(file_path, encoding='utf-8'):
    """
    Load a JSON Lines file into a list of Python objects.

    Args:
        file_path: Path of the JSONL file to read.
        encoding: Text encoding used to open the file (default 'utf-8').

    Returns:
        A list with one parsed object per valid line, or ``None`` if the file
        does not exist. Blank lines are ignored silently; lines that are not
        valid JSON are skipped with a printed warning.
    """
    data = []
    try:
        with open(file_path, 'r', encoding=encoding) as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not malformed records,
                # so skip them without emitting a warning.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Warning: Skipping invalid JSON line in {file_path}: {line.strip()}")
    except FileNotFoundError:
        print(f"ERROR: File not found at {file_path}")
        return None  # Kept as None so existing callers that test for it still work
    return data

In [None]:
# Create the fine-tuning job for sentence extraction (sentEx).
# Each run of this cell calls create_ft_job, which uploads both files and
# submits a new fine-tuning job.
sentEx_job_name = "GPT_sentEx_paper_dev_train"
sentEx_train_path = os.path.join(TRAIN_SAVE_DIR, 'prompt_v1_segmented_n200_balanced_04-20_002.jsonl')
sentEx_val_path = os.path.join(VALID_SAVE_DIR, 'prompt_v1_segmented_n60_balanced_04-20_002.jsonl')
sentEx_job_id = create_ft_job(sentEx_job_name, sentEx_train_path, sentEx_val_path)
print(f"Fine-tuning job created with ID: {sentEx_job_id}")



Fine-tuning job created with ID: ftjob-fSiyVbBRg7VQH89NnDj0mBuJ


In [None]:
# Create the fine-tuning job for stance classification (clf).
# Each run of this cell calls create_ft_job, which uploads both files and
# submits a new fine-tuning job.
clf_job_name = "GPT_clf_paper_dev_train"
# Training file: datasets/FEVER/GPT_sets/GPT_clf_paper_dev_train/prompt_v1_segmented_n200_04-19_001.jsonl
clf_train_path = os.path.join(DATA_DIR, 'GPT_sets/GPT_clf_paper_dev_train/prompt_v1_segmented_n200_04-19_001.jsonl')
clf_val_path = os.path.join(DATA_DIR, 'GPT_sets/GPT_clf_paper_dev_valid/prompt_v1_segmented_n60_04-19_001.jsonl')
clf_job_id = create_ft_job(clf_job_name, clf_train_path, clf_val_path)
print(f"Fine-tuning job created with ID: {clf_job_id}")



Fine-tuning job created with ID: ftjob-u5WXXSkEq70O3LdS3RIlIA8u


In [None]:
# Reload the four JSONL sets from disk.
sentEx_train = load_jsonl(sentEx_train_path)
sentEx_val = load_jsonl(sentEx_val_path)
clf_train = load_jsonl(clf_train_path)
clf_val = load_jsonl(clf_val_path)

# Print every field of every message for the first five examples of each
# classification set, with a blank line after each example.
# (The sentence-extraction sets are not previewed here; add sentEx_train and
# sentEx_val to the tuple below to print them the same way.)
for preview_set in (clf_train, clf_val):
    for item in preview_set[:5]:
        for message in item['messages']:
            for entry in message.values():
                print(entry)
        print()

system

user
Given the claim, classify the stance of the potentially relevant evidence out of the following categories: '1' (if the claim is supported by the evidence), '0' (if the claim is refuted by the evidence), '2' (if you do not have enough info to make a confident decision). Respond with a single digit label. Do not use any other labels.

Claim: Riddick is in a science fiction film.

Evidence:
Riddick is a Furyan , a member of a warrior race obliterated by a military campaign that left Furya desolate , and is one of the last of his kind .
Riddick was once a mercenary , then part of a security force , and later a soldier .
Actor Vin Diesel has played the title role in all of the Riddick-based films and video games so far .
Richard B. Riddick , more commonly known as Riddick , is a fictional character and the antihero of four films in the Riddick series -LRB- Pitch Black , The Chronicles of Riddick , the animated movie The Chronicles of Riddick : Dark Fury , and Riddick -RRB- , as