In [None]:

# Maps each FEVER verdict label to the string class id that the fact-checking
# mode uses as the assistant completion.
fever_label_mapping = dict(
    [
        ("SUPPORTS", "1"),
        ("REFUTES", "0"),
        ("NOT ENOUGH INFO", "-1"),
    ]
)

# Per-mode settings for convert_fever_to_jsonl_:
#   label            - wording used in the progress messages
#   output_dir       - sub-directory of fever_path that holds the run files
#   half             - which half of the dataset is used when segmentation is on
#   write_by_default - whether the training file is written when write_output is None
_FEVER_MODE_SETTINGS = {
    'evidence_extraction': {
        'label': 'evidence extraction',
        'output_dir': 'GPT_sentEx_paper_dev_train',
        'half': 'top',
        'write_by_default': True,
    },
    'fact_checking': {
        'label': 'fact checking',
        'output_dir': 'GPT_clf_paper_dev_train',
        'half': 'bottom',
        'write_by_default': False,
    },
}

# Only these labels become examples; any other mapped label is reported as unexpected.
_FEVER_BALANCED_LABELS = ('SUPPORTS', 'REFUTES')


def _build_fever_prompt(mode, claim, evidence_sentences, full_text, mapped_label):
    """Return (system prompt, user prompt, completion) for one claim in the given mode."""
    ########## CRUCIAL VARIABLES - SUBJECTIVE PROMPTS ##########
    sys_prompt = ""
    if mode == 'evidence_extraction':
        # User message is built from the full text; completion is the evidence sentences.
        user_prompt = f"Extract sentences from the source text that are relevant (either supporting or refuting) to the preceding claim. Return a comma separated list of sentences.\n\nClaim: {claim}\n\nSource Text: {full_text}"
        completion = ",".join(evidence_sentences)
    else:
        # User message is built from the claim and its evidence; completion is the mapped label.
        user_prompt = f"Classify the truthfulness of the text out of the following categories: '1' (if the claim is true), '0' (if the claim is false), given the evidence. Do not use any other labels.\n\nClaim:\n{claim}\n\nEvidence:\n{evidence_sentences}"
        completion = mapped_label
    ############################################################
    return sys_prompt, user_prompt, completion


def _fever_chat_example(sys_prompt, user_prompt, completion):
    """Wrap the three message contents in the chat-format record stored in the JSONL output."""
    return {
        "messages": [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": completion}
        ],
    }


def convert_fever_to_jsonl_(fever_jsonl_filepath, output_jsonl_filepath, wiki_page_dict, mode,
                            data_segmentation=False, train_total=200, valid_total=60,
                            write_output=None):
    """
    Converts a FEVER JSONL file to OpenAI JSONL format, including evidence text.

    Claims without evidence sentences are skipped (and counted). Claims whose label is
    neither SUPPORTS nor REFUTES are reported and left out. Evidence sentences are
    de-duplicated while keeping their first-seen order, so the generated prompts are
    identical from run to run.

    With data_segmentation enabled, the examples are balanced: each split receives at
    most half of its total per label (SUPPORTS gets the floor, REFUTES the remainder).
    The training split is filled before the validation split, and the loop stops as soon
    as both splits are full. Without segmentation every usable claim goes to the training
    split and no limit applies.

    Only the training split is ever written to disk; validation examples are collected
    and counted but not persisted. Relies on the notebook-level names load_jsonl,
    extract_evidence_text_debug, fever_label_mapping, fever_path and date_fmt.

    Args:
        fever_jsonl_filepath (str): Path to the input FEVER JSONL file.
        output_jsonl_filepath (str): File-name stem of the output OpenAI JSONL file; the run
            counter and date_fmt are appended to it.
        wiki_page_dict (dict): Dictionary of loaded Wikipedia pages.
        mode (str): Mode for conversion (i.e., model type; 'evidence_extraction' or 'fact_checking').
        data_segmentation (bool): Whether to segment the data into two halves to ensure no overlap
            between the modes (top half for evidence extraction, bottom half for fact checking)
            and to cap/balance the number of examples (default: False).
        train_total (int): Size of the training split when data_segmentation is enabled (default: 200).
        valid_total (int): Size of the validation split when data_segmentation is enabled (default: 60).
        write_output (bool | None): Whether to write the training split to disk. None keeps the
            per-mode default: written for 'evidence_extraction', not written for 'fact_checking'.

    Raises:
        ValueError: If mode is not one of the supported modes.
    """
    settings = _FEVER_MODE_SETTINGS.get(mode)
    if settings is None:
        # Fail loudly instead of silently producing nothing for a mistyped mode.
        raise ValueError(
            f"Unknown mode: {mode!r}. Expected one of: {', '.join(_FEVER_MODE_SETTINGS)}"
        )
    if write_output is None:
        write_output = settings['write_by_default']

    fever_data_ = load_jsonl(fever_jsonl_filepath)
    print(f"Running conversion for {settings['label']} mode...")

    # The number of files already in the output directory doubles as the run counter.
    # os.path.join keeps the directory identical for listing and writing, whether or not
    # fever_path ends with a separator.
    output_dir = os.path.join(fever_path, settings['output_dir'])
    run_count = len(os.listdir(output_dir))
    print(f"Number of files in {settings['output_dir']}: {run_count}")
    run_count_str = str(run_count).zfill(3)
    output_path = os.path.join(
        output_dir, f"{output_jsonl_filepath}_{run_count_str}_{date_fmt}.jsonl"
    )

    if data_segmentation:
        # Positional split (no sorting or shuffling): the two modes read disjoint halves.
        split_at = (len(fever_data_) - 1) // 2
        if settings['half'] == 'top':
            fever_data = fever_data_[:split_at]
        else:
            fever_data = fever_data_[split_at:]
        # Per-label caps so that neither label can crowd the other out of a split.
        quotas = {
            'SUPPORTS': {'train': train_total // 2, 'valid': valid_total // 2},
            'REFUTES': {
                'train': train_total - train_total // 2,
                'valid': valid_total - valid_total // 2,
            },
        }
        print(f"Segmentation enabled. Using {settings['half']} half of the dataset.")
    else:
        fever_data = fever_data_
        quotas = None  # no caps: every usable claim becomes a training example

    train_jsonl = []
    valid_jsonl = []
    placed = {label: {'train': 0, 'valid': 0} for label in _FEVER_BALANCED_LABELS}
    dropped = {label: 0 for label in _FEVER_BALANCED_LABELS}
    skipped_count = 0

    for fever_item in fever_data:
        # Once both splits are full nothing more can be added, so stop before paying for
        # another evidence extraction.
        if quotas is not None and len(train_jsonl) >= train_total and len(valid_jsonl) >= valid_total:
            break

        claim = fever_item['claim']
        fever_label = fever_item['label']
        mapped_label = fever_label_mapping.get(fever_label)
        if not mapped_label:  # Only process if we have a valid mapping
            print(f"Warning: No mapping found for label: {fever_label}")
            continue

        evidence_sentences, full_text = extract_evidence_text_debug(fever_item, wiki_page_dict)
        # Drop duplicate sentences; dict.fromkeys keeps first-seen order (a set would not).
        evidence_sentences = list(dict.fromkeys(evidence_sentences))
        if not evidence_sentences:
            # We don't want to include empty evidence in the data. If the semantic analysis
            # module finds no sources above the threshold this model would be skipped.
            skipped_count += 1
            continue

        if fever_label not in placed:
            print(f"Warning: Unexpected label: {fever_label}")
            continue

        example = _fever_chat_example(
            *_build_fever_prompt(mode, claim, evidence_sentences, full_text, mapped_label)
        )

        if quotas is None:
            train_jsonl.append(example)
        elif placed[fever_label]['train'] < quotas[fever_label]['train']:
            train_jsonl.append(example)
            placed[fever_label]['train'] += 1
        elif placed[fever_label]['valid'] < quotas[fever_label]['valid']:
            valid_jsonl.append(example)
            placed[fever_label]['valid'] += 1
        else:
            # This label's quota is full in both splits; report once after the loop.
            dropped[fever_label] += 1

    print(f"Length of the {settings['label']} training data: {len(train_jsonl)}")
    print(f"Length of the {settings['label']} validation data: {len(valid_jsonl)}")
    for label, dropped_count in dropped.items():
        if dropped_count:
            print(f"Warning: No space left for {label} in either train or validation data ({dropped_count} claims dropped).")

    if write_output:
        with open(output_path, 'w', encoding='utf-8') as f:
            for data_item in train_jsonl:
                json.dump(data_item, f)
                f.write('\n')

    # Print the number of skipped, true, and false claims
    print(f"Number of claims skipped: {skipped_count}")
    true_count = sum(1 for item in fever_data if item['label'] == 'SUPPORTS')
    false_count = sum(1 for item in fever_data if item['label'] == 'REFUTES')
    print(f"Number of true claims: {true_count}")
    print(f"Number of false claims: {false_count}")
    # Print the number of claims in the original dataset
    print(f"Number of claims in the original dataset: {len(fever_data_)}")
    if write_output:
        print(f"FEVER data conversion to OpenAI JSONL completed. Output file: {output_path}")
    else:
        print(f"FEVER data conversion to OpenAI JSONL completed. Output file not written (write_output is disabled): {output_path}")

# Example usage (assumes fever_path is defined and wiki_page_list_dicts is already loaded).
# The input is the FEVER paper_dev split.
fever_train_filepath = fever_path + 'paper_dev.jsonl'
output_filepath = 'prompt_v1_n200_segmented'
# 'evidence_extraction' for relevant sentence extraction, 'fact_checking' for claim:source relationship classification
mode = 'evidence_extraction'
# Call the converter defined in this cell; its name ends with an underscore.
convert_fever_to_jsonl_(fever_train_filepath, output_filepath, wiki_page_list_dicts, mode, data_segmentation=True)
