# Overview

This notebook is used for determining the length of human and GPT-generated BLURB dataset labels.

# Env Setup

In [None]:
import csv
import json
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import os
from google.colab import drive
import time
import ast

In [None]:
# Mount Google Drive at /content/drive so the project files are reachable from
# the Colab filesystem; force_remount=True asks for a fresh mount even when
# the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/.shortcut-targets-by-id/1vdEcgdXIfpnlORVlPsJtHUmKXSAqr69R/6.8611 Research Project/Colab Notebooks


In [None]:
ls

 BC5CDR-D_devel_1.csv                               'Intrinsic Eval Precision.ipynb'
 BC5CDR-D_devel_2.csv                                Intrinsic_exact_match.ipynb
 Data-cleaning.ipynb                                 [0m[01;34mllm-annotations[0m/
 [01;34mdevel_gpt_generated_datasets[0m/                      ' NER with BERT.ipynb'
 Entity_Intrinsic_Eval_with_aligned_datasets.ipynb   openai-test.ipynb
 Fine-Tuning-Few-Shot.ipynb                          retry_prompts.gsheet
 Fine-Tuning-Human-Annotated.ipynb                   RW-Fine-Tuning-Human-Annotated.ipynb
 Fine-Tuning-One-Shot.ipynb                          [01;34msft_training_data[0m/
 Fine-Tuning-Zero-Shot.ipynb                         TEST_LABEL_BUGS.ipynb
 GPT-Finetuning.ipynb                                tokens_labels.csv
 Intrinsic_approx_match.ipynb                        Untitled
 [01;34mintrinsic_data[0m/                                     zero-shot-bc5cdr-chem.pynb
 Intrinsic_data_clean.ipynb               

# Data Preprocessing

In [None]:
# load the datasets into dataframes

def load_tsv_dataset(file_path):
  """
  Read a headerless, tab-separated file into a two-column dataframe.

  file_path: path (or file-like object) passed straight to pd.read_csv.

  The file is read with header=None, so its first line is kept as a data row.
  The two columns are then named 'token' and 'label'; assigning the names
  fails if the file did not parse to exactly two columns.
  pandas' default missing-value parsing is left on, so some cells may load
  as NaN.

  Returns the resulting pandas dataframe.
  """
  column_names = ['token', 'label']
  dataset_df = pd.read_csv(file_path, sep='\t', header=None, engine='python')
  dataset_df.columns = column_names
  return dataset_df


In [None]:
# load the datasets into dataframes

def load_csv_dataset(file_path):
  """
  Read a headerless, comma-separated file into a two-column dataframe.

  file_path: path (or file-like object) passed straight to pd.read_csv.

  The file is read with header=None, so its first line is kept as a data row.
  The two columns are then named 'token' and 'label'; assigning the names
  fails if the file did not parse to exactly two columns.
  pandas' default missing-value parsing is left on, so some cells may load
  as NaN.

  Returns the resulting pandas dataframe.
  """
  column_names = ['token', 'label']
  dataset_df = pd.read_csv(file_path, header=None, engine='python')
  dataset_df.columns = column_names
  return dataset_df


In [None]:
def split_by_sentence(list_of_strings):
  """
  Group a flat sequence of tokens into space-joined sentence strings.

  list_of_strings: iterable of tokens. Non-string tokens (e.g. NaN floats) are
    kept and converted with str() when joined, but never end a sentence.

  A sentence ends at every string token that ends with '.'. Tokens left over
  after the last such token are emitted as one final sentence rather than
  being dropped, so every input token appears in exactly one sentence.

  Prints the number of sentences and returns the sentences as a list of str.
  """
  sentences = []
  current_sentence = []

  for word in list_of_strings:
    current_sentence.append(word)
    if isinstance(word, str) and word.endswith('.'):
      sentences.append(' '.join(map(str, current_sentence)))
      current_sentence = []

  # Flush the tail: input that does not finish on a '.' token would otherwise
  # lose its last tokens, and downstream token counts would no longer add up.
  if current_sentence:
    sentences.append(' '.join(map(str, current_sentence)))

  print("\nNumber of sentences: ", len(sentences))
  return sentences

In [None]:
def get_filtered_entities(df, target_label):
  """
  Count the meaningful tokens that carry a given label.

  df (pandas dataframe): has two columns 'token' and 'label'
  target_label: label value to select, e.g. 'B', 'I', or 'O'

  A token is dropped when it is not a string (e.g. a NaN from a blank cell),
  or when it consists only of punctuation, only of digits, or of at most one
  ASCII letter.

  Returns a collections.Counter mapping each kept token to its frequency
  among the rows labelled 'target_label'.
  """
  tokens_with_label = df.loc[df['label'] == target_label, 'token'].tolist()

  # regex for filtering out nonsense strings: three negative lookaheads reject
  # single letters / empty strings, pure numbers, and pure punctuation
  punctuation = re.escape(string.punctuation)
  pattern = re.compile(rf'^(?![a-zA-Z]?$)(?!\d+$)(?!^[{punctuation}]+$).+')

  # isinstance guard: pattern.match() raises TypeError on non-string tokens
  return Counter(
      token for token in tokens_with_label
      if isinstance(token, str) and pattern.match(token)
  )

# Intrinsic Eval


In [None]:
def load_dfs(dataset):
  """
  Load the human-annotated devel split and the three GPT-generated variants
  (zero-, one- and few-shot) of one dataset.

  dataset: dataset name used to build the file paths, e.g. 'NCBI'.

  Paths are relative to the current working directory. Prints a header for the
  dataset and the row count of each dataframe.

  Returns (devel_df, zero_shot_df, one_shot_df, few_shot_df).
  """
  print('\nFor ', dataset, ": \n")

  # GPT-generated files are CSV; the human-annotated file is TSV.
  zero_shot_df = load_csv_dataset(f'devel_gpt_generated_datasets/zero_shot/{dataset}-devel.csv')
  one_shot_df = load_csv_dataset(f'devel_gpt_generated_datasets/one_shot/{dataset}-devel.csv')
  few_shot_df = load_csv_dataset(f'devel_gpt_generated_datasets/few_shot/{dataset}-devel.csv')
  devel_df = load_tsv_dataset(f'llm-annotations/datasets/{dataset}/devel.tsv')

  print(
      'Zero Shot Length: ', len(zero_shot_df),
      '\nOne Shot Length: ', len(one_shot_df),
      '\nFew Shot Length: ', len(few_shot_df),
      '\nTrue Length: ', len(devel_df),
  )

  return devel_df, zero_shot_df, one_shot_df, few_shot_df

In [None]:
# Ad-hoc load of the BC5CDR-disease one-shot file. Both names are module-level;
# `one_shot` is rebound later by the per-dataset loop.
# (Plain string: the path has no placeholders, so no f-prefix is needed.)
one_shot = 'devel_gpt_generated_datasets/one_shot/BC5CDR-disease-devel.csv'
one_shot_df = load_csv_dataset(one_shot)

In [None]:
def get_chunks(all_tokens, CHUNK_SIZE = 300):
  """
  Pack the sentences of a token list into chunks of at least CHUNK_SIZE
  characters.

  all_tokens: list of tokens. NaN float entries are overwritten IN PLACE with
    the string "null", so the caller's list is modified.
  CHUNK_SIZE: a chunk is closed as soon as the summed character length of its
    sentences reaches this value.

  Returns three parallel lists with one entry per chunk:
    SENTENCE_CHUNKS: chunk texts (the chunk's sentences joined by a space)
    sentence_chunk_len: number of sentences in each chunk
    word_chunk_len: summed len() of each chunk's sentence strings. Despite
      the name this is a character count, not a word count.
  The last chunk may be shorter than CHUNK_SIZE. No empty chunk is emitted, so
  an input without sentences yields three empty lists.

  Prints the number of NaN tokens, the sentence counts of the first 10 chunks
  and the total number of chunks.
  """
  nans = 0
  for idx, token in enumerate(all_tokens):
    if isinstance(token, float) and str(token) == "nan":
      nans += 1
      # presumably literal tokens such as 'null' that pandas parsed as
      # missing values - TODO confirm against the raw files
      all_tokens[idx] = "null"
  print('Number of nans: ', nans)

  sentences = split_by_sentence(all_tokens)

  SENTENCE_CHUNKS = []
  sentence_chunk_len = []
  word_chunk_len = []

  curr_chunk, curr_chunk_len = [], 0
  for sent in sentences:
    curr_chunk.append(sent)
    curr_chunk_len += len(sent)
    if curr_chunk_len >= CHUNK_SIZE:
      SENTENCE_CHUNKS.append(' '.join(curr_chunk))
      sentence_chunk_len.append(len(curr_chunk))
      word_chunk_len.append(curr_chunk_len)
      curr_chunk, curr_chunk_len = [], 0

  # Trailing partial chunk: keep it only when it has content, and record its
  # lengths too so the three returned lists stay index-aligned.
  if curr_chunk:
    SENTENCE_CHUNKS.append(' '.join(curr_chunk))
    sentence_chunk_len.append(len(curr_chunk))
    word_chunk_len.append(curr_chunk_len)

  print('Number of Sentences in First 10 Chunks: ', sentence_chunk_len[:10])
  print('Number of Chunks: ', len(SENTENCE_CHUNKS))
  return SENTENCE_CHUNKS, sentence_chunk_len, word_chunk_len

In [None]:
def data_label_dict(dataset_df, name):
  """
  Chunk a token/label dataframe and build one token->label dict per chunk.

  dataset_df (pandas dataframe): has two columns 'token' and 'label'
  name: display name, only used in the printed header

  The tokens are chunked with get_chunks. Each chunk is then matched to the
  next len(chunk.split()) rows of dataset_df, in order. For every chunk a dict
  is built whose keys are '<token>_<position within the chunk>' and whose
  values are the rows' labels. The chunk list is walked exactly once, so
  sentence_labels is index-aligned with sentence_chunks; rows not covered by
  any chunk are left out.

  Returns (all_tokens, sentence_chunks, sentence_labels, sentence_chunk_len,
  word_chunk_len). all_tokens is the list handed to get_chunks, which replaces
  NaN entries in it with "null"; the dict keys are built from the dataframe's
  own values instead.
  """
  print('\nFor', name, ':')
  all_tokens = dataset_df['token'].tolist()
  # Snapshot the row values before get_chunks mutates all_tokens in place.
  row_tokens = list(all_tokens)
  row_labels = dataset_df['label'].tolist()

  CHUNK_SIZE = 300 # string length of the chunk
  sentence_chunks, sentence_chunk_len, word_chunk_len = get_chunks(all_tokens, CHUNK_SIZE)

  sentence_labels = []
  start = 0
  for chunk in sentence_chunks:
    stop = start + len(chunk.split())
    keys = [
        str(token) + '_' + str(position)
        for position, token in enumerate(row_tokens[start:stop])
    ]
    sentence_labels.append(dict(zip(keys, row_labels[start:stop])))
    start = stop

  return all_tokens, sentence_chunks, sentence_labels, sentence_chunk_len, word_chunk_len


In [None]:
def chunk_length_analysis(length_type, devel_chunk_lengths, zero_chunk_lengths, one_chunk_lengths, few_chunk_lengths):
  """
  Report where the GPT-generated chunk lengths first diverge from the human
  (devel) chunk lengths.

  length_type: label inserted in the printed message, e.g. 'sentences'
  devel_chunk_lengths: per-chunk lengths of the human-annotated dataset
  zero_chunk_lengths, one_chunk_lengths, few_chunk_lengths: per-chunk lengths
    of the zero-, one- and few-shot datasets

  For each generated list, prints the first chunk index at which it differs
  from devel_chunk_lengths. A generated list that ends before the devel list
  counts as diverging at its own length. Nothing is printed for a list that
  matches on every devel index. Returns None.
  """
  # (name used in the message, lengths) - still matching devel so far.
  # Checked in this order so messages for the same index keep a stable order.
  undecided = [
      ('ZERO', zero_chunk_lengths),
      ('ONE', one_chunk_lengths),
      ('FEW', few_chunk_lengths),
  ]

  for i, devel_length in enumerate(devel_chunk_lengths):
    if not undecided:
      break  # every variant has already diverged

    still_identical = []
    for shot_name, shot_lengths in undecided:
      if i < len(shot_lengths) and shot_lengths[i] == devel_length:
        still_identical.append((shot_name, shot_lengths))
      else:
        print('Number of ', length_type, f' in devel and {shot_name} shot chunks are identical until chunk number: ', i)
    undecided = still_identical

In [None]:
# Run the chunk-length comparison for every dataset. The names bound inside the
# loop are module-level, so after the loop they hold the values of the last
# dataset that was processed.
# NOTE(review): the saved output of this cell stops with a TypeError after the
# NCBI few-shot pass; re-run on a fresh kernel to confirm whether it still occurs.
for dataset in ['NCBI', 'JNLPBA', 'BC5CDR-chem', 'BC5CDR-disease', 'BC2GM']:
  # Human-annotated devel split plus the zero-/one-/few-shot GPT variants.
  devel, zero_shot, one_shot, few_shot = load_dfs(dataset)

  # for df, name in [[devel, 'devel'], [zero_shot, 'zero_shot'], [one_shot, 'one_shot']]:
  #   print(name, 'has ', get_filtered_entities(df, 'B'), ' entities labeled B, ', get_filtered_entities(df, 'I'), ' entities labeled I, and', get_filtered_entities(df, 'O'), ' entities labeled O')


  # Each call returns (tokens, chunk texts, per-chunk token->label dicts,
  # sentence counts per chunk, summed sentence string lengths per chunk).
  devel_tokens, devel_chunks, devel_labels, devel_chunk_s_lengths, devel_chunk_w_lengths = data_label_dict(devel, dataset + '_human')
  zero_tokens, zero_chunks, zero_labels, zero_chunk_s_lengths, zero_chunk_w_lengths = data_label_dict(zero_shot, dataset + '_zeroshot')
  one_tokens, one_chunks, one_labels, one_chunk_s_lengths, one_chunk_w_lengths = data_label_dict(one_shot, dataset + '_oneshot')
  few_tokens, few_chunks, few_labels, few_chunk_s_lengths, few_chunk_w_lengths = data_label_dict(few_shot, dataset + '_fewshot')

  # Report the first chunk index at which the generated datasets' lengths
  # diverge from the human ones. The '*_w_lengths' lists are labelled 'words'
  # here although get_chunks accumulates them with len() on sentence strings.
  chunk_length_analysis('sentences', devel_chunk_s_lengths, zero_chunk_s_lengths, one_chunk_s_lengths, few_chunk_s_lengths)
  chunk_length_analysis('words', devel_chunk_w_lengths, zero_chunk_w_lengths, one_chunk_w_lengths, few_chunk_w_lengths)





For  NCBI : 

Zero Shot Length:  23880 
One Shot Length:  23924 
Few Shot Length:  23916 
True Length:  23965

For NCBI_human :
Number of nans:  9

Number of sentences:  1027
Number of Sentences in First 10 Chunks:  [4, 3, 5, 3, 3, 4, 3, 3, 4, 3]
Number of Chunks:  348

For NCBI_zeroshot :
Number of nans:  10

Number of sentences:  933
Number of Sentences in First 10 Chunks:  [4, 3, 4, 3, 3, 3, 2, 3, 4, 3]
Number of Chunks:  350

For NCBI_oneshot :
Number of nans:  9

Number of sentences:  963
Number of Sentences in First 10 Chunks:  [4, 3, 4, 3, 3, 4, 3, 3, 4, 3]
Number of Chunks:  349

For NCBI_fewshot :
Number of nans:  9

Number of sentences:  957
Number of Sentences in First 10 Chunks:  [4, 3, 4, 3, 3, 4, 2, 3, 4, 3]
Number of Chunks:  346


TypeError: ignored