In [111]:
import os
import re
import warnings

import nltk
import openai
import pandas as pd
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from tqdm.auto import tqdm
from transformers import GPT2Tokenizer

In [8]:
# Download NLTK's 'punkt' package, then import the sentence tokenizer that is
# used below to split passages into sentences.
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jamesliounis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import pandas as pd
import json
import numpy as np
from utils import *
from tqdm.auto import tqdm
from transformers import GPT2TokenizerFast
from fuzzywuzzy import fuzz
import pickle
tqdm.pandas()
import math



In [389]:
# Read the labelled training set from disk and preview its first two rows.
train_set_path = "/Users/jamesliounis/Documents/Projects/World Bank/NLP Project/data/train_set.csv"
train_df = pd.read_csv(train_set_path)
train_df.head(2)

Unnamed: 0,Id,section_title,text,pub_title,cleaned_label,dataset_title,dataset_label
0,796f35c1-ba6b-4552-8a7f-5d8b61164fb0,Introduction,"Grasslands provide key services, especially in...",Land cover dynamics influence distribution of ...,north american breeding bird survey bbs|north ...,North American Breeding Bird Survey (BBS)|Nort...,North American Breeding Bird Survey (BBS)|Nort...
1,796f35c1-ba6b-4552-8a7f-5d8b61164fb0,Study area,The study area consisted of the states Oklahom...,Land cover dynamics influence distribution of ...,north american breeding bird survey bbs|north ...,North American Breeding Bird Survey (BBS)|Nort...,North American Breeding Bird Survey (BBS)|Nort...


In [390]:
# Discard rows that have no text. Rebinding the name (instead of inplace=True)
# avoids mutating the frame in place, and passing the column as a
# list also works on pandas versions that do not accept a bare label for `subset`.
train_df = train_df.dropna(subset=['text'])

In [391]:
# Explode every passage into one record per sentence. All other columns are
# carried over unchanged from the parent row; only 'text' is replaced.
rows_list = []

# tqdm renders a progress bar over the source rows
for row_label, record in tqdm(train_df.iterrows(), total=len(train_df)):
    # Convert the row once and reuse it for each of its sentences
    parent_fields = record.to_dict()
    rows_list.extend(
        {**parent_fields, 'text': piece}
        for piece in sent_tokenize(str(record['text']))
    )

# Build the sentence-level DataFrame from the collected records in one go
train_df_split = pd.DataFrame(rows_list)

  0%|          | 0/236628 [00:00<?, ?it/s]

In [12]:
# Show the second sentence of the split data as a quick sanity check
train_df_split['text'].iloc[1]

'Since the late 18th century, major land cover changes, such as grassland conversion to cropland and, elsewhere, woody plant encroachment, have occurred across large portions of the Great Plains in North America.'

# Designing a Human-AI-in-the-Loop solution to rank data

## Using various LLMs (open/closed source) to annotate data

In [25]:
# Module-level GPT-2 tokenizer: loaded a single time here so that the helper
# below does not reload it on every call.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


def limit_string_by_tokens(input_string, max_tokens):
    """
    Cap a string at a maximum number of GPT-2 tokens.

    Args:
    input_string (str): The text to cap.
    max_tokens (int): The largest number of tokens the result may contain.

    Returns:
    str: `input_string` itself when it is within the budget; otherwise the
    text rebuilt from its first `max_tokens` tokens.
    """
    pieces = tokenizer.tokenize(input_string)

    # Within budget: hand the caller's string back untouched
    if len(pieces) <= max_tokens:
        return input_string

    # Over budget: keep the leading tokens and turn them back into text
    return tokenizer.convert_tokens_to_string(pieces[:max_tokens])

In [319]:
### IDENTIFY DATASET IN SENTENCES USING GROQ

# Model identifiers that can be passed to ChatGroq as `model_name`
mixtral = "mixtral-8x7b-32768"
llama = "llama2-70b-4096"

# Chat client used by identify_dataset_with_groq (llama model, temperature=0).
# NOTE(review): `groq_key` is not defined in any visible cell - presumably it comes
# from `from utils import *` or an earlier kernel session; confirm before a fresh run.
model = ChatGroq(model_name=llama, groq_api_key=groq_key, temperature=0)

# NOTE(review): `system_message` is not passed to the model by any visible code - confirm
# whether it is meant to be sent alongside the prompt.
system_message = "You are a helpful research assistant who can only return answers in the form of dictionaries in JSON format."


def extract_dictionary(response_str):
    """
    Extract a dictionary from a string that may contain additional text or
    characters around the dictionary structure.

    The outermost JSON object is decoded first, so nested replies such as
    ``{"result": {...}, "explanation": "..."}`` are returned whole and valid
    JSON is never rewritten. Only when no JSON object can be decoded does the
    function fall back to the lenient path: take the first brace-delimited
    span without nested braces and convert single quotes to double quotes.

    :param response_str: The string from which the dictionary will be extracted.
    :return: The extracted dictionary if successful, None otherwise (a message
             is printed in that case).
    """
    # Non-string input (e.g. None or NaN read back from a file) holds no dictionary.
    if not isinstance(response_str, str):
        print("No dictionary-like string found.")
        return None

    # Pass 1: try to decode a JSON object starting at each '{', left to right.
    # The first '{' that yields a dict wins, which is the outermost object when
    # the reply is valid JSON. strict=False tolerates raw newlines/tabs inside
    # string values.
    decoder = json.JSONDecoder(strict=False)
    brace_at = response_str.find("{")
    while brace_at != -1:
        try:
            candidate, _end = decoder.raw_decode(response_str, brace_at)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        brace_at = response_str.find("{", brace_at + 1)

    # Pass 2 (lenient fallback for Python-style dicts with single quotes):
    # match a span that starts with '{', ends with '}' and has no braces inside.
    dict_str_match = re.search(r"\{[^{}]*\}", response_str)
    if not dict_str_match:
        print("No dictionary-like string found.")
        return None

    # Drop newlines, turn single quotes into double quotes, then restore the
    # possessive "'s" that the quote replacement would otherwise break.
    dict_str = (
        dict_str_match.group(0)
        .replace("\n", "")
        .replace("'", '"')
        .replace('"s', "'s")
    )

    try:
        return json.loads(dict_str)
    except json.JSONDecodeError as e:
        print(f"Error parsing the extracted string as JSON: {e}")
        return None


def _build_dataset_prompt(passage):
    """Render the dataset-annotation prompt for a single passage."""
    # The worked example at the end uses valid JSON (quoted keys, double-quoted
    # strings) so that it agrees with the format the prompt demands.
    return f"""
    Understand the following sentence: "{passage}".
    Your task is to identify if dataset is referenced in the text, assessing its context of mention and use. 
    You must not identify more than one dataset per text.
    You must not identify a dataset if it is not directly mentioned or used in the text. 
    You must always return a dictionary with two keys: one for the dataset analysis and another for the explanation.
    Provide a response strictly adhering to the structured format below without any additional narrative or explanatory text. 
    You may not provide a response if its criteria do not adhere to any of the 3 categories:
    - [0,0] denotes denotes no mention and no active use.
    - [1,0] denotes mention but no active use.
    - [1,1] denotes mention and active use. 

    - If no dataset is directly mentioned:
    {{
        "result": {{"No dataset mentioned": ["0", "0"]}},
        "explanation": "The specific explanation based on the passage content."
    }}

    - If a dataset is mentioned but not actively used:
    {{
        "result": {{"Dataset Name": ["1", "0"]}},
        "explanation": "The specific explanation based on the passage content."
    }}

    - If a dataset is mentioned and actively used:
    {{
        "result": {{"Dataset Name": ["1", "1"]}},
        "explanation": "The specific explanation based on the passage content."
    }}

    For instance:
    - Passage: "Our analysis leverages the GPT-3 dataset for training" should only return: 
    {{
        "result": {{"GPT-3": ["1", "1"]}},
        "explanation": "This passage clearly mentions the GPT-3 dataset and its usage."
    }}

    Additional guidelines:
    - Define a dataset as "actively used" if it is integral to the research, analysis, or results being discussed.
    - You must distinguish dataset names from other entities such as indicators, citations of other academic papers, figures/tables in papers, or appendixes.
    - Make sure to always be coherent in your responses. 


    Please provide the response in the structured dictionary format as illustrated above, focusing solely on populating the 'result' and 'explanation' fields accurately according to the guidelines provided, with no additional text or context.

    """


def identify_dataset_with_groq(passage, max_prompt_tokens=3800):
    """
    Ask the Groq-hosted chat model whether a passage references a dataset.

    Args:
    passage (str): The text passage to analyze.
    max_prompt_tokens (int): Approximate GPT-2 token budget for the whole
        prompt. When the prompt would exceed it, the passage is shortened
        rather than the instructions that follow it.

    Returns:
    str: The raw text of the model's reply, which is expected to contain a
    JSON object with 'result' and 'explanation' keys. The reply is NOT parsed
    here; use extract_dictionary (or json.loads) on it to obtain a dict.
    """
    passage = str(passage)
    text = _build_dataset_prompt(passage)

    # The passage sits at the top of the prompt, so cutting the prompt from the
    # end would drop the format instructions. Shorten the passage instead and
    # leave the instructions intact.
    if len(tokenizer.tokenize(text)) > max_prompt_tokens:
        instruction_tokens = len(tokenizer.tokenize(_build_dataset_prompt("")))
        passage_budget = max(max_prompt_tokens - instruction_tokens, 0)
        text = _build_dataset_prompt(limit_string_by_tokens(passage, passage_budget))

    # Stream the reply and stitch the chunks back into a single string
    response = "".join(chunk.content for chunk in model.stream(text))

    return response


# Smoke test: annotate one arbitrary sentence from the split training data
identify_dataset_with_groq(train_df_split["text"].iloc[10555])


'{\n"result": {"1992 survey": ["1", "0"]},\n"explanation": "The 1992 survey is mentioned in the passage as the last survey affected by the wedging procedure, but it is not actively used in the text."\n}'

In [301]:
def _parse_groq_annotation(model_output):
    """Turn one model reply into ``(has_data, data_used, dataset_name, explanation)``."""
    no_annotation = (False, False, None, None)

    # identify_dataset_with_groq returns the raw reply text, so decode it first.
    if isinstance(model_output, str):
        try:
            model_output = json.loads(model_output)
        except json.JSONDecodeError:
            # Reply has extra text around the JSON: use the lenient extractor.
            model_output = extract_dictionary(model_output)

    # Anything that is not a dict with both expected keys carries no annotation.
    if not isinstance(model_output, dict):
        return no_annotation
    if 'result' not in model_output or 'explanation' not in model_output:
        return no_annotation

    explanation = model_output['explanation']
    result = model_output['result']
    if not isinstance(result, dict) or not result:
        return (False, False, None, explanation)

    # Only the first key/value pair is used: {dataset name: [mentioned, used]}.
    result_key, result_values = next(iter(result.items()))
    if isinstance(result_values, (list, tuple)):
        # Accept both "1" and 1 as the 'true' marker.
        flags = [str(value).strip() == '1' for value in result_values]
    else:
        flags = []

    key_is_meaningful = str(result_key).lower() != 'null' and result_key != ""
    if not key_is_meaningful or len(flags) < 2:
        return (False, False, None, explanation)

    has_data, data_used = flags[0], flags[1]
    return (has_data, data_used, result_key if has_data else None, explanation)


def update_dataframe_with_groq_results(df):
    """
    Annotate a DataFrame with the output of the Groq-hosted chat model.

    For every row, the 'text' value is sent to identify_dataset_with_groq and
    the reply is parsed into four new columns:

    - 'has_data': True when the reply marks a dataset as mentioned.
    - 'data_used': True when the reply marks the dataset as actively used.
    - 'dataset_name': the dataset name when 'has_data' is True, else None.
    - 'explanation': the explanation text from the reply, else None.

    Rows whose reply cannot be parsed keep the defaults (False, False, None, None).

    :param df: The DataFrame to be updated, which must contain a 'text' column.
               It is modified in place.
    :return: The same DataFrame, with the four columns added.
    """
    # Create the result columns up front so that partially processed frames
    # still have a consistent schema if the loop is interrupted.
    df['has_data'] = False
    df['data_used'] = False
    df['dataset_name'] = None
    df['explanation'] = None

    # Write by position so duplicate index labels cannot update the wrong rows.
    column_names = ('has_data', 'data_used', 'dataset_name', 'explanation')
    column_positions = [df.columns.get_loc(name) for name in column_names]

    # 'tqdm' shows a progress bar over the rows.
    for position, (row_label, row) in enumerate(tqdm(df.iterrows(), total=df.shape[0])):
        model_output = identify_dataset_with_groq(row['text'])
        annotation = _parse_groq_annotation(model_output)

        for column_position, value in zip(column_positions, annotation):
            df.iat[position, column_position] = value

    # Return the updated DataFrame.
    return df


In [None]:
# Directory where the generated annotation files are saved. Later cells build
# file paths with `PATH + file_name`, so the trailing slash is required.

PATH = '/Users/jamesliounis/Documents/Projects/World Bank/NLP Project/Documents/GeneratedData/'


In [329]:
# Suppress all warnings
warnings.filterwarnings('ignore')

# Per-sentence annotation function applied to the 'text' column
get_response = identify_dataset_with_groq

# Number of rows annotated per write to the sheet
CHUNK_SIZE = 20

# Load dataset of 90000 rows
df = train_df_split.head(90000)

# Define output path
output_path = PATH + 'annotated_data_90000.xlsx'

# The workbook is saved when the `with` block exits
with pd.ExcelWriter(output_path) as writer:
    # Process the DataFrame in chunks
    for start in tqdm(range(0, len(df), CHUNK_SIZE)):
        # .copy() so that adding a column does not write into a slice of `df`
        chunk = df.iloc[start:start + CHUNK_SIZE].copy()

        # Annotate every sentence of the chunk and store the raw reply in 'response'
        chunk['response'] = chunk['text'].apply(get_response)

        # Only the first chunk writes the header. The header occupies sheet
        # row 0, so data row i belongs at sheet row i + 1; later chunks must
        # start at `start + 1` or they overwrite the last row of the first chunk.
        is_first_chunk = start == 0
        chunk.to_excel(
            writer,
            sheet_name='Sheet1',
            startrow=0 if is_first_chunk else start + 1,
            index=False,
            header=is_first_chunk,
        )


  0%|          | 0/4500 [00:00<?, ?it/s]

In [362]:
# First issue with the API calls: very little data was actually populated.
# Reload the saved workbook and count the missing values per column.

train_90000 = pd.read_excel(PATH + 'annotated_data_90000.xlsx')
train_90000.isna().sum()

Id                   0
section_title     3304
text                 3
pub_title            0
cleaned_label        0
dataset_title        0
dataset_label        0
response         73348
dtype: int64

In [363]:
# Keep only the rows for which a response was recorded

train_9000_correctly_annotated = train_90000[train_90000['response'].notna()]

In [364]:
# Rows whose 'response' is still missing. An explicit copy is taken because a
# later cell adds/overwrites a column on this frame; without it pandas treats
# the assignment as a write to a slice of `train_90000`.

df_missing_annotation = train_90000[train_90000['response'].isna()].copy()
df_missing_annotation.shape

(73348, 8)

In [372]:
## Annotating missing data: 73348 rows

file_name = 'annotated_data_73348.csv'

# Per-sentence annotation function applied to the 'text' column
get_response = identify_dataset_with_groq

# Register progress_apply so the run shows a tqdm progress bar
tqdm.pandas(desc="Processing Responses")

# assign() builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' chained-assignment (SettingWithCopy) problem.
df_missing_annotation = df_missing_annotation.assign(
    response=df_missing_annotation['text'].progress_apply(get_response)
)

# Define the output path for the CSV
output_csv_path = PATH + file_name

# Write the DataFrame to a CSV file
df_missing_annotation.to_csv(output_csv_path, index=False)

Processing Responses:   0%|          | 0/73348 [00:00<?, ?it/s]

In [387]:
## Annotating another 200000 rows

file_name = 'annotated_data_200000.json'

# Last 200000 sentences of the split data. The explicit copy makes the column
# assignment below write to an independent frame, not a slice of train_df_split.
# NOTE(review): whether this overlaps the first 90000 rows annotated earlier
# depends on len(train_df_split) - confirm.
train_200000 = train_df_split.tail(200000).copy()

# Per-sentence annotation function applied to the 'text' column
get_response = identify_dataset_with_groq

# Register progress_apply so the run shows a tqdm progress bar
tqdm.pandas(desc="Processing Responses")
train_200000['response'] = train_200000['text'].progress_apply(get_response)

# Define the output path for the JSON Lines file
output_path = PATH + file_name

# Write the DataFrame as JSON Lines (one record per line)
train_200000.to_json(output_path, orient='records', lines=True)

Processing Responses:   0%|          | 0/200000 [00:00<?, ?it/s]

In [None]:
# Close the Excel writer
# NOTE(review): the only visible `writer` is created by a
# `with pd.ExcelWriter(output_path) as writer:` block, which presumably closes
# it on exit, so this call looks redundant - confirm and remove if so.
writer.close()