In [36]:
import json
import os
import pandas as pd
from pathlib import Path
from text_extraction import file_finder, TextExtraction
from utils import create_data, get_completion_from_messages

In [9]:
def doi_to_article_name(
    doi: str,
    title_doi: str = "../data/titles_doi.csv",
    folder_path: str = "../all_data_articles",
) -> str:
    """
    Resolve a DOI to the file name of the matching article JSON.

    The title belonging to the DOI is looked up in the ``title_doi`` CSV
    (columns ``DOI`` and ``Title``), converted to the expected file name
    (spaces replaced by underscores, ``.json`` appended) and matched against
    the ``.json`` files in ``folder_path``.

    Args:
        doi: DOI, or DOI suffix, of the article. A CSV row matches when its
            ``DOI`` value ends with this string.
        title_doi: Path to the CSV that maps DOIs to titles.
        folder_path: Directory that contains the article JSON files.

    Returns:
        The file name (not the full path) of the matching JSON file.

    Raises:
        IndexError: If no CSV row has a DOI ending with ``doi``.
        FileNotFoundError: If no JSON file in ``folder_path`` matches the title.
    """
    df = pd.read_csv(title_doi)

    # na=False makes rows without a DOI count as "no match" instead of
    # leaving missing values in the boolean mask.
    doi_rows = df[df["DOI"].str.endswith(doi, na=False)]
    if doi_rows.empty:
        # IndexError is what the previous `.values[0]` lookup raised here,
        # so the exception type is kept and only the message is improved.
        raise IndexError(f"No row in {title_doi!r} has a DOI ending with {doi!r}")

    # Expected file name derived from the title of the first matching row.
    title_json = doi_rows["Title"].iloc[0].replace(" ", "_") + ".json"

    # os.listdir returns entries in arbitrary order; sorting makes the
    # prefix fallback below deterministic.
    json_files = sorted(
        filename for filename in os.listdir(folder_path) if filename.endswith(".json")
    )

    # Prefer an exact hit over the loose prefix heuristic.
    if title_json in json_files:
        return title_json

    # Fallback: only the first third of the expected name has to match.
    # NOTE(review): presumably file names on disk are truncated or sanitised
    # relative to the CSV titles - confirm against how the files were written.
    prefix = title_json[: len(title_json) // 3]
    for filename in json_files:
        if filename.startswith(prefix):
            return filename

    raise FileNotFoundError(
        f"No .json file in {folder_path!r} starts with {prefix!r} (DOI {doi!r})"
    )

In [10]:
# Few-shot example tables. The columns read further down in this notebook
# are 'author/s', 'title', 'doi', 'footnote' and 'label'.
neutral_examples = pd.read_csv('../data/few_shot_examples/neutral_examples.csv')
op_examples = pd.read_csv('../data/few_shot_examples/opinionated_examples.csv')

# Evaluation data from the project helper; the keys are materialised as a
# list so later cells can address the entries by position.
data = create_data()
data_keys = [*data.keys()]

In [11]:
# Directory with the article JSON files that are opened when building examples.
path_articles = Path("../all_data_articles")
# Directory with the annotated data (not referenced by the other visible cells).
path_annotations = Path("../data/annotated")

In [13]:
# Sanity check: resolve the DOI of the first neutral example to its article
# file name (the bare expression is displayed as the cell output).
doi_to_article_name(neutral_examples['doi'].iloc[0])

'Royal_Attitudes_to_the_Atlantic_Slave_Trade_and_Abolition_in_the_Late_Eighteenth_and_Early_Nineteenth_Centuries*.json'

In [48]:
def build_examples(examples_df, context_tokens=45):
    """
    Turn a table of labelled citations into a list of example dicts.

    For every row the article JSON is located via ``doi_to_article_name``,
    loaded from ``path_articles``, and the citation context for the row's
    footnote is generated with ``TextExtraction``.

    Args:
        examples_df: DataFrame with the columns 'author/s', 'title', 'doi',
            'footnote' and 'label'.
        context_tokens: Value passed as both ``previous_context_tokens`` and
            ``following_context_tokens``.
            NOTE(review): presumably the number of tokens kept on each side
            of the citation - confirm against TextExtraction.

    Returns:
        One dict per row with the keys 'name', 'title', 'context',
        'footnote' and 'label', in row order.
    """
    built = []
    rows = zip(
        examples_df["author/s"],
        examples_df["title"],
        examples_df["doi"],
        examples_df["footnote"],
        examples_df["label"],
    )
    for name, title, doi, footnote_number, label in rows:
        article_path = path_articles / doi_to_article_name(doi)
        with open(article_path, "r", encoding="utf-8") as file:
            article_dict = json.load(file)

        context = TextExtraction(
            article_dict,
            previous_context_tokens=context_tokens,
            following_context_tokens=context_tokens,
            previous_context_sentences=None,
            following_context_sentences=None,
            previous_whole_paragraph=False,
            following_whole_paragraph=False,
            till_previous_citation=None,
            till_following_citation=None,
            footnote_text=False,
            footnote_mask=True,
        ).generate_context(footnote_number)

        # Footnotes are looked up under the string form of the footnote number.
        footnote = article_dict["footnotes"][str(footnote_number)]
        built.append(
            {"name": name, "title": title, "context": context, "footnote": footnote, "label": label}
        )
    return built


# Neutral examples come first, then the opinionated ones; later cells pick
# entries from this list by position, so the order matters.
examples = build_examples(neutral_examples) + build_examples(op_examples)

In [54]:
# Render three hand-picked entries of `examples` (positions 0, 3 and 1, in
# that order) into one text block using the name/title/context/footnote/label
# layout.
# NOTE(review): `few_shot_examples` is not referenced by the other visible
# cells - confirm whether it was meant to be added to the prompt.
rendered_examples = []
for position in (0, 3, 1):
    shot = examples[position]
    rendered_examples.append(f"""
    name: {shot['name']}
    title: {shot['title']}
    context: {shot['context']}
    footnote: {shot['footnote']}
    label: {shot['label']}
    """)

few_shot_examples = "".join(rendered_examples)

In [67]:
def get_sentiment(name, title, context, footnote):
    """
    Ask the chat model whether a citation judges the quality of the cited work.

    A fixed system message and a user prompt containing the four arguments
    are sent through ``get_completion_from_messages``.

    Args:
        name: Author(s) of the cited source.
        title: Title of the cited source.
        context: Text surrounding the citation.
        footnote: Text of the footnote that belongs to the citation.

    Returns:
        Whatever ``get_completion_from_messages`` returns, unmodified. The
        prompt asks for 1 (statement about quality) or 0 (neutral); in the
        recorded run of this notebook the answers were the strings '0'/'1'.
    """
    # NOTE(review): the prompt text contains typos ("mannser", "convincingy",
    # "integerts"). They are deliberately left untouched here because any
    # change to the prompt can change the model's answers.
    system_prompt = """
    You are an expert in analyzing citations from historical papers. It is your job 
    to determine if the author makes a statement about the quality of the work or just 
    uses it for the purpose of information reproduction.
    """
    user_prompt = f"""
    You will receive the name of the author of the cited source, its title, the context 
    of the citation and its corresponding footnote. 
    The data will be submitted in the following format:
    #######################Begin format instructions####################################
    name: name of authors
    title: title of cited source
    context: The context of the citation
    footnote: The corresponding footnote text of the citation
    #######################End format instructions#################################
    In the context citations are annotated like this: "[Citation footnotenumber]".

    The author makes a statement about the quality of the work if:
    - the author makes a judgemental statement about the quality of a  cited source. 
    - the author rates the quality of the work in a positive or negative mannser etc.
    Keywords of opinionated citations:
    - better, failed, argue, however, convincingy, nuanced, vague, fail, overlook, simplification
    Be very strict when labeling if a statement about the quality of a work is made. Only do so if the criteria match precisely.

    A citation reproduces information if it does not make an explicit statement about the quality of the cited work!

    Look closely at the text in the footnote! It can be the case that the hint if 
    a citation is neutral or opinionated might be located in the footnote text.
    Footnotes often contain multiple citations of different authors. 
    Look at the names of authors from the name field and relate them to the footnote. 
    Only rate the citation that is related to the name of authors in the name field. 

    Return 1 if the author makes a statement about the quality of the work else 0.
    Only return integerts 0 and 1, nothing else.
    #################Begin data####################################
    name: {name}
    title: {title}
    context: {context}
    footnote: {footnote}
    Your answer: Enter integer here
    """
    # System message first, then the user prompt with the filled-in data.
    return get_completion_from_messages(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    )

In [69]:
# Number of leading rows per dataset that are sent to the model; kept small
# because every row costs one completion request.
MAX_ROWS_PER_DATASET = 10

# One list of predictions is printed per entry of `data`, in `data_keys` order.
for key in data_keys:
    sample = data[key][0:MAX_ROWS_PER_DATASET]

    # Iterate the four input columns in lockstep instead of indexing by
    # position; this also avoids reusing the outer loop variable.
    predictions = [
        get_sentiment(name, title, context, footnote)
        for name, title, context, footnote in zip(
            sample["Authors"],
            sample["Title"],
            sample["context"],
            sample["footnote_text"],
        )
    ]
    print(predictions)

['0', '0', '0', '0', '0', '1', '1', '1', '1', '1']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '1', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '1', '1', '1', '1', '1', '1', '1', '1']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
