In [1]:
%pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import openai
import os
import pandas as pd
import json
import re

In [3]:
filepath = "../data/retractable.json"
with open(filepath, 'r') as fp:
    data = json.load(fp)

df = pd.DataFrame.from_dict(data["retracted_articles"])
retrieved = df[df['retraction_body'] != ""]


In [4]:
with open('assets/openAI_key.txt', 'r') as f:
    api_key = f.read().strip()

os.environ['OPENAI_API_KEY'] = api_key

openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
# Define the list of retraction notices
notices = [
    "invalid results",
    "plagiarism",
    "conflict of interest, legal reasons",
    "research misconduct and data manipulation",
    "agreement by author(s)",
    "reader concerns",
    "issues with authorship",
    "duplicated paper",
    "could not reproduce results",
] 

# Define a dictionary to store the results
results = {reason:[] for reason in notices}

# Define the prompt to extract retraction reasons
reasons_string = "\n-".join(results.keys())
reason_prompt = f"Given a list of possible reasons for retraction, please extract the reason(s) for retraction from a given text and return a comma-separated string of the identified reasons. The list of possible reasons is as follows: \n-{reasons_string} \nFor example, if the input text is: \"The article is suspected to contain manipulated data (...) The authors agreed to retract the article.\", the output should be \"research misconduct and data manipulation, agreement by author(s)\". There must not be any sentence in the output phrased differently than the provided retraction reason list."

print(results)

{'invalid results': [], 'plagiarism': [], 'conflict of interest, legal reasons': [], 'research misconduct and data manipulation': [], 'agreement by author(s)': [], 'reader concerns': [], 'issues with authorship': [], 'duplicated paper': [], 'could not reproduce results': []}


In [6]:

# Loop over the retraction notices
i=0
tokens = 0
llm_reasons_unfiltered = {}
llm_reasons_filtered = {}
for index, row in retrieved.iterrows(): 
    i+=1
    if i<100: #limit to 100 to keep it cheap for now
        notice = row["retraction_body"]
        #print(notice)
        id = row["article"]
        word_count = len(notice.split())
        tokens += word_count
        #print(f"Article with identifier {id} has a length of {word_count} characters, approximately {word_count*1000/750} tokens. ~{tokens}tokens used so far")
        reason_text = reason_prompt + " " + notice
        reason_result = openai.Completion.create(
            model="text-davinci-003",
            prompt=reason_text,
            max_tokens=1024,
            stop=None,
            temperature=0.0,# As deterministic as possible
        )
        reasons_string = reason_result["choices"][0]["text"].strip()
        #print(reasons_string)
        reasons_filtered = []
        for reason in notices: 
            if reason in reasons_string:
                reasons_filtered.append(reason)

        llm_reasons_unfiltered[id] = reasons_string.split(", ")
        llm_reasons_filtered[id] = reasons_filtered
        retrieved.at[index, "llm-reason"] = ", ".join(llm_reasons_filtered)
        


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  retrieved.at[index, "llm-reason"] = ", ".join(llm_reasons_filtered)


In [7]:
with open('../data/llm_reason_unfiltered.json', 'w') as f:
    json.dump(llm_reasons_unfiltered, f)
with open('../data/llm_reason_filtered.json', 'w') as f:
    json.dump(llm_reasons_filtered, f)

In [8]:
retrieved.to_html("../docs/_includes/retractable_data_llm.html")

In [9]:
retrieved

Unnamed: 0,article,pmcid,url,retraction_reason,retraction_body,llm-reason
1,MED36961293,PMC10054288,https://europepmc.org/article/MED/36961293,"{""concerned reader"":""We, the Editors and Publi...","We, the Editors and Publisher of the journal A...",MED36961293
6,MED37141222,PMC10159185,https://europepmc.org/article/MED/37141222,"{""research misconduct and data manipulation"":""...",The PLOS ONE Editors retract this article [1] ...,"MED36961293, MED37141222"
7,MED37141216,PMC10159117,https://europepmc.org/article/MED/37141216,"{""research misconduct and data manipulation"":""...","After this article was published, similarities...","MED36961293, MED37141222, MED37141216"
10,MED37137930,PMC10156653,https://europepmc.org/article/MED/37137930,,Retraction of: Scientific Reports 10.1038/srep...,"MED36961293, MED37141222, MED37141216, MED3713..."
11,MED37143044,PMC10161624,https://europepmc.org/article/MED/37143044,"{""concerned reader"":""Retraction Note: BMC Med ...","Retraction Note: BMC Med 17, 223 (2019)https:/...","MED36961293, MED37141222, MED37141216, MED3713..."
...,...,...,...,...,...,...
14073,MED12569089,PMC2173750,https://europepmc.org/article/MED/12569089,,"Reese, E.L., and L.T. Haimo. 2000. Dynein, dyn...",
14087,MED17554198,PMC2637132,https://europepmc.org/article/MED/17554198,"{""research misconduct and data manipulation"":""...","The article ""Does the Oropharyngeal Fat Tissue...",
14088,MED17407607,PMC1855922,https://europepmc.org/article/MED/17407607,,"The corresponding author, Dr Wenbao Wang, subm...",
14095,PMC2001301,PMC2001301,https://europepmc.org/article/PMC/PMC2001301,,The corresponding author submitted this articl...,
