# Using chatGPT for factchecking

An easy way to check whether statements are true is to just ask an LLM. This is a different task from asking it an open question, where it can make things up.  In this case, there are 3 outcomes: True, False, Unknown.

I take 2 approaches:
- just ask whether the statement is true or not
- in addition, ask it to present some evidence. There are 2 variations:
  - simply ask for evidence
  - ask for some number of pieces of evidence

Asking for evidence is analogous to a chain-of-thought approach, and it will be interesting to see if this changes the performance.


## Implementation

Querying chatGPT is straightforward. 

In [110]:
import openai
import os
import time
import json
import pandas as pd
import random
import re

# Directory layout, resolved relative to the current working directory:
# test inputs are read from TESTDIR, result CSVs and logs are written to RESULTDIR.
BASEDIR = os.getcwd()
TESTDIR = os.path.join(BASEDIR, 'testData')
RESULTDIR = os.path.join(BASEDIR, 'testResults/chatgpt')

# Chat model name passed to every openai.ChatCompletion.create call in this file.
MODEL = "gpt-3.5-turbo"

# Captures the text between angle brackets (non-greedy), e.g. 'TRUE' from '<TRUE>'.
reTV = re.compile('<(.*?)>')


def verifyStatement(stmts: str):
    """Ask the chat model for a bare true/false/unknown verdict per statement.

    Args:
        stmts: The statements to check, as one string. Single quotes are
            removed before the text is placed in the prompt.

    Returns:
        A ``(verdicts, oaiResp)`` tuple. ``verdicts`` holds one entry per
        item in the model's reply: ``True``, ``False`` or the string
        ``'Unknown'`` (also used for anything that is not one of the three
        expected tags). ``oaiResp`` is the unmodified API response.
    """
    nqStmts = stmts.replace("'", '')
    query = [{"role": "system",
              "content": "You answer questions tersely."},
            {"role": "user", "content": f"""For each statement in the list below, determine whether it is true or not.
            If it is true, respond with '<TRUE>', if it is false respond with '<FALSE>', otherwise, respond with '<UNKNOWN>'. Return your responses in a json list.
            
            Statements: {nqStmts}
            Answer: """}]
    oaiResp = openai.ChatCompletion.create(
        model = MODEL, 
        messages = query,
        temperature = 0)
    response = oaiResp['choices'][0]['message']['content']
    tvmap = {'<TRUE>': True, '<FALSE>': False, '<UNKNOWN>': 'Unknown'}
    try:
        respL = json.loads(response)
    except (ValueError, TypeError):
        # Reply is not valid JSON (json.JSONDecodeError is a ValueError) or is
        # not text at all: treat the raw reply as a single answer.
        respL = [response]
    if not isinstance(respL, list):
        # Valid JSON but not a list (e.g. the JSON string "<TRUE>"): wrap it so
        # we do not iterate over its characters or keys.
        respL = [respL]
    # Only strings can be verdict tags; the isinstance guard also avoids a
    # TypeError from looking up unhashable items (nested lists/dicts).
    rv = [tvmap.get(x.strip(), 'Unknown') if isinstance(x, str) else 'Unknown'
          for x in respL]
    return rv, oaiResp


def verifyStatementWithEvidence(stmts: str):
    """Ask the chat model for evidence for/against each statement plus a verdict.

    Args:
        stmts: The statements to check, as one string; inserted verbatim
            into the prompt.

    Returns:
        A ``(verdicts, oaiResp)`` tuple. ``verdicts`` has one entry for every
        angle-bracketed token found in the reply, in order: ``True`` for
        ``<TRUE>``, ``False`` for ``<FALSE>`` and the string ``'Unknown'``
        for anything else. ``oaiResp`` is the unmodified API response.
    """
    query = [
        {"role": "system",
         "content": "You are a knowledgeable and conscientious agent who follows instructions exactly all the time."},
        {"role": "user", 
         "content": f"""For each statment in the list below, provide: 
    1. A few pieces of evidence for it with their sources
    2. A few pieces of evidence against it with their sources
    3. <TRUE> if you assess the statement to be true, <FALSE> if you believe it is false, <UNKNOWN> if you cannot conclude either true or false.

Statement list: 
{stmts}
"""}
    ]
    oaiResp = openai.ChatCompletion.create(
        model=MODEL,
        messages=query,
        temperature=0,
    )
    result = oaiResp['choices'][0]['message']['content']
    # Any bracketed token other than TRUE/FALSE (including UNKNOWN) counts as
    # 'Unknown'; the caller compares the number of entries with the batch size.
    verdictOf = {'TRUE': True, 'FALSE': False}
    tvs = [verdictOf.get(tag, 'Unknown') for tag in reTV.findall(result)]
    return tvs, oaiResp

def verifyStatementWith4Pieces(stmts: str):
    """Ask the chat model for 4 pieces of evidence each way plus a verdict.

    Args:
        stmts: The statements to check, as one string; inserted verbatim
            into the prompt.

    Returns:
        A ``(verdicts, oaiResp)`` tuple. ``verdicts`` has one entry for every
        angle-bracketed token found in the reply, in order: ``True`` for
        ``<TRUE>``, ``False`` for ``<FALSE>`` and the string ``'Unknown'``
        for anything else. ``oaiResp`` is the unmodified API response.
    """
    query = [
        {"role": "system",
         "content": "You are a knowledgeable and conscientious agent who follows instructions exactly all the time."},
        {"role": "user", 
         "content": f"""For each statment in the list below, provide: 
    1. 4 pieces of evidence for it, with their sources
    2. 4 pieces of evidence against it, with their sources
    3. <TRUE> if you assess the statement to be true, <FALSE> if you believe it is false, <UNKNOWN> if you cannot conclude either true or false.

Statement list: 
{stmts}
"""}
    ]
    oaiResp = openai.ChatCompletion.create(
        model=MODEL,
        messages=query,
        temperature=0,
    )
    result = oaiResp['choices'][0]['message']['content']
    # Any bracketed token other than TRUE/FALSE (including UNKNOWN) counts as
    # 'Unknown'; the caller compares the number of entries with the batch size.
    verdictOf = {'TRUE': True, 'FALSE': False}
    tvs = [verdictOf.get(tag, 'Unknown') for tag in reTV.findall(result)]
    return tvs, oaiResp


### Driver

This needs to handle instances where chatGPT does not return the correct number of results.  


In [111]:


# Bounds for the random pause taken with time.sleep() after each batch.
MAXSLEEP = 3
MINSLEEP = 0
# Batch size passed to testFactCheck by the driver cell in this file.
BATCHSIZE = 4

from typing import Callable

def testFactCheck(inFile: str,          # json list of dicts with statement, fake_statement
                  selectFile: str,      # json list of True, False
                  vmethod: Callable[[str], tuple],  # approach to take
                  outFile: str,         # output
                  logFile: str = None,  # detailed output
                  batchSize: int = 64,  # batch size
                  startIdx: int = 0,
                  numProc: int = -1,
                  maxRetries: int = 5,
                 ):
    """Run a verification method over a test set and save the verdicts as CSV.

    Statement ``i`` is ``testData[i]['statement']`` when ``selData[i]`` is
    truthy and ``testData[i]['fake_statement']`` otherwise; ``selData[i]`` is
    also recorded as the ground truth. Statements are sent to ``vmethod`` in
    batches joined by newlines. If a batch yields the wrong number of
    verdicts, its statements are re-sent one at a time.

    Args:
        inFile: JSON file holding a list of dicts with 'statement' and
            'fake_statement' keys.
        selectFile: JSON file holding a list of flags, parallel to ``inFile``.
        vmethod: Callable taking the statement text and returning
            ``(verdicts, response)``, where ``response`` is indexable as
            ``response['choices'][0]['message']['content']``.
        outFile: CSV path for the results (columns: question, trueAnswer,
            gptAnswer, correct?). Rewritten every 10 batches and at the end.
        logFile: Optional path that receives the raw reply text per batch.
        batchSize: Number of statements per call to ``vmethod``.
        startIdx: Index of the first statement to process.
        numProc: Maximum number of statements to process; -1 means all.
        maxRetries: How many consecutive times a failing batch is retried
            before the run stops and saves what it has.

    Raises:
        ValueError: If ``batchSize`` is smaller than 1.
    """
    if batchSize < 1:
        # A non-positive batch size would never advance the loop.
        raise ValueError(f"batchSize must be at least 1, got {batchSize}")
    with open(inFile, 'r') as ix:
        testData = json.load(ix)
    with open(selectFile, 'r') as sx:
        selData = json.load(sx)

    # Exclusive end index of the range to process.
    stop = len(testData)
    if numProc > -1:
        stop = min(stop, startIdx + numProc)

    results = []     # [statement, ground truth] pairs, in processing order
    predVals = []    # model verdicts, parallel to results
    nb = 0           # number of completed batches
    failures = 0     # consecutive failures of the current batch
    lb = startIdx
    log = open(logFile, 'w') if logFile is not None else None
    try:
        while lb < stop:
            ub = min(lb + batchSize, stop)
            batch = []
            stv = []
            try:
                for bx in range(lb, ub):
                    batch.append(testData[bx]['statement'] if selData[bx] else testData[bx]['fake_statement'])
                    stv.append([batch[-1], selData[bx]])
                preds, textResp = _checkBatch(vmethod, batch, nb)
            except Exception as e:
                # Best effort: API and data errors are reported and the same
                # batch is retried, but only a bounded number of times so a
                # persistent error cannot loop forever.
                failures += 1
                print('Error in \n', batch, '\n', e)
                if failures > maxRetries:
                    print(f"Giving up on batch {nb} after {maxRetries} retries; saving partial results")
                    break
                time.sleep(MINSLEEP + random.random() * (MAXSLEEP - MINSLEEP))
                continue
            failures = 0
            lb = ub
            nb += 1
            predVals.extend(preds)
            results.extend(stv)
            if log is not None:
                log.write(f"{textResp}\n\n")
            time.sleep(MINSLEEP + random.random() * (MAXSLEEP - MINSLEEP))
            if nb % 10 == 0:
                # Periodic checkpoint so a long run can be inspected or salvaged.
                if log is not None:
                    log.flush()
                print(f"{nb} batches done")
                _saveResults(results, predVals, outFile)
    finally:
        if log is not None:
            log.close()
    _saveResults(results, predVals, outFile, dumpOnError=True)


def _checkBatch(vmethod, batch, nb):
    """Verify one batch; return (verdicts, reply text) with one verdict per statement."""
    preds, oaiResp = vmethod(('\n'.join(batch)).replace("'", ' '))
    if len(preds) == len(batch):
        return preds, oaiResp['choices'][0]['message']['content']
    print(f"ERROR in batch {nb}: lenght of preds = {len(preds)}, batch = {len(batch)}. Redoing one at a time")
    preds = []
    replies = []
    # NOTE(review): unlike the batched call, single statements are sent without
    # replacing apostrophes; kept as in the original, confirm whether intended.
    for s in batch:
        px, ox = vmethod(s + '\n')
        preds.append(px[0] if len(px) == 1 else 'Unknown')
        replies.append(ox['choices'][0]['message']['content'])
    return preds, ''.join(replies)


def _saveResults(results, predVals, outFile, dumpOnError=False):
    """Write results and verdicts to ``outFile``; optionally dump verdicts if they do not fit."""
    df = pd.DataFrame(results, columns=['question', 'trueAnswer'])
    try:
        df['gptAnswer'] = predVals
        df['correct?'] = (df['gptAnswer'] == df['trueAnswer'])
    except ValueError as e:
        # pandas raises ValueError when predVals and results differ in length.
        print(e)
        if dumpOnError:
            with open(os.path.join(RESULTDIR, 'predvals_buggy'), 'w') as px:
                json.dump(predVals, px)
                print('Dumped predVals')
    df.to_csv(outFile, index=False, header=True)


In [None]:
# Run the 4-pieces-of-evidence variant on the triviaQA dev statements in
# batches of BATCHSIZE, writing the results CSV and the raw-reply log.
testFactCheck(os.path.join(TESTDIR, 'tf_qa-dev.json'),
              os.path.join(TESTDIR, 'tf_qa-dev_select1.json'),
              verifyStatementWith4Pieces,
              os.path.join(RESULTDIR, 'triviaQA_dev_4Pieces.csv'),
              os.path.join(RESULTDIR, 'triviaQA_dev_4Pieces.log'),
              BATCHSIZE,
             )

## Results

Results are obtained from the same dataset as for the websearch version - a list of randomly picked true (according to triviaQA) and false statements generated by chatGPT.

### Examples


#### Evidence

True statement: 
>Daniel Barenboim was married to one of the subjects of the film Hillary and Jackie.

- chatGPT without evidence: Unknown
- chatGPT with evidence: True

> 1. Evidence for the statement:
>   - Daniel Barenboim was indeed married to one of the subjects of the film "Hillary and Jackie." The film is a biographical drama based on the lives of the British cellist Jacqueline du Pré and her sister Hilary du Pré. Jacqueline du Pré was married to Daniel Barenboim from 1967 until her death in 1987. This information can be verified through various sources, including:
     - The Guardian: "Daniel Barenboim: 'I have never been a fanatic about music'"
     - The New York Times: "Daniel Barenboim: A Life in Music"

>2. Evidence against the statement:
>   - There is no evidence against the statement, as it is accurate that Daniel Barenboim was married to one of the subjects of the film "Hillary and Jackie."

>3. Assessment of the truth of the statement: True

chatGPT generates succinct arguments if available, together with sources. 

#### Evidence confusion

In this case the same arguments are taken to be for and against the statement:

>1. Evidence for the statement "Niamey is the capital of Nigeria":

>- Niamey is indeed the capital of Niger, not Nigeria. This is a well-known fact and can be verified through various reputable sources such as Encyclopedia Britannica (Source: Encyclopedia Britannica - Niger).
>- Abuja is the capital of Nigeria, as stated by the Nigerian government and recognized internationally. This information can be confirmed through official government websites, such as the Federal Republic of Nigeria's official website (Source: Official website of the Federal Republic of Nigeria).
>- The United Nations, as well as other international organizations, recognize Abuja as the capital of Nigeria. This can be verified through the United Nations' official website or other international governmental organizations (Source: United Nations - Member States).
>- Numerous travel guides and reputable travel websites, such as Lonely Planet and TripAdvisor, state that Abuja is the capital of Nigeria (Source: Lonely Planet - Nigeria, TripAdvisor - Nigeria).

>2. Evidence against the statement "Niamey is the capital of Nigeria":

>- Niamey is the capital of Niger, not Nigeria. This is a well-known fact and can be verified through various reputable sources such as Encyclopedia Britannica (Source: Encyclopedia Britannica - Niger).
>- Abuja is the capital of Nigeria, as stated by the Nigerian government and recognized internationally. This information can be confirmed through official government websites, such as the Federal Republic of Nigeria's official website (Source: Official website of the Federal Republic of Nigeria).
>- The United Nations, as well as other international organizations, recognize Abuja as the capital of Nigeria. This can be verified through the United Nations' official website or other international governmental organizations (Source: United Nations - Member States).
>- Numerous travel guides and reputable travel websites, such as Lonely Planet and TripAdvisor, state that Abuja is the capital of Nigeria (Source: Lonely Planet - Nigeria, TripAdvisor - Nigeria).

>3. <FALSE> The statement "Niamey is the capital of Nigeria" is false.

#### Evidence may be made up

False statement:
> Uranus has moons called Proteus and Nereid.
 
>1. Evidence for the statement:
>   - Proteus: Proteus is indeed one of the moons of Uranus. It was discovered by Voyager 2 in 1989. Source: NASA - Solar System Exploration website (https://solarsystem.nasa.gov/moons/uranus-moons/proteus/in-depth/)
>   - Nereid: Nereid is also a moon of Uranus. It was discovered in 1949 by Gerard Kuiper. Source: NASA - Solar System Exploration website (https://solarsystem.nasa.gov/moons/uranus-moons/nereid/in-depth/)

>2. Evidence against the statement:
>   - There is no evidence against the statement. The statement is accurate, and both Proteus and Nereid are moons of Uranus.

The statement is false. Voyager 2 did discover Proteus, but as a moon of Neptune, not of Uranus.  chatGPT assumes the statement and restates it, then adds a true fact as though it were evidence for the false statement.

#### More evidence can help

For the same statement, asking for more evidence gives:

> Uranus has moons called Proteus and Nereid.

>1. Evidence for:
   a. Uranus has a moon named Proteus, which was discovered by Voyager 2 in 1989. (Source: NASA - Uranus Moons Fact Sheet)
   b. Proteus is the second-largest moon of Uranus and has a irregular shape. (Source: NASA - Uranus Moons Fact Sheet)
   c. Nereid is another moon of Uranus, discovered in 1949. (Source: NASA - Uranus Moons Fact Sheet)
   d. Nereid is one of the outermost moons of Uranus and has an eccentric orbit. (Source: NASA - Uranus Moons Fact Sheet)

>2. Evidence against:
   a. Uranus has a total of 27 known moons, and Proteus and Nereid are not among them. (Source: NASA - Uranus Moons Fact Sheet)
   b. The known moons of Uranus include names like Miranda, Ariel, Umbriel, Titania, and Oberon, but not Proteus or Nereid. (Source: NASA - Uranus Moons Fact Sheet)
   c. Proteus and Nereid are moons of Neptune, not Uranus. (Source: NASA - Neptune Moons Fact Sheet)
   d. The confusion between Uranus and Neptune moons might arise due to their similar blue color and their positions in the outer regions of the solar system.

>3. <FALSE> Uranus does not have moons called Proteus and Nereid. These moons are actually associated with the planet Neptune.

The evidence for the statement is still made up, but it now gets evidence against and finds that these are moons of Neptune, not Uranus.

#### Made up sources

In the above examples, chatGPT was  attributing false statements to specific sources.  Another issue is that most of the urls it provides as source give 404 errors. It is unlikely that most of the urls changed since they were scraped.  The more likely explanation is that chatGPT is making urls up. For example:

False statement:
> Thriller by Michael Jackson became the first ever music video to be shown on MTV.

> Evidence for:
> 1. In an article by Billboard, it is stated that "Thriller" was indeed the first music video to be shown on MTV. (Source: https://www.billboard.com/articles/news/467705/michael-jacksons-thriller-30-years-later-10-key-moments)
> 2. The official MTV website also confirms that "Thriller" was the first music video to be aired on the channel. (Source: http://www.mtv.com/news/2697375/michael-jackson-thriller-video/)
> 3. The Guinness World Records website recognizes "Thriller" as the first music video to be shown on MTV. (Source: https://www.guinnessworldrecords.com/world-records/first-music-video-shown-on-mtv)
> 4. Various reputable music and entertainment publications, such as Rolling Stone and NME, also acknowledge "Thriller" as the first music video on MTV. (Sources: https://www.rollingstone.com/music/music-lists/100-greatest-music-videos-66601/michael-jackson-thriller-1983-66684/ and https://www.nme.com/photos/100-greatest-music-videos-ever-1426112)

> Evidence against:
> 1. There is no credible evidence or sources that dispute the claim that "Thriller" was the first music video on MTV.

> Assessment: <TRUE>

Not only is the evidence incorrect, the urls either give a 404 error or are not about Thriller.

### Metrics

We next compute some metrics.

#### Precision, recall, F1

Below are computations for these metrics.  Asking for some evidence seems worse than asking for none, but asking for more evidence gives better results.

In [117]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

def computeMetrics(inFile):
    """Print precision/recall/F1 and the confusion matrix for one results CSV.

    Args:
        inFile: CSV with 'trueAnswer' and 'gptAnswer' columns. All columns
            are read as strings.

    Prints a 3x2 array (rows: precision, recall, F1; columns: the 'True' and
    'False' classes) followed by the 3x3 confusion matrix over
    'True', 'False', 'Unknown'. Returns nothing.
    """
    df = pd.read_csv(inFile, index_col=None, dtype=str)
    preds = df['gptAnswer'].tolist()
    trues = df['trueAnswer'].tolist()
    # The CSV is read with dtype=str, so the class labels must be strings as
    # well; do not rely on numpy coercing booleans to 'True'/'False'.
    labels = ['True', 'False', 'Unknown']
    p, r, f, _ = precision_recall_fscore_support(trues, preds, labels=labels, average=None,
                                                 zero_division=np.nan)
    # Only the first two classes are reported: 'Unknown' never occurs as a true answer.
    prf = np.array([p[:2], r[:2], f[:2]])
    print('Precision, recall, F1 for True, False')
    print(prf)
    cm = confusion_matrix(trues, preds, labels=labels)
    print('Confusion matrix for True, False, Unknown')
    print(cm)


In [118]:
# Metrics for the version that asks only for a verdict, without evidence

computeMetrics(os.path.join(RESULTDIR, 'triviaQA_dev_noEvidence.csv'))

Precision, recall, F1 for True, False
[[0.67821782 0.73571429]
 [0.685      0.50738916]
 [0.68159204 0.60058309]]
Confusion matrix for True, False, Unknown
[[137  37  26]
 [ 65 103  35]
 [  0   0   0]]


In [119]:
# Metrics for the version that asks for evidence before the verdict
computeMetrics(os.path.join(RESULTDIR, 'triviaQA_dev_withEvidence.csv'))

Precision, recall, F1 for True, False
[[0.89010989 0.73099415]
 [0.40703518 0.61576355]
 [0.55862069 0.6684492 ]]
Confusion matrix for True, False, Unknown
[[ 81  46  72]
 [ 10 125  68]
 [  0   0   0]]


In [120]:
# Metrics for the version that asks for 4 pieces of evidence for and against

computeMetrics(os.path.join(RESULTDIR, 'triviaQA_dev_4Pieces.csv'))

Precision, recall, F1 for True, False
[[0.79617834 0.76190476]
 [0.625      0.70935961]
 [0.70028011 0.73469388]]
Confusion matrix for True, False, Unknown
[[125  45  30]
 [ 32 144  27]
 [  0   0   0]]


In [None]:
I next compare pairs of versions using a confusion matrix.

In [121]:
# compare versions

def compareVersions(leftFile, rightFile, labels):
    """Print confusion matrices comparing the verdicts of two result files.

    The two CSVs are inner-joined on their 'question' column. Three matrices
    over 'True', 'False', 'Unknown' are printed: one for all joined rows, one
    for rows whose true answer is 'True' and one for rows whose true answer
    is 'False'. In each, rows are the left file's verdicts and columns are
    the right file's.

    Args:
        leftFile: Results CSV whose verdicts index the matrix rows.
        rightFile: Results CSV whose verdicts index the matrix columns.
        labels: Two suffixes (left, right) used to tell the merged columns
            apart and to label the printed output.
    """
    dfLeft = pd.read_csv(leftFile, index_col=None, dtype=str)
    dfRight = pd.read_csv(rightFile, index_col=None, dtype=str)
    dfM = pd.merge(dfLeft, dfRight, how='inner', on='question', suffixes=labels)

    leftCol = f"gptAnswer{labels[0]}"
    rightCol = f"gptAnswer{labels[1]}"
    # Ground truth is taken from the left file's copy of the column.
    truthCol = f"trueAnswer{labels[0]}"
    classes = ['True', 'False', 'Unknown']

    cm = confusion_matrix(dfM[leftCol], dfM[rightCol], labels=classes)
    # Name the actual row source rather than a hard-coded 'no_evidence'.
    print(f"Confusion matrix for {labels[0]} X {labels[1]}. ({labels[0]} is rows)")
    print(cm)
    for truth, desc in (('True', 'true'), ('False', 'false')):
        subset = dfM[dfM[truthCol] == truth]
        cm = confusion_matrix(subset[leftCol], subset[rightCol], labels=classes)
        print(f"\nConfusion matrix for only {desc} statements, {labels[0]} X {labels[1]}")
        print(cm)
    


In [116]:
# Compare the verdicts of the no-evidence run (rows) with the with-evidence run
compareVersions(os.path.join(RESULTDIR, 'triviaQA_dev_noEvidence.csv'),
               os.path.join(RESULTDIR, 'triviaQA_dev_withEvidence.csv'),
              ['_noEvidence', '_withEvidence'])

Confusion matrix for labels[0] X labels[1]. (no_evidence is rows)
[[81 54 67]
 [ 5 96 38]
 [ 5 21 35]]

Confusion matrix for only true statements, _noEvidence X _withEvidence
[[72 26 39]
 [ 4 17 15]
 [ 5  3 18]]

Confusion matrix for only false statements, _noEvidence X _withEvidence
[[ 9 28 28]
 [ 1 79 23]
 [ 0 18 17]]


## Conclusions

Factchecking is meant to be a way to verify whether a statement is true. In addition, providing evidence is meant to help people gain confidence in the judgement of the system.

1. the metrics are relatively low
2. chatGPT makes up sources and arguments, obliterating any confidence this approach was supposed to provide to people.

