# Experiments

In [1]:
# Mount Google Drive at /content/drive (Colab-only: requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Experiment configuration (rendered as Colab form fields; keep the trailing annotations intact).
# MODEL and PROMPT_TYPE are joined onto HOME to pick the results sub-directory.
MODEL = "gpt4" # @param ["gpt4", "gpt3.5"] {allow-input: true}
PROMPT_TYPE = "prompt1" # @param ["prompt1", "prompt2", "prompt3"] {allow-input: true}
# Base directory of the experiment outputs.
HOME = "/content/drive/My Drive/template_experiment/" # @param {type:"string"}
# Annotator whose gold slot-filler columns, named "<slot> (<ANNOTATOR>)", are scored against.
ANNOTATOR = "Paul" # @param {type:"string"}
# "Partial" selects the partial-match slot-filling table; any other value selects exact match.
MATCHING_CRITERION = "Partial" # @param ["Partial", "Exact"] {allow-input: true}

In [7]:
import os
# Resolve the per-model / per-prompt results directory below the base HOME.
# The assignment is self-referential, so it is guarded: re-running this cell
# must not append MODEL/PROMPT_TYPE a second time.
_run_subdir = os.path.normpath(os.path.join(MODEL, PROMPT_TYPE))
if not os.path.normpath(HOME).endswith(_run_subdir):
    HOME = os.path.join(HOME, MODEL, PROMPT_TYPE)
HOME

'/content/drive/My Drive/template_experiment/gpt4/prompt1'

In [8]:
import ast
import os
import statistics

import numpy as np
import pandas as pd

In [9]:
# Load the dev-set predictions of the zero-, one- and five-shot runs from HOME.

# File-name suffix of the prompt variant: "" by default, "1" when HOME contains
# "prompt2", "2" when it contains "prompt3".
suf = ""

if "prompt2" in HOME: suf = "1"
if "prompt3" in HOME: suf = "2"


def _load_dev_predictions(shot_label):
    """Read one predictions CSV from HOME and parse its slot-filler column.

    Args:
        shot_label: File-name fragment of the run, e.g. "zero-shot", "1-shot".

    Returns:
        DataFrame read from ``dev_predictions_<shot_label><suf>.csv`` whose
        "predicted_slot_fillers" column holds parsed Python literals instead
        of their string form.
    """
    file_path = os.path.join(HOME, f"dev_predictions_{shot_label}{suf}.csv")
    predictions = pd.read_csv(file_path)
    # The column stores model output serialised as text. ast.literal_eval only
    # accepts Python literals, so a malformed or malicious cell raises instead
    # of executing code (which the previous plain eval would have done).
    predictions["predicted_slot_fillers"] = (
        predictions["predicted_slot_fillers"].apply(ast.literal_eval)
    )
    return predictions


#zero
df_dev_results = _load_dev_predictions("zero-shot")

#one
df_dev_results1 = _load_dev_predictions("1-shot")

#five
df_dev_results5 = _load_dev_predictions("5-shot")

# Template selection

In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

def template_selection(shot_results,result_rows):
  """Score template selection per fallacy type.

  For every distinct value of the "Fallacy Types" column (in order of first
  appearance) the accuracy of "predicted_template_no" against "Template #"
  is appended to ``result_rows``; the unweighted mean of everything then in
  ``result_rows`` is appended last.

  Args:
      shot_results: DataFrame with the columns "Fallacy Types", "Template #"
          and "predicted_template_no".
      result_rows: List that receives the scores; it is mutated in place.
          Values already present are included in the final mean.

  Returns:
      A one-row DataFrame (the transposed score column) whose positional
      columns follow the order of ``result_rows``.

  Side effects:
      Sets the global pandas float display format to two decimals.
  """
  gold_col = "Template #"
  pred_col = "predicted_template_no"

  for ft in shot_results["Fallacy Types"].unique():
      # Restrict gold and predicted template numbers to this fallacy type.
      per_type = shot_results[shot_results["Fallacy Types"] == ft]
      result_rows.append(accuracy_score(per_type[gold_col], per_type[pred_col]))

  # Unweighted average over the per-type accuracies collected so far.
  result_rows.append(np.mean(result_rows))

  pd.options.display.float_format = "{:.2f}".format
  return pd.DataFrame(result_rows).transpose()

In [None]:
# Per-setting score accumulators; template_selection appends to them in place.
zero, one, five = [], [], []

# NOTE(review): zero_df / one_df / five_df are reassigned by the slot-filling
# cell further down, so the cells must be executed in order.
zero_df, one_df, five_df = (
    template_selection(predictions, scores)
    for predictions, scores in (
        (df_dev_results, zero),
        (df_dev_results1, one),
        (df_dev_results5, five),
    )
)

In [None]:
def ts_final(result_rows,model,avg):
  """Turn a template-selection score table into a single labelled row.

  Args:
      result_rows: Table indexed as ``result_rows[i][0]`` for i in 0..4; the
          five values are labelled positionally, so their order must match
          the label order below.
      model: List extended in place with the five scores.
      avg: Index label of the returned row (used as the column name before
          transposing).

  Returns:
      One-row DataFrame with the four fallacy-type columns plus "total".
  """
  labels = ['false dilemma','faulty generalization','false causality','fallacy of credibility','total']

  # Read all five positional scores first, then record them.
  scores = [result_rows[position][0] for position in range(5)]
  model.extend(scores)

  return pd.DataFrame({avg:model},index=labels).transpose()

In [None]:
# Accumulators that ts_final fills with the five scores of each setting.
zero_shot, one_shot, five_shot = [], [], []

# One labelled row per shot setting ("0", "1", "5").
zero_final, one_final, five_final = (
    ts_final(score_table, bucket, shots)
    for score_table, bucket, shots in (
        (zero_df, zero_shot, "0"),
        (one_df, one_shot, "1"),
        (five_df, five_shot, "5"),
    )
)

final_df_ts = pd.concat([zero_final, one_final, five_final])
final_df_ts

Unnamed: 0,false dilemma,faulty generalization,false causality,fallacy of credibility,total
0,0.14,0.38,0.26,0.5,0.32
1,0.16,0.42,0.32,0.6,0.38
5,0.06,0.52,0.1,0.54,0.31


## Confusion matrix

In [None]:
# from sklearn.metrics import confusion_matrix

# df = df_dev_results["5-shot"]

# for ft in df_dev_results["zero-shot"]["Fallacy Types"].unique():
#     print(ft)
#     print(confusion_matrix(
#         df[df["Fallacy Types"] == ft]["Template #"],
#         df[df["Fallacy Types"] == ft]["predicted_template_no"],
#         labels=range(1,6),
#         ))
#     print()

# Slot filling

In [None]:
def slotfiller_match(row, slotfillers, strict=False, annotator=None):
    """Check whether predicted slot fillers agree with the gold annotation.

    The slots A, A', C, C' and X are compared in turn. Slots whose gold value
    is missing (NaN/None) are skipped; the first annotated slot that does not
    match makes the whole item a mismatch.

    Args:
        row: Mapping or Series holding the gold fillers under the keys
            "<slot> (<annotator>)".
        slotfillers: Mapping from slot name to predicted filler. Absent slots
            count as the empty string.
        strict: If True, require case-insensitive equality. Otherwise require
            a token-set overlap (intersection over union of lower-cased,
            whitespace-split tokens) of at least 0.5.
        annotator: Annotator name used in the gold column names. Defaults to
            the notebook-level ANNOTATOR setting.

    Returns:
        True if every annotated slot matches, False otherwise.
    """
    if annotator is None:
        annotator = ANNOTATOR

    def as_text(value):
        # Fillers may be None, NaN or numbers instead of strings; normalise
        # them so the .lower() calls below cannot raise AttributeError.
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def overlap(s1, s2):
        tokens1 = set(s1.lower().split())
        tokens2 = set(s2.lower().split())
        union = tokens1 | tokens2
        if not union:
            # Neither side has any token: treat as identical rather than
            # dividing by zero.
            return 1.0
        return len(tokens1 & tokens2) / len(union)

    for k in "A A' C C' X".split():
        y_true = row["{} ({})".format(k, annotator)]
        y_pred = as_text(slotfillers.get(k, ""))

        if not pd.notna(y_true): continue

        y_true = as_text(y_true)

        if strict:
            if y_pred.lower() != y_true.lower():
                return False

        else:
            if overlap(y_pred, y_true) < 0.5:
                return False

    return True

def slot_filler_score(df, pred_slot_fillers, strict=False):
    """Return the fraction of rows whose predicted slot fillers match.

    Args:
        df: DataFrame of gold rows, passed row by row to slotfiller_match.
        pred_slot_fillers: Iterable of predicted slot-filler mappings, paired
            positionally with the rows of ``df``.
        strict: Forwarded to slotfiller_match.

    Returns:
        Number of matching rows divided by ``len(df)``, or 0 for an empty
        frame.
    """
    total = len(df)
    if total == 0:
        return 0

    matched = 0
    for (_, gold_row), predicted in zip(df.iterrows(), pred_slot_fillers):
        matched += slotfiller_match(gold_row, predicted, strict)

    return matched / total

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support

def slot_filling(shot_results,result_rows):
  """Score slot filling per fallacy type with exact and partial matching.

  Only rows whose predicted template number equals the gold "Template #" and
  is not 5 are scored. One dict per fallacy type (keys "Fallacy Type", "em",
  "pm") is appended to ``result_rows``, followed by an "Overall" dict holding
  the unweighted means of the "em" and "pm" values then in ``result_rows``.

  Args:
      shot_results: DataFrame with the columns "Fallacy Types", "Template #",
          "predicted_template_no" and "predicted_slot_fillers", plus the gold
          slot columns read by slot_filler_score.
      result_rows: List that receives the score dicts; mutated in place.

  Returns:
      The transposed DataFrame of ``result_rows``: rows "Fallacy Type", "em"
      and "pm", one positional column per entry.

  Side effects:
      Sets the global pandas float display format to two decimals.
  """
  # Rows eligible for scoring: template predicted correctly and not number 5.
  # NOTE(review): presumably template 5 has no slots to fill; confirm.
  template_ok = shot_results["predicted_template_no"] == shot_results["Template #"]
  not_template_5 = shot_results["predicted_template_no"] != 5

  for ft in shot_results["Fallacy Types"].unique():
      scorable = shot_results[(shot_results["Fallacy Types"] == ft) & template_ok & not_template_5]
      predicted = scorable["predicted_slot_fillers"]
      result_rows.append({
          "Fallacy Type": ft,
          "em": slot_filler_score(scorable, predicted, strict=True),
          "pm": slot_filler_score(scorable, predicted, strict=False),
      })

  # Unweighted averages over the per-type entries gathered so far.
  result_rows.append({
      "Fallacy Type": "Overall",
      "em": np.mean([entry["em"] for entry in result_rows]),
      "pm": np.mean([entry["pm"] for entry in result_rows]),
  })

  pd.options.display.float_format = "{:.2f}".format
  return pd.DataFrame(result_rows).transpose()

In [None]:
# Fresh per-setting accumulators; slot_filling appends its score dicts in place.
zero, one, five = [], [], []

# NOTE(review): this rebinds zero_df / one_df / five_df, which the
# template-selection cells above also assign; run the notebook top to bottom.
zero_df, one_df, five_df = (
    slot_filling(predictions, scores)
    for predictions, scores in (
        (df_dev_results, zero),
        (df_dev_results1, one),
        (df_dev_results5, five),
    )
)

In [None]:
#Exact Match
def em_final(result_rows,model,avg):
  """Turn a slot-filling score table into one labelled row of "em" scores.

  Args:
      result_rows: Table indexed as ``result_rows[i]['em']`` for i in 0..4;
          the five values are labelled positionally, so their order must
          match the label order below.
      model: List extended in place with the five scores.
      avg: Index label of the returned row.

  Returns:
      One-row DataFrame with the four fallacy-type columns plus "total".
  """
  labels = ['false dilemma','faulty generalization','false causality','fallacy of credibility','total']

  # Read all five positional scores first, then record them.
  scores = [result_rows[position]['em'] for position in range(5)]
  model.extend(scores)

  return pd.DataFrame({avg:model},index=labels).transpose()

In [None]:
#Partial Match
def pm_final(result_rows,model,avg):
  """Turn a slot-filling score table into one labelled row of "pm" scores.

  Args:
      result_rows: Table indexed as ``result_rows[i]['pm']`` for i in 0..4;
          the five values are labelled positionally, so their order must
          match the label order below.
      model: List extended in place with the five scores.
      avg: Index label of the returned row.

  Returns:
      One-row DataFrame with the four fallacy-type columns plus "total".
  """
  labels = ['false dilemma','faulty generalization','false causality','fallacy of credibility','total']

  # Read all five positional scores first, then record them.
  scores = [result_rows[position]['pm'] for position in range(5)]
  model.extend(scores)

  return pd.DataFrame({avg:model},index=labels).transpose()

In [None]:
# Accumulators filled in place by pm_final / em_final.
zero_shot_pm, one_shot_pm, five_shot_pm = [], [], []
zero_shot_em, one_shot_em, five_shot_em = [], [], []

# Exact-match rows, one per shot setting.
zero_em, one_em, five_em = (
    em_final(score_table, bucket, shots)
    for score_table, bucket, shots in (
        (zero_df, zero_shot_em, "0"),
        (one_df, one_shot_em, "1"),
        (five_df, five_shot_em, "5"),
    )
)

# Partial-match rows, one per shot setting.
zero_pm, one_pm, five_pm = (
    pm_final(score_table, bucket, shots)
    for score_table, bucket, shots in (
        (zero_df, zero_shot_pm, "0"),
        (one_df, one_shot_pm, "1"),
        (five_df, five_shot_pm, "5"),
    )
)

final_df_sf_em = pd.concat([zero_em, one_em, five_em])
final_df_sf_pm = pd.concat([zero_pm, one_pm, five_pm])

# Exact match is reported unless the configuration asks for "Partial".
final_df_sf_active = final_df_sf_pm if MATCHING_CRITERION == "Partial" else final_df_sf_em

# Results

In [None]:
# Combine the active slot-filling table with the template-selection totals.
# total_joint is the product of the two "total" columns; total_sd is the
# slot-filling "total" rendered as a two-decimal string.
final_df = final_df_sf_active.assign(
    total_ts=final_df_ts["total"],
    total_joint=final_df_sf_active["total"] * final_df_ts["total"],
    total_sd=final_df_sf_active["total"].map("{:.2f}".format),
)
final_df.loc[:, ["false dilemma", "faulty generalization", "false causality", "fallacy of credibility", "total_sd", "total_joint"]]

Unnamed: 0,false dilemma,faulty generalization,false causality,fallacy of credibility,total_sd,total_joint
0,0.5,0.13,0.3,0.35,0.32,0.1
1,0.25,0.21,0.23,0.17,0.21,0.08
5,0.5,0.09,0.5,0.24,0.33,0.1


In [None]:
# Side-by-side view of the three aggregate columns per shot setting.
final_df.loc[:, ["total_ts", "total_sd", "total_joint"]]

Unnamed: 0,total_ts,total_sd,total_joint
0,0.21,0.2,0.04
1,0.29,0.43,0.12
5,0.36,0.38,0.14
