# Load from database

<a href="https://colab.research.google.com/github/kirubarajan/roft/blob/master/annotation/analysis/research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fsspec gcsfs

In [None]:
import json
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the JSON database dump that is read with tf.io.gfile below.
# NOTE(review): the path components look like a date (09/23/2021) — confirm.
DATABASE_DUMP_FILE = 'gs://roft_buckups/09/23/2021.json'

In [None]:
# Colab-only step: authenticate the current user. Presumably this is what
# authorizes the `gs://` reads in the following cells — TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:

# Parse the whole JSON dump into memory; `db` is the record collection that
# get_df() turns into per-table DataFrames.
with tf.io.gfile.GFile(DATABASE_DUMP_FILE, 'r') as f:
    db = json.load(f)

def get_df(sql_model='core.annotation', records=None):
    """Return one table of the database dump as a flat DataFrame.

    Args:
        sql_model: Value of the record-level ``model`` key to select,
            e.g. ``'core.annotation'``.
        records: Optional list of dump records, each a dict with ``model``,
            ``pk`` and ``fields`` keys. Defaults to the notebook-level ``db``.

    Returns:
        A DataFrame with one row per matching record, in dump order: the
        record's ``fields`` dict flattened into columns by
        ``pd.json_normalize``, plus a ``pk`` column. The index is 0..n-1.
    """
    if records is None:
        records = db  # fall back to the dump loaded earlier in the notebook

    all_records = pd.DataFrame(records)
    selected = all_records[all_records["model"] == sql_model]

    # The previous version re-indexed the filtered frame by a record-level
    # 'date' column, but that index was never used for the result, so the
    # step is dropped. Normalizing a plain list (instead of the Series) gives
    # a default 0..n-1 index no matter how json_normalize treats a Series
    # index.
    flat_fields = pd.json_normalize(selected["fields"].tolist())
    return flat_fields.assign(pk=selected["pk"].values)

In [None]:
# Show the distinct `model` values in the dump, i.e. the tables that
# get_df() can extract.
df = pd.DataFrame(db)
print(set(df.model))

### Load all the tables

In [None]:
# Extract every table used by the analysis from the dump, one DataFrame each.
annotation_df = get_df(sql_model='core.annotation')
profile_df = get_df(sql_model='core.profile')
generation_df = get_df(sql_model='core.generation')
prompt_df = get_df(sql_model='core.prompt')
playlist_df = get_df(sql_model='core.playlist')
decodingstrategy_df = get_df(sql_model='core.decodingstrategy')
user_df = get_df(sql_model='auth.user')

### Modify column names to avoid duplicates across tables.

In [None]:
# Give the generic `body` / `name` / `value` columns table-specific names so
# they stay distinguishable once the tables are joined.
prompt_df = prompt_df.rename({"body": "prompt_body"}, axis="columns")
generation_df = generation_df.rename({"body": "gen_body"}, axis="columns")
decodingstrategy_df = decodingstrategy_df.rename(
    {"name": "dec_strat", "value": "dec_strat_value"}, axis="columns")

# Parse the annotation timestamps so that sorting by `date` is chronological.
annotation_df["date"] = pd.to_datetime(annotation_df["date"])

### Merge all the relevant tables together.

In [None]:
# Map each generation id to the (shortname, version) of the playlist that
# lists it. If several playlists list the same generation, the last playlist
# in `playlist_df` wins.
gen_to_playlist = {}
for playlist in playlist_df.to_dict("records"):
  label = (playlist["shortname"], playlist["version"])
  gen_to_playlist.update(dict.fromkeys(playlist["generations"], label))

In [None]:
# Attach generation, prompt and decoding-strategy columns to every annotation.
# Each lookup table is indexed by its `pk` and matched against the
# `generation`, `prompt` and `decoding_strategy` columns respectively.
full_df = (
    annotation_df
    .join(generation_df.set_index('pk'), on='generation')
    .join(prompt_df.set_index('pk'), on='prompt')
    .join(decodingstrategy_df.set_index('pk'), on='decoding_strategy')
)

In [None]:
# Look up each annotation's playlist through its generation id and store the
# playlist shortname and version as two new columns. A generation id that is
# missing from `gen_to_playlist` raises KeyError.
playlist_pairs = [gen_to_playlist[gen_id] for gen_id in full_df["generation"]]
playlist_names = [shortname for shortname, _ in playlist_pairs]
playlist_versions = [version for _, version in playlist_pairs]
full_df["playlist_name"] = playlist_names
full_df["playlist_version"] = playlist_versions

In [None]:
# Display the merged annotation table for a visual sanity check.
full_df

# Filtering annotations for users who have agreed to have their data analyzed

In [None]:
SURVEY_RESPONSES_FILE = 'gs://roft_buckups/521_responses.csv'
# Exact column header of the consent question in the survey export.
CONSENT_QUESTION = "Do you agree for the data being collected on this form along with any annotations you make on the RoFT website to be used in an anonymized, aggregated way for research on students' ability to detect machine-generated text? Your answer on this question will not affect your grade."

survey_df = pd.read_csv(SURVEY_RESPONSES_FILE)
# Survey rows whose answer to the consent question is exactly 'No'.
survey_filter_df = survey_df[survey_df[CONSENT_QUESTION] == 'No']


def _normalize_emails(emails):
  """Strip whitespace and lower-case a Series of e-mail addresses."""
  return emails.str.strip().str.lower()


# Addresses of everyone who declined. They are normalized so that differences
# in capitalization or stray spaces between the survey and the user table
# cannot let an opted-out user slip through. Blank addresses are ignored so
# they cannot match users without an e-mail.
opted_out_emails = _normalize_emails(survey_filter_df["Email Address"]).dropna()
opted_out_emails = opted_out_emails[opted_out_emails != ""]

# All the users who did NOT decline.
# NOTE(review): users with no survey response at all are also kept here —
# confirm that this matches the consent policy.
users_filter_df = user_df[~_normalize_emails(user_df.email).isin(opted_out_emails)]

# All the profiles that belong to those users.
profiles_filter_df = profile_df[profile_df.user.isin(users_filter_df.pk)]

# Keep only annotations made by those profiles.
full_df = full_df[full_df.annotator.isin(profiles_filter_df.pk)]

# Analysis

## Mean points

In [None]:
def analyze_per_playlist(version="0.2", annotations=None, playlist_names=None):
  """Summarize annotation scores per playlist for one playlist version.

  Args:
    version: Only annotations whose `playlist_version` equals this value are
      counted.
    annotations: DataFrame with `playlist_name`, `playlist_version` and
      `points` columns. Defaults to the notebook-level `full_df`.
    playlist_names: Iterable of playlist shortnames to report on. Defaults to
      the `shortname` column of the notebook-level `playlist_df`.

  Returns:
    A DataFrame with one row per playlist that has at least one matching
    annotation, in first-appearance order of `playlist_names`, with columns
    `playlist`, `mean score`, `median score`, `fraction_nonzero` and
    `num_annotations`.
  """
  if annotations is None:
    annotations = full_df
  if playlist_names is None:
    playlist_names = playlist_df["shortname"]

  columns = ["playlist", "mean score", "median score",
             "fraction_nonzero", "num_annotations"]
  rows = []
  # dict.fromkeys de-duplicates while keeping a stable order, so the table
  # does not get reshuffled between runs the way iterating a set of strings
  # does.
  for playlist in dict.fromkeys(playlist_names):
    points = annotations.loc[
        (annotations["playlist_name"] == playlist)
        & (annotations["playlist_version"] == version),
        "points"]
    if len(points) == 0:
      # Nothing to summarize; this also avoids dividing by zero below.
      continue

    rows.append({
        "playlist": playlist,
        "mean score": np.mean(points),
        "median score": np.median(points),
        "fraction_nonzero": int((points > 0).sum()) / len(points),
        "num_annotations": len(points),
    })
  return pd.DataFrame(rows, columns=columns)

# Render the per-playlist summary table as this cell's output.
analyze_per_playlist()

In [None]:
def analyze_per_decoding_strat(top_p_values=(0.0, 0.4, 1.0), annotations=None,
                               playlist_names=None):
  """Summarize annotation scores per playlist and top-p decoding value.

  Only annotations whose `dec_strat` is "top-p" are considered. No filter on
  `playlist_version` is applied.

  Args:
    top_p_values: The `dec_strat_value`s to report on.
    annotations: DataFrame with `dec_strat`, `dec_strat_value`,
      `playlist_name` and `points` columns. Defaults to the notebook-level
      `full_df`.
    playlist_names: Iterable of playlist shortnames to report on. Defaults to
      the `shortname` column of the notebook-level `playlist_df`.

  Returns:
    A DataFrame with one row per (playlist, top-p value) combination that has
    at least one annotation, ordered by first appearance of the playlist and
    then by `top_p_values`. The `p-value` column holds the top-p value (it is
    not a statistical p-value).
  """
  if annotations is None:
    annotations = full_df
  if playlist_names is None:
    playlist_names = playlist_df["shortname"]

  columns = ["p-value", "playlist", "mean score", "median score",
             "fraction_nonzero", "num_annotations"]
  top_p_rows = annotations[annotations["dec_strat"] == "top-p"]

  rows = []
  # dict.fromkeys de-duplicates while keeping a stable order, so the table
  # does not get reshuffled between runs the way iterating a set of strings
  # does.
  for playlist in dict.fromkeys(playlist_names):
    in_playlist = top_p_rows[top_p_rows["playlist_name"] == playlist]
    for top_p_value in top_p_values:
      points = in_playlist.loc[
          in_playlist["dec_strat_value"] == top_p_value, "points"]
      if len(points) == 0:
        continue

      rows.append({
          "p-value": top_p_value,
          "playlist": playlist,
          "mean score": np.mean(points),
          "median score": np.median(points),
          "fraction_nonzero": int((points > 0).sum()) / len(points),
          "num_annotations": len(points),
      })
  return pd.DataFrame(rows, columns=columns)

# Render the per-playlist, per-top-p summary table as this cell's output.
analyze_per_decoding_strat()

In [None]:
# Inspect the column names of the merged annotation table.
full_df.columns

In [None]:
# NOTE(review): stray expression that only echoes the list; it matches the
# top-p values iterated in analyze_per_decoding_strat — likely leftover scratch.
[0.0, 0.4, 1.0]

## Annotator skill

In [None]:
# Histogram of each annotator's mean points per annotation.
annotation_df.groupby('annotator')['points'].mean().plot(
    kind='hist',
    figsize=(20, 10),
    title='Achieved Points Distribution',
)

For the annotators who made at least $k$ annotations, plot the mean score of their $n$th annotation for $n = 1, \dots, k$.

In [None]:
# Per-annotator count of non-null values in every column of `annotation_df`
# (so `df["pk"]` is the number of annotations with a pk for that annotator).
df = annotation_df.groupby('annotator').count()


In [None]:
def analyze_progress(k=50, annotations=None, plot=True):
  """Analyze whether annotators improve in aggregate over k annotations.

  For every annotator with at least `k` annotations, take the points of their
  first `k` annotations in chronological order, then average position-wise
  across annotators.

  Args:
    k: Number of leading annotations to consider per annotator.
    annotations: DataFrame with `annotator`, `date` and `points` columns.
      Defaults to the notebook-level `annotation_df`.
    plot: When True, draw the mean score of the n-th annotation for
      n = 1..k with matplotlib.

  Returns:
    A numpy array of shape (num_qualifying_annotators, k) holding each
    annotator's first `k` scores, with annotators in sorted order. If no
    annotator qualifies, an empty (0, k) array is returned and nothing is
    plotted.
  """
  if annotations is None:
    annotations = annotation_df

  # Sort once; groupby keeps the row order within each group, so every group
  # below is already chronological.
  chronological = annotations.sort_values("date")
  all_score_series = [
      group["points"].iloc[:k].tolist()
      for _, group in chronological.groupby("annotator")
      # ">= k": an annotator with exactly k annotations has k scores to offer.
      if len(group) >= k
  ]
  if not all_score_series:
    return np.empty((0, k))

  # The previous version averaged a notebook-level variable that was only
  # assigned from this function's own return value; on a fresh kernel that
  # name did not exist yet. Use the scores collected in this call instead.
  scores = np.array(all_score_series)
  if plot:
    plt.plot(range(1, k + 1), np.mean(scores, axis=0))
    plt.ylabel("Mean score")
    plt.xlabel("$n$th annotation")
    plt.title("Performance over time")
    plt.show()

  return scores

# Run the progress analysis over the first 100 annotations per annotator;
# `s` keeps the array of per-annotator score series that the function returns.
k = 100
s = analyze_progress(k)

### Inter-annotator agreement

For every pair of annotators who annotated the same generation, what fraction guessed the same boundary?

In [None]:
# Group the annotations by the generation they were made on.
annotations_per_gen = annotation_df.groupby('generation')
# Number of annotations with a non-null `points` value per generation.
num_annotations_per_gen = annotations_per_gen.points.count()


In [None]:
def analyze_fraction_agreements(annotations=None):
  """Print how often annotators of the same generation chose the same boundary.

  Only generations with more than one annotation are considered. An
  annotation counts as "in agreement" when at least one other annotation of
  the same generation has the same `boundary` value. Rows with a missing
  `generation` or `boundary` are ignored.

  Args:
    annotations: DataFrame with `generation` and `boundary` columns. Defaults
      to the notebook-level `annotation_df`.
  """
  if annotations is None:
    annotations = annotation_df

  # Number of annotations for every (generation, boundary) combination.
  boundary_counts = annotations.groupby(["generation", "boundary"]).size()

  per_generation = boundary_counts.groupby(level="generation").sum()
  overall_num_annotations = int(per_generation[per_generation > 1].sum())
  # A boundary chosen more than once necessarily belongs to a generation with
  # more than one annotation, so no extra per-generation filter is needed.
  overall_num_agreements = int(boundary_counts[boundary_counts > 1].sum())

  if overall_num_annotations == 0:
    # Avoid dividing by zero when nothing was annotated more than once.
    print("No generation has more than one annotation; the agreement fraction is undefined.")
    return

  print("Out of {} total annotations on generations with >1 annotation, {} were in agreement with another annotation on the true boundary position. That is {}".format(
      overall_num_annotations, overall_num_agreements, overall_num_agreements/overall_num_annotations
  ))
# Print the agreement statistics for all annotations.
analyze_fraction_agreements()

# TODO: Figure out what the baseline of random guessing would be.

In [None]:
# Look up a single user record by primary key.
# NOTE(review): this renders one user's row, including the `email` column, in
# the notebook output — confirm that is acceptable before sharing.
user_df[user_df["pk"] == 2697]

## Profile Statistics

In [None]:
# Mean of `is_temporary` over all profiles; if the column is boolean/0-1 this
# is the fraction of temporary profiles.
profile_df.is_temporary.mean()