In [1]:
import pandas as pd
import json
from os import path

In [2]:
# Read the automatic-metric scores; the file has one column per metric.
scores_csv_file = "scores.csv"
auto_df = pd.read_csv(scores_csv_file)

# Columns that identify a row and are carried over unchanged, versus the
# metric columns that are unpivoted into (metric, score) pairs.
auto_id_columns = ["essay_id", "correction_style", "system"]
auto_metric_columns = ["gleu", "precision", "recall", "f0.5", "scribendi_score"]

# Wide -> long: each metric column becomes its own row.
auto_df_long = pd.melt(
    auto_df,
    id_vars=auto_id_columns,
    value_vars=auto_metric_columns,
    var_name="metric",
    value_name="score",
)

In [3]:
# Gather the manual-evaluation scores into the same long layout as
# `auto_df_long` (essay_id, correction_style, system, metric, score).
#
# Files are read from manual_evaluation/<system>/<correction_style>.json; each
# file is a JSON object whose "evaluations" entry is a list of per-essay
# records carrying an "id" plus one value per manual metric.
manual_evaluation_dir = "manual_evaluation/"
d_key = "evaluations"
manual_metrics = ["grammaticality", "fluency", "meaning_preservation"]

manual_eval_dicts = []

# Only the styles/systems that also occur in the automatic scores are loaded.
styles = auto_df_long["correction_style"].unique().tolist()
teams = auto_df_long["system"].unique().tolist()

for team in teams:
    team_dir = path.join(manual_evaluation_dir, team)
    for style in styles:
        style_file_path = path.join(team_dir, f"{style}.json")
        # JSON is UTF-8 encoded; be explicit so the result does not depend on
        # the platform's default locale encoding.
        with open(style_file_path, encoding="utf-8") as f:
            evaluation_data = json.load(f)
        for d in evaluation_data[d_key]:
            row_key = {
                "essay_id": d["id"],
                "correction_style": style,
                "system": team,
            }
            # One row per individual manual metric ...
            for metric in manual_metrics:
                manual_eval_dicts.append(
                    {**row_key, "metric": metric, "score": d[metric]}
                )
            # ... plus one row with their mean. The divisor follows the metric
            # list so the two cannot drift apart.
            total = sum(d[metric] for metric in manual_metrics)
            manual_eval_dicts.append(
                {
                    **row_key,
                    "metric": "manual_evaluation",
                    "score": total / len(manual_metrics),
                }
            )

manual_df_long = pd.DataFrame(manual_eval_dicts)

In [4]:
# Stack the automatic and manual score tables into a single long table.
# Row labels are kept from the inputs, so they are not unique afterwards.
dfs = [auto_df_long, manual_df_long]
df_long = pd.concat(dfs)

# Replace the original essay identifiers with consecutive integers starting at
# 1, assigned in order of first appearance.
essay_ids = df_long["essay_id"].unique().tolist()
essay_id_subs = dict(zip(essay_ids, range(1, len(essay_ids) + 1)))

df_long["essay_id"] = df_long["essay_id"].map(essay_id_subs)

print(df_long)


      essay_id correction_style      system                metric      score
0            1          minimal     UAM-CSI                  gleu  37.000000
1            2          minimal     UAM-CSI                  gleu  49.210000
2            3          minimal     UAM-CSI                  gleu  28.940000
3            4          minimal     UAM-CSI                  gleu  19.600000
4            5          minimal     UAM-CSI                  gleu  62.700000
...        ...              ...         ...                   ...        ...
1195        49          fluency  Viking-13B     manual_evaluation   2.666667
1196        50          fluency  Viking-13B        grammaticality   3.000000
1197        50          fluency  Viking-13B               fluency   2.000000
1198        50          fluency  Viking-13B  meaning_preservation   2.000000
1199        50          fluency  Viking-13B     manual_evaluation   2.333333

[2700 rows x 5 columns]


In [5]:
# Presentation names for the columns of the exported table.
output_headers = {
    "essay_id": "Essay ID",
    "correction_style": "Edit Style",
    "system": "System",
    "metric": "Metric",
    "score": "Score",
}

# Rebind to a renamed copy instead of mutating in place, so frames displayed
# by earlier cells are not changed behind the reader's back.
df_long = df_long.rename(columns=output_headers)

# Presentation names for the edit-style values.
output_edit_styles = {
    "minimal": "Minimal",
    "fluency": "Fluency",
}
print(df_long)
# `Series.map(dict)` turns every value missing from the dict into NaN. Falling
# back to the original value keeps this cell safe to re-run (labels already
# capitalised) and keeps an unexpected style visible instead of erasing it.
df_long["Edit Style"] = df_long["Edit Style"].map(
    lambda style_name: output_edit_styles.get(style_name, style_name)
)
print(df_long)

      Essay ID Edit Style      System                Metric      Score
0            1    minimal     UAM-CSI                  gleu  37.000000
1            2    minimal     UAM-CSI                  gleu  49.210000
2            3    minimal     UAM-CSI                  gleu  28.940000
3            4    minimal     UAM-CSI                  gleu  19.600000
4            5    minimal     UAM-CSI                  gleu  62.700000
...        ...        ...         ...                   ...        ...
1195        49    fluency  Viking-13B     manual_evaluation   2.666667
1196        50    fluency  Viking-13B        grammaticality   3.000000
1197        50    fluency  Viking-13B               fluency   2.000000
1198        50    fluency  Viking-13B  meaning_preservation   2.000000
1199        50    fluency  Viking-13B     manual_evaluation   2.333333

[2700 rows x 5 columns]
      Essay ID Edit Style      System                Metric      Score
0            1    Minimal     UAM-CSI               

In [6]:
print(df_long["Metric"].unique())

# Presentation labels for the metrics. A label of the form "Group;Name" is
# split below into the "Metric" (group) and "Submetric" (name) columns.
output_metrics = {
    "gleu": "GLEU",
    "precision": "ERRANT;Precision",
    "recall": "ERRANT;Recall",
    "f0.5": r"ERRANT;$\text{F}_{0.5}$-Score",
    "scribendi_score": "Scribendi Score",
    "grammaticality": "SOME;Grammaticality",
    "fluency": "SOME;Fluency",
    "meaning_preservation": "SOME;Meaning Preservation",
    "manual_evaluation": "SOME;Total",
}

# Validate before touching `df_long`: a metric without a label would silently
# become NaN. This also catches an accidental second run of this cell, when
# the column already holds the presentation labels.
metric_labels = df_long["Metric"].map(output_metrics)
unmapped_metrics = (
    df_long.loc[metric_labels.isna().to_numpy(), "Metric"].unique().tolist()
)
if unmapped_metrics:
    raise ValueError(
        f"No output label defined for metric(s): {unmapped_metrics}"
    )

df_long["Metric"] = metric_labels
df_long[["Metric", "Submetric"]] = df_long["Metric"].str.split(";", expand=True)
# Labels without a ";" have no submetric; store an empty string, not a null.
df_long["Submetric"] = df_long["Submetric"].str.strip().fillna("")

# Final column order of the exported table.
df_long = df_long[
    [
        "Essay ID",
        "Edit Style",
        "System",
        "Metric",
        "Submetric",
        "Score",
    ]
]

# `info()` prints its report itself and returns None, so wrapping it in
# `print` only appended a stray "None" line.
df_long.info()

['gleu' 'precision' 'recall' 'f0.5' 'scribendi_score' 'grammaticality'
 'fluency' 'meaning_preservation' 'manual_evaluation']
<class 'pandas.core.frame.DataFrame'>
Index: 2700 entries, 0 to 1199
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Essay ID    2700 non-null   int64  
 1   Edit Style  2700 non-null   object 
 2   System      2700 non-null   object 
 3   Metric      2700 non-null   object 
 4   Submetric   2700 non-null   object 
 5   Score       2700 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 147.7+ KB
None


In [7]:
# Export the combined long-format table; the row index is not written.
file_name = "scores_long.csv"
df_long.to_csv(path_or_buf=file_name, index=False)