# Exploratory Data Analysis

In this notebook, we explored the possibility of training models on specific raters. Ultimately, we decided there would not be enough data from a single rater to train an effective model.

In [1]:
# Import libraries for dataframes, numerical arrays, and inter-rater agreement metrics
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Enables inline display of plots within the Python Notebook (instead of having them pop up on new windows)
%matplotlib inline

# Display figures the same way they will be saved.
%config InlineBackend.print_figure_kwargs = {'bbox_inches': 'tight'}

# Import Python libraries for plotting
import seaborn as sns
# Use seaborn's "white" style (no background grid) for all figures.
sns.set_theme(style="white")
import matplotlib.pyplot as plt
import matplotlib as mpl
# Override matplotlib defaults after the seaborn theme has been applied:
# render figures at 300 dpi and use a serif font family.
plt.rcParams.update({
    'figure.dpi': 300,
    "font.family": "serif",
})

## Load Data

Anonymize the rater names if an anonymized version of the dataset is not found on disk.

In [11]:
def anonymize(
    original_df_path: str = "../data/All_adjudicated_ELL_data_1022.csv",
    out_df_path: str = "../data/both_raters_anonymized_1022.csv",
) -> pd.DataFrame:
    """Replace rater names with generated pseudonyms and save the result.

    Reads the CSV at ``original_df_path``, replaces every value in the
    ``Rater_1`` and ``Rater_2`` columns with a generated first name, writes
    the result to ``out_df_path`` (without the index), and returns it.

    The same original name always maps to the same pseudonym across both
    columns, and distinct original names receive distinct pseudonyms, so
    per-rater counts are preserved.

    Args:
        original_df_path: Path to the CSV containing the real rater names.
            It must have ``Rater_1`` and ``Rater_2`` columns.
        out_df_path: Path where the anonymized CSV is written.

    Returns:
        The anonymized dataframe. ``Rater_1`` and ``Rater_2`` are its last
        two columns; all other columns keep their original order.

    Raises:
        FileNotFoundError: If ``original_df_path`` does not exist.
        KeyError: If either rater column is missing from the input.
    """
    # Imported locally: these are only needed when the anonymized CSV has to
    # be (re)generated, so the rest of the notebook does not require Faker.
    from functools import cache

    from faker import Faker

    fake = Faker()
    # Fixed seed so the name -> pseudonym mapping is reproducible.
    Faker.seed(0)

    @cache
    def replace_name(orig_name: str) -> str:
        # `cache` guarantees one pseudonym per original name; `fake.unique`
        # guarantees two different raters never share a pseudonym.
        return fake.unique.first_name()

    orig_df = pd.read_csv(original_df_path)

    # Drop the real names first, then append the pseudonymized columns so
    # they end up as the last two columns of the output.
    out_df = orig_df.drop(columns=["Rater_1", "Rater_2"])
    out_df["Rater_1"] = orig_df["Rater_1"].map(replace_name)
    out_df["Rater_2"] = orig_df["Rater_2"].map(replace_name)

    # Honor the caller-supplied destination instead of a hard-coded path.
    out_df.to_csv(out_df_path, index=False)

    return out_df

# Reuse the anonymized CSV when it is already on disk; otherwise build it
# from the original data (anonymize() also writes the CSV for later runs).
ANONYMIZED_CSV_PATH = "../data/both_raters_anonymized_1022.csv"
try:
    df = pd.read_csv(ANONYMIZED_CSV_PATH)
except FileNotFoundError:
    df = anonymize()

### Confirm that Overall_1 is not a simple average of the other scores.
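`Overall_1` should be a human-generated holistic score, so we expect it to differ from the rounded mean of the six analytic scores for at least some essays.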

In [12]:
# Show every essay where rater 1's holistic score differs from the rounded
# mean of rater 1's six analytic scores.
# Note: Series.round() rounds exact halves to the nearest even integer
# (e.g. 2.5 -> 2, 3.5 -> 4), so ties may land on either side.
(
    df[["Overall_1"]]
    .assign(
        Average_1=lambda _: (
            df[
                [
                    "Cohesion_1",
                    "Syntax_1",
                    "Vocabulary_1",
                    "Phraseology_1",
                    "Grammar_1",
                    "Conventions_1",
                ]
            ]
            .mean(axis=1)
            .round()
            .astype(int)
        )
    )
    .loc[lambda scores: scores["Overall_1"] != scores["Average_1"]]
)

Unnamed: 0,Overall_1,Average_1
0,3,4
6,3,4
13,3,2
35,3,2
38,2,3
...,...,...
8851,3,4
8859,3,2
8860,3,4
8865,2,3


In [13]:
# Pool both rater columns into one series: how many ratings there are, how
# many distinct raters, and which rater appears most often.
pd.concat([df[rater_col] for rater_col in ("Rater_1", "Rater_2")]).describe()

count       17760
unique         25
top       William
freq         2095
dtype: object

In [14]:
# Distribution of the number of ratings contributed by each rater
# (both rater columns pooled).
(
    pd.concat([df[rater_col] for rater_col in ("Rater_1", "Rater_2")])
    .value_counts()
    .describe()
)

count      25.000000
mean      710.400000
std       571.500583
min         2.000000
25%       225.000000
50%       605.000000
75%      1062.000000
max      2095.000000
Name: count, dtype: float64

In [None]:
# Count essays per unordered rater pair, i.e. (A, B) and (B, A) are the same
# pair. frozenset is used instead of set because value_counts() has to hash
# each value and plain sets are unhashable (TypeError).
display(
    pd.Series(df[["Rater_1", "Rater_2"]].values.tolist())
    .apply(frozenset)
    .value_counts()
)

In [None]:
# For each first rater, count how often they were paired with each second
# rater. To list every row instead of pandas' truncated view, wrap the call
# in `with pd.option_context("display.max_rows", None):`.
display(df.groupby(["Rater_1"])["Rater_2"].value_counts())