In [1]:
from collections import ChainMap
import json
import os
import re

import numpy as np
import pandas as pd

from valueconsistency import *
from measures import *
from plots import *


INFO:datasets:PyTorch version 2.2.2 available.


In [2]:
# Step up one directory so the relative 'data/...' paths used below resolve.
# NOTE(review): presumably the notebook sits one level below the project root - confirm.
# This is not idempotent: re-running the cell moves up one more level each time.
os.chdir(os.pardir)

In [3]:
def process_paraphrase_validate_cloud_research(filename):
    """Flatten a Cloud Research results CSV into one row per answered question.

    Each CSV row carries two JSON-encoded cells: 'Task Data' (a 'RowData' list
    of {'ColumnHeader', 'CellData'} entries) and 'Submitted Data' (whose
    ['Data']['taskData'] mapping holds the submitted values). Both are merged
    into a single mapping per task (task-data values win on duplicate keys),
    and every answer key q_0, q_1, ... becomes one output record.

    Parameters
    ----------
    filename : str or path-like
        CSV file with 'Task Data' and 'Submitted Data' columns. Rows missing
        either value are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns 'answer', 'topic', 'tm' (cast to float) and, when a non-empty
        'q_<i>_question' value exists, 'original'.

    Raises
    ------
    KeyError
        If a merged task lacks 'topic', 'tm' or one of the keys q_0..q_<n-1>
        (answer keys are assumed to be numbered contiguously from 0), or if
        no answers were found at all (the 'tm' column is then missing).
    ValueError, SyntaxError
        If a topic cell starts with '[' but is not a valid Python literal.
    """
    import ast  # function-scope import: only used to parse list-valued topic cells

    raw = pd.read_csv(filename)
    d = raw[['Task Data', 'Submitted Data']].dropna().reset_index(drop=True)
    # Decode one column at a time with Series.map, which exists in every pandas
    # release (the element-wise DataFrame.map only arrived in pandas 2.1).
    for column in ('Task Data', 'Submitted Data'):
        d[column] = d[column].map(json.loads)

    d['Submitted Data'] = d['Submitted Data'].apply(lambda payload: payload['Data']['taskData'])

    def flatten_task_data(task):
        # One {header: cell} dict per column; ChainMap keeps the first
        # occurrence when a header appears more than once.
        cells = [{col['ColumnHeader']: col['CellData']} for col in task['RowData']]
        return dict(ChainMap(*cells))

    d['Task Data'] = d['Task Data'].apply(flatten_task_data)

    # `\d+` (not `\d`) so that q_10, q_11, ... are counted as answers too.
    answer_key = re.compile(r'q_\d+')

    records = []
    for task_data, submitted in zip(d['Task Data'], d['Submitted Data']):
        # Task data is listed first, so it takes precedence on key collisions.
        rows = dict(ChainMap(task_data, submitted))
        num_qs = sum(1 for key in rows if answer_key.fullmatch(key))

        topics = rows['topic']
        if topics.startswith('['):
            # A stringified Python list with one topic per question.
            # literal_eval only accepts literals, unlike eval, which would
            # execute arbitrary code found in the CSV.
            topics = ast.literal_eval(topics)

        for i in range(num_qs):
            record = {'answer': rows[f'q_{i}'],
                      'topic': topics if isinstance(topics, str) else topics[i],
                      'tm': rows['tm']}
            # Keep the original question text only when it is present and non-empty.
            if rows.get(f'q_{i}_question'):
                record['original'] = rows[f'q_{i}_question']
            records.append(record)

    result = pd.DataFrame(records)
    result['tm'] = result['tm'].astype('float')
    return result

In [30]:
# Build the per-(controversial, language, country) validation table in `stats`,
# plus running totals and the raw per-rating scores used by the tests below.
controversial_dir = 'data/validate/controversial'
paraphrase_dir = 'data/validate/paraphrase/cloud_research'

# Answer options ordered from most to least controversial; an answer's position
# in this list is its numeric score (0-3).
controversy_scale = ["Very controversial", "Somewhat controversial",
                     "Not very controversial", "Not at all controversial"]
# Answers that count as a "controversial" vote.
controversial_answers = ['Very controversial', 'Somewhat controversial']
# Each condition has three annotation files, suffixed _1, _2 and _3.
n_annotation_files = 3

stats = []
total_equivalent_n = 0
total_equivalent = 0

total_controversial_n = 0
total_controversial = 0

total_uncontroversial_n = 0
total_uncontroversial = 0

all_controversial = []
all_uncontroversial = []


for controversial in [True, False]:
    for language, country in [('english', 'us'), ('german', 'germany'),
                              ('chinese', 'china'), ('japanese', 'japan')]:

        # Both LaTeX macros are written with an escaped backslash; the bare
        # "\c" used previously is an invalid escape sequence that newer Python
        # versions warn about.
        controversial_str = '\\cmark' if controversial else '\\xmark'
        result = {
            'Controversial' : controversial_str,
            'Language' : language.capitalize(),
            'Country' : country.capitalize() if country != 'us' else "U.S.",
        }
        # Shared file stem for the paraphrase and controversy files of this condition.
        name = f'{"controversial" if controversial else "uncontroversial"}_{language}_{country}'

        ### Paraphrase stats (skipped when no paraphrase file exists for the condition)
        filename = f'{paraphrase_dir}/{name}.csv'
        if os.path.exists(filename):
            df = process_paraphrase_validate_cloud_research(filename)
            df['questions_equivalent'] = df['answer']
            df.to_csv(f'data/validate/paraphrase/{name}.csv')

            n = len(df)
            # Count matches directly: value_counts()['yes'] raises KeyError
            # when a file contains no 'yes' answer at all.
            n_equivalent = int((df['questions_equivalent'] == 'yes').sum())
            total_equivalent_n += n
            total_equivalent += n_equivalent

            result['# (%) Equivalent'] = f"{n_equivalent} / {n} ({n_equivalent / n:.0%})"

        #### Controversial stats
        dfs = []
        for i in range(n_annotation_files):
            df = process_paraphrase_validate_cloud_research(f'{controversial_dir}/cloud_research/{name}_{i + 1}.csv')
            # Numeric score per rating; an answer outside the scale raises ValueError.
            controversial_value = df['answer'].apply(lambda answer: controversy_scale.index(answer))
            if controversial:
                all_controversial += controversial_value.to_list()
            else:
                all_uncontroversial += controversial_value.to_list()
            df['controversial'] = df['answer'].isin(controversial_answers)
            dfs.append(df)

        # Collect all ratings per topic, then label a topic controversial when
        # strictly more than half of its ratings say so.
        df = pd.concat(dfs).groupby('topic').agg(lambda x: list(x)).reset_index()
        df['controversial'] = df['controversial'].apply(lambda votes: np.mean(votes) > .5)
        df.to_csv(f'{controversial_dir}/{name}.csv')

        n = len(df)
        n_controversial = df['controversial'].sum()
        if controversial:
            total_controversial_n += n
            total_controversial += n_controversial
        else:
            total_uncontroversial_n += n
            total_uncontroversial += n_controversial
        result['# (%) Controversial'] =  f"{n_controversial} / {n} ({n_controversial / n:.0%})"

        ### Record the stats
        stats.append(result)

cont_str = f"{total_controversial} / {total_controversial_n} ({total_controversial / total_controversial_n:.0%})"
uncont_str = f"{total_uncontroversial} / {total_uncontroversial_n} ({total_uncontroversial / total_uncontroversial_n:.0%})"

# Totals row. NOTE(review): the raw newline between the two controversy totals
# is emitted verbatim by to_latex() and splits the table row across two lines.
stats.append({'# (%) Equivalent' : f"{total_equivalent} / {total_equivalent_n} ({total_equivalent / total_equivalent_n:.0%})",
              '# (%) Controversial' : f"{cont_str}\n{uncont_str}"
             })

In [35]:
# Total number of individual controversy ratings across both topic groups.
controversial_n = sum(map(len, (all_uncontroversial, all_controversial)))
controversial_n

546

In [27]:
# Render the collected per-condition stats as a LaTeX tabular.
stats_table = pd.DataFrame(stats)
print(stats_table.to_latex())

\begin{tabular}{llllll}
\toprule
 & Controversial & Language & Country & # (%) Controversial & # (%) Equivalent \\
\midrule
0 & \cmark & English & U.S. & 22 / 28 (79%) & NaN \\
1 & \cmark & German & Germany & 19 / 28 (68%) & 100 / 137 (73%) \\
2 & \cmark & Chinese & China & 16 / 22 (73%) & 70 / 101 (69%) \\
3 & \cmark & Japanese & Japan & 19 / 21 (90%) & 54 / 84 (64%) \\
4 & \xmark & English & U.S. & 11 / 20 (55%) & NaN \\
5 & \xmark & German & Germany & 7 / 18 (39%) & 51 / 68 (75%) \\
6 & \xmark & Chinese & China & 7 / 23 (30%) & 59 / 87 (68%) \\
7 & \xmark & Japanese & Japan & 12 / 20 (60%) & 55 / 85 (65%) \\
8 & NaN & NaN & NaN & 76 / 99 (77%)
37 / 81 (46%) & 389 / 562 (69%) \\
\bottomrule
\end{tabular}



In [29]:
# Rich display of the summary table: one row per dict appended to `stats`.
pd.DataFrame(stats)

Unnamed: 0,Controversial,Language,Country,# (%) Controversial,# (%) Equivalent
0,\cmark,English,U.S.,22 / 28 (79%),
1,\cmark,German,Germany,19 / 28 (68%),100 / 137 (73%)
2,\cmark,Chinese,China,16 / 22 (73%),70 / 101 (69%)
3,\cmark,Japanese,Japan,19 / 21 (90%),54 / 84 (64%)
4,\xmark,English,U.S.,11 / 20 (55%),
5,\xmark,German,Germany,7 / 18 (39%),51 / 68 (75%)
6,\xmark,Chinese,China,7 / 23 (30%),59 / 87 (68%)
7,\xmark,Japanese,Japan,12 / 20 (60%),55 / 85 (65%)
8,,,,76 / 99 (77%)\n37 / 81 (46%),389 / 562 (69%)


In [32]:
import scipy.stats

# Independent two-sample t-test on the numeric rating scores (position in the
# answer scale) of the uncontroversial vs. controversial topic groups.
rating_ttest = scipy.stats.ttest_ind(all_uncontroversial, all_controversial)
rating_ttest

TtestResult(statistic=7.477942000824478, pvalue=3.0434879452707325e-13, df=544.0)

In [34]:
# Two-sided binomial test of the pooled share of 'yes' (equivalent) answers
# against a 50% null proportion.
equivalence_test = scipy.stats.binomtest(
    k=total_equivalent,
    n=total_equivalent_n,
    p=.5,
    alternative='two-sided',
)
equivalence_test

BinomTestResult(k=389, n=562, alternative='two-sided', statistic=0.6921708185053381, pvalue=4.1259838577612583e-20)