# Lex Rosetta: Explore Cleaned Data Set for ICAIL 2021

In [1]:
import os
import pandas as pd
from pathlib import Path

## Create Pandas Data Frame

In [2]:
# All paths are resolved relative to the current working directory.
PWD = Path()
DATASETS_DIR = PWD / 'data'

# Annotator selection per dataset. A truthy flag (1) means the annotator's
# CSV files are read when the annotations are loaded; a falsy flag (0)
# means that annotator is skipped.
INCLUDE_CONFIG = {
    'Canada-EN-1':         {'annotator-1': 0, 'annotator-2': 1},
    'Czech_Republic-CZ-1': {'annotator-1': 1, 'annotator-2': 0},
    'France-FR-1':         {'annotator-1': 1, 'annotator-2': 0},
    'Germany-DE-1':        {'annotator-1': 1},
    'Italy-IT-1':          {'annotator-1': 1},
    'Poland-PL-1':         {'annotator-1': 1},
    'United_States-EN-1':  {'annotator-1': 0, 'annotator-2': 1},
    'United_States-EN-2':  {'annotator-1': 1},
}

In [46]:
# Load the annotation CSVs of every included annotator: the main export
# followed by its "-excl" companion file. Each loaded frame gets a 'Dataset'
# column holding the name of the dataset it was read from.
dataset_dfs = []
for dataset_name, annotators in INCLUDE_CONFIG.items():
    included_annotators = [name for name, flag in annotators.items() if flag]
    for annotator in included_annotators:
        file_names = (f'{annotator}-ICAIL2021.csv', f'{annotator}-ICAIL2021-excl.csv')
        for file_name in file_names:
            frame = pd.read_csv(DATASETS_DIR / dataset_name / file_name)
            frame['Dataset'] = [dataset_name] * len(frame)
            dataset_dfs.append(frame)

# Stack everything into one frame (each file's original row index is kept).
data_df = pd.concat(dataset_dfs)

# Second view without the 'L2 Introductory Summary' rows, to avoid double counting.
not_intro_summary = data_df['Type'] != 'L2 Introductory Summary'
data_df_no_is = data_df[not_intro_summary]

In [51]:
# Preview the combined frame with the 'L2 Introductory Summary' rows filtered out.
data_df_no_is

Unnamed: 0,Type,Document,Ordering,Text,Dataset
0,L1 Analysis,Canada-EN-1-1.txt,0,Having regard for the governing standard of ap...,Canada-EN-1
1,L1 Analysis,Canada-EN-1-1.txt,1,"CanLII 57 (SCC), [1987] 1 S.C.R. 801 at 824 an...",Canada-EN-1
2,L1 Analysis,Canada-EN-1-1.txt,2,"Henry, 1990 CanLII 2648 (SK CA), [1990] 5 W.W....",Canada-EN-1
3,L1 Analysis,Canada-EN-1-1.txt,3,In coming to this conclusion we note that the ...,Canada-EN-1
4,L1 Analysis,Canada-EN-1-1.txt,4,He also found the mother\'s interpretation of ...,Canada-EN-1
...,...,...,...,...,...
3050,L0 Heading,United_States-EN-2-99.txt,0,BACKGROUND,United_States-EN-2
3051,L0 Heading,United_States-EN-2-99.txt,0,JURISDICTION AND STANDARD OF REVIEW,United_States-EN-2
3052,L0 Heading,United_States-EN-2-99.txt,0,DISCUSSION,United_States-EN-2
3053,L0 Heading,United_States-EN-2-99.txt,0,CONCLUSION,United_States-EN-2


## General Sentence Statistics

### Document Count

In [52]:
# Documents per dataset: count the rows of every (Dataset, Document) pair,
# then count how many such pairs each dataset contains.
(
    data_df_no_is
    .groupby(['Dataset', 'Document'])
    .size()
    .to_frame()
    .groupby(level='Dataset')
    .size()
)

Dataset
Canada-EN-1            100
Czech_Republic-CZ-1    100
France-FR-1            100
Germany-DE-1           104
Italy-IT-1             100
Poland-PL-1            101
United_States-EN-1     102
United_States-EN-2     100
dtype: int64

### Sentence Count

In [61]:
# Total rows per dataset: per-document row counts, summed within each dataset.
(
    data_df_no_is
    .groupby(['Dataset', 'Document'])
    .size()
    .to_frame()
    .groupby(level='Dataset')
    .sum()
)

Unnamed: 0_level_0,0
Dataset,Unnamed: 1_level_1
Canada-EN-1,12168
Czech_Republic-CZ-1,11283
France-FR-1,5507
Germany-DE-1,10724
Italy-IT-1,4534
Poland-PL-1,9791
United_States-EN-1,24898
United_States-EN-2,10756


In [62]:
# Mean rows per document: per-document row counts, averaged within each dataset.
(
    data_df_no_is
    .groupby(['Dataset', 'Document'])
    .size()
    .to_frame()
    .groupby(level='Dataset')
    .mean()
)

Unnamed: 0_level_0,0
Dataset,Unnamed: 1_level_1
Canada-EN-1,121.68
Czech_Republic-CZ-1,112.83
France-FR-1,55.07
Germany-DE-1,103.115385
Italy-IT-1,45.34
Poland-PL-1,96.940594
United_States-EN-1,244.098039
United_States-EN-2,107.56


In [63]:
# Smallest document per dataset: minimum of the per-document row counts.
(
    data_df_no_is
    .groupby(['Dataset', 'Document'])
    .size()
    .to_frame()
    .groupby(level='Dataset')
    .min()
)

Unnamed: 0_level_0,0
Dataset,Unnamed: 1_level_1
Canada-EN-1,8
Czech_Republic-CZ-1,10
France-FR-1,8
Germany-DE-1,12
Italy-IT-1,10
Poland-PL-1,4
United_States-EN-1,34
United_States-EN-2,24


In [64]:
# Largest document per dataset: maximum of the per-document row counts.
(
    data_df_no_is
    .groupby(['Dataset', 'Document'])
    .size()
    .to_frame()
    .groupby(level='Dataset')
    .max()
)

Unnamed: 0_level_0,0
Dataset,Unnamed: 1_level_1
Canada-EN-1,888
Czech_Republic-CZ-1,701
France-FR-1,583
Germany-DE-1,806
Italy-IT-1,207
Poland-PL-1,1232
United_States-EN-1,1121
United_States-EN-2,397


## Number of Sentences per Dataset and Type

In [65]:
# Row counts for every (Dataset, Type) combination, computed on the full frame.
# NOTE: 'L2 Introductory Summary' rows are still included here; they need to be
# subtracted from 'L1 Analysis' to avoid double-counting.
(
    data_df
    .groupby(by=['Dataset', 'Type'])
    .size()
)

Dataset              Type                   
Canada-EN-1          L0 Heading                   438
                     L0 Out of Scope              873
                     L1 Analysis                 7210
                     L1 Background               3319
                     L2 Introductory Summary       20
                     L2 Outcome                   328
Czech_Republic-CZ-1  L0 Heading                  1257
                     L0 Out of Scope              945
                     L1 Analysis                 5424
                     L1 Background               3379
                     L2 Introductory Summary        2
                     L2 Outcome                   278
France-FR-1          L0 Heading                   220
                     L0 Out of Scope             3811
                     L1 Analysis                  631
                     L1 Background                485
                     L2 Outcome                   360
Germany-DE-1         L0 Heading      

## Number of Sentences per Type

In [66]:
# Row counts per Type across all datasets, computed on the full frame.
# NOTE: 'L2 Introductory Summary' rows are still included here; they need to be
# subtracted from 'L1 Analysis' to avoid double-counting.
(
    data_df
    .groupby('Type')
    .size()
)

Type
L0 Heading                  5534
L0 Out of Scope             9588
L1 Analysis                49337
L1 Background              22982
L2 Introductory Summary     1174
L2 Outcome                  2220
dtype: int64

In [67]:
# Summary statistics of the full frame (the recorded output covers only 'Ordering').
# NOTE: 'L2 Introductory Summary' rows are still included in these figures; they
# need to be subtracted to avoid double-counting.
data_df.describe()

Unnamed: 0,Ordering
count,90835.0
mean,105.642605
std,146.48381
min,0.0
25%,14.0
50%,52.0
75%,134.0
max,1079.0
