In [1]:
from datasets import load_dataset
import pandas as pd

In [2]:
# Load the "ecthr_cases" dataset in its "violation-prediction" configuration.
echr = load_dataset(path="ecthr_cases", name="violation-prediction")

In [3]:
def _join_facts(example):
    """Merge one example's list of fact paragraphs into a newline-separated string."""
    return {"text": "\n".join(example["facts"])}


def _binarize_labels(batch):
    """For a batch, turn each example's labels into 1 when truthy, otherwise 0."""
    return {'labels': [1 if case_labels else 0 for case_labels in batch['labels']]}


# Apply both transformations to each split: first the per-example text join,
# then the batched label binarisation (which overwrites the 'labels' column).
train_dataset, val_dataset, test_dataset = [
    echr[split_name].map(_join_facts).map(_binarize_labels, batched=True)
    for split_name in ('train', 'validation', 'test')
]

In [4]:
# Training frame: 'text' and 'result' come from the processed split, while
# 'labels' and 'facts' are read from the untouched echr['train'] split.
_train_columns = {
    'text': train_dataset['text'],
    'result': train_dataset['labels'],
    'labels': echr['train']['labels'],
    'facts': echr['train']['facts'],
}
train_df = pd.DataFrame(data=_train_columns)

In [5]:
def _result_name(value):
    """Return 'VIOLATED' for 1 (or an already-mapped 'VIOLATED'), else 'NON_VIOLATED'."""
    return 'VIOLATED' if value == 1 or value == 'VIOLATED' else 'NON_VIOLATED'


# Replace the 0/1 outcome with a readable class name. Already-mapped values
# pass through unchanged, so running this cell a second time no longer
# relabels every row as NON_VIOLATED.
train_df['result'] = train_df['result'].map(_result_name)

In [6]:
# Per-column non-null counts for the training rows whose result is VIOLATED.
train_df.loc[train_df['result'].eq('VIOLATED')].count()

text      8238
result    8238
labels    8238
facts     8238
dtype: int64

In [7]:
# Validation frame: processed text/outcome plus the raw labels and facts
# taken from echr['validation'].
_val_columns = {
    'text': val_dataset['text'],
    'result': val_dataset['labels'],
    'labels': echr['validation']['labels'],
    'facts': echr['validation']['facts'],
}
val_df = pd.DataFrame(data=_val_columns)
# Outcome 1 becomes 'VIOLATED'; anything else becomes 'NON_VIOLATED'.
val_df['result'] = val_df['result'].eq(1).map({True: 'VIOLATED', False: 'NON_VIOLATED'})

In [8]:
# Per-column non-null counts for the validation rows whose result is VIOLATED.
val_df.loc[val_df['result'].eq('VIOLATED')].count()

text      852
result    852
labels    852
facts     852
dtype: int64

In [9]:
# Test frame: processed text/outcome plus the raw labels and facts taken
# from echr['test'].
_test_columns = {
    'text': test_dataset['text'],
    'result': test_dataset['labels'],
    'labels': echr['test']['labels'],
    'facts': echr['test']['facts'],
}
test_df = pd.DataFrame(data=_test_columns)
# Outcome 1 becomes 'VIOLATED'; anything else becomes 'NON_VIOLATED'.
test_df['result'] = test_df['result'].eq(1).map({True: 'VIOLATED', False: 'NON_VIOLATED'})

In [10]:
# Per-column non-null counts for the test rows whose result is VIOLATED.
test_df.loc[test_df['result'].eq('VIOLATED')].count()

text      865
result    865
labels    865
facts     865
dtype: int64

In [11]:
# Add a whitespace-token count of 'text' to each of the three frames.
for _frame in (train_df, test_df, val_df):
    _frame['words_count'] = _frame['text'].str.split().map(len)

In [12]:
# Stack the three frames row-wise (train, then test, then validation).
# ignore_index is not set, so each frame keeps its own index labels and the
# labels repeat across the combined frame.
total_df = pd.concat(objs=[train_df, test_df, val_df], axis='index')

In [13]:
# Draw 800 NON_VIOLATED cases from the combined frame. The fixed random_state
# makes the draw repeatable across kernel restarts (it was unseeded before).
non_violated_df = total_df[total_df['result'] == 'NON_VIOLATED'].sample(n=800, random_state=42)

In [14]:
# Show the sampled NON_VIOLATED frame (the notebook renders a truncated preview).
non_violated_df

Unnamed: 0,text,result,labels,facts,words_count
4151,7. The applicant was born in 1956 and is curr...,NON_VIOLATED,[],[7. The applicant was born in 1956 and is cur...,3645
1388,7. The applicant was born in 1941 and lives i...,NON_VIOLATED,[],[7. The applicant was born in 1941 and lives ...,1057
274,4. The applicant was born in 1966 and lives i...,NON_VIOLATED,[],[4. The applicant was born in 1966 and lives ...,4169
797,5. The applicant company is a legal entity re...,NON_VIOLATED,[],[5. The applicant company is a legal entity r...,2785
8665,5. The applicant was born in 1946 and lives i...,NON_VIOLATED,[],[5. The applicant was born in 1946 and lives ...,3388
...,...,...,...,...,...
818,"5. The first applicant, Franz-Olivier Giesber...",NON_VIOLATED,[],"[5. The first applicant, Franz-Olivier Giesbe...",7075
7554,"5. The facts of the case, as submitted by the...",NON_VIOLATED,[],"[5. The facts of the case, as submitted by th...",1589
7160,5. The applicant was born in 1944 and lives i...,NON_VIOLATED,[],[5. The applicant was born in 1944 and lives ...,2544
177,"5. The applicants are sisters, who were born ...",NON_VIOLATED,[],"[5. The applicants are sisters, who were born...",2709


In [15]:
# Draw 800 VIOLATED cases from the combined frame. The fixed random_state
# makes the draw repeatable across kernel restarts (it was unseeded before).
violated_df = total_df[total_df['result'] == 'VIOLATED'].sample(n=800, random_state=42)

In [16]:
# Median word count over the combined frame.
total_df['words_count'].median()

1032.5

In [17]:
# Summary statistics of the word counts over the combined frame.
total_df['words_count'].describe()

count    11000.000000
mean      1662.080455
std       1967.597964
min         69.000000
25%        505.000000
50%       1032.500000
75%       2069.250000
max      35416.000000
Name: words_count, dtype: float64

In [18]:
# Summary statistics of the word counts in the training frame only.
train_df.loc[:, 'words_count'].describe()

count     9000.000000
mean      1619.237111
std       1995.117127
min         69.000000
25%        480.750000
50%        984.000000
75%       2002.250000
max      35416.000000
Name: words_count, dtype: float64

In [19]:
# Per-column non-null counts of VIOLATED training rows (now including words_count).
_is_violated = train_df['result'].eq('VIOLATED')
train_df.loc[_is_violated].count()

text           8238
result         8238
labels         8238
facts          8238
words_count    8238
dtype: int64

In [20]:
# Per-column non-null counts of NON_VIOLATED training rows.
_is_non_violated = train_df['result'].eq('NON_VIOLATED')
train_df.loc[_is_non_violated].count()

text           762
result         762
labels         762
facts          762
words_count    762
dtype: int64

In [21]:
# Per-column non-null counts of NON_VIOLATED rows across the combined frame.
total_df.loc[total_df['result'].eq('NON_VIOLATED')].count()

text           1045
result         1045
labels         1045
facts          1045
words_count    1045
dtype: int64

In [22]:
# (rows, columns) of the combined frame.
total_df.shape

(11000, 5)

In [23]:
# (rows, columns) of the training frame.
train_df.shape

(9000, 5)

In [24]:
# (rows, columns) of the test frame.
test_df.shape

(1000, 5)

In [25]:
# (rows, columns) of the validation frame.
val_df.shape

(1000, 5)

In [26]:
# Write the 'facts' entries of every row labelled 0 to facts.csv.
# total_df is a concat that keeps the per-split index labels, so label 0
# occurs once per split. Selecting with a list ([0]) always yields a Series;
# the scalar form only returned one because of those duplicate labels and
# would hand back a plain list (no .to_csv) if the index were ever unique.
total_df.loc[[0], 'facts'].to_csv('facts.csv')

In [27]:
# Write the 'text' entries of every row labelled 0 to text.csv.
# total_df is a concat that keeps the per-split index labels, so label 0
# occurs once per split. Selecting with a list ([0]) always yields a Series;
# the scalar form only returned one because of those duplicate labels and
# would hand back a plain str (no .to_csv) if the index were ever unique.
total_df.loc[[0], 'text'].to_csv('text.csv')

In [30]:
# Downsample the VIOLATED training cases to the size of the NON_VIOLATED
# class. The size is read from the data (762 when this notebook was run)
# rather than hard-coded, and random_state makes the draw repeatable.
_n_non_violated = int((train_df['result'] == 'NON_VIOLATED').sum())
sub_train_df = train_df[train_df['result'] == 'VIOLATED'].sample(n=_n_non_violated, random_state=42)

In [35]:
# Preview the balanced training set: the sampled VIOLATED rows together with
# all NON_VIOLATED rows, ordered by index label.
_train_non_violated = train_df.loc[train_df['result'].eq('NON_VIOLATED')]
pd.concat([sub_train_df, _train_non_violated]).sort_index()

Unnamed: 0,text,result,labels,facts,words_count
1,9. The applicant is the monarch of Liechtenst...,NON_VIOLATED,[],[9. The applicant is the monarch of Liechtens...,2758
4,"9. The applicant is an Italian citizen, born ...",NON_VIOLATED,[],"[9. The applicant is an Italian citizen, born...",571
11,"9. On 13 August 1992, following proceedings o...",VIOLATED,[5],"[9. On 13 August 1992, following proceedings ...",835
12,8. On 13 January 1993 the applicant was detai...,VIOLATED,"[6, 5]",[8. On 13 January 1993 the applicant was deta...,1486
14,9. On 4 October 1993 the applicant was arrest...,VIOLATED,"[6, 5]",[9. On 4 October 1993 the applicant was arres...,3162
...,...,...,...,...,...
8983,5. The applicant party was founded in 2007 an...,NON_VIOLATED,[],[5. The applicant party was founded in 2007 a...,1384
8990,"7. The facts of the case, as submitted by the...",VIOLATED,"[8, 14]","[7. The facts of the case, as submitted by th...",1969
8992,4. The applicant was born in 1950 and lives i...,VIOLATED,[6],[4. The applicant was born in 1950 and lives ...,332
8995,5. The applicant was born in 1960 and lives i...,NON_VIOLATED,[],[5. The applicant was born in 1960 and lives ...,2165
