# Multi-label Legal Text Classification for CIA

## Data Collection

### III. Legal Texts with Labels

In [1]:
# main
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import openpyxl
import os 
import re
from langdetect import detect, DetectorFactory
from deep_translator import GoogleTranslator
from functions.source_parsing import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/janinedevera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janinedevera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
os.chdir("../..")
os.getcwd()

'/Users/janinedevera/Documents/School/MDS 2021-2023/Thesis/multilabel-legal-text-classification-CIA'

#### a. Manual annotations

In [7]:
# manual annotation
text_labels_manual = pd.read_excel("data/legislation/manual_annotations.xlsx")

In [8]:
text_labels_manual

Unnamed: 0,Law,Paragraph,Text,Category,Theme,Description
0,Law 7565/1986 - Brazilian Aeronautical Code,Art. 40,The use of airport areas by air service provid...,A1,Exemption from bid,"The use of airport areas by air carriers, as w..."
1,Law 7565/1986 - Brazilian Aeronautical Code,Art. 40,The use of airport areas by air service provid...,A3,Exemption from bid,"The use of airport areas by air carriers, as w..."
2,Law 7565/1986 - Brazilian Aeronautical Code,Art. 42,The legislation on urban leases does not apply...,A4,Use of airport areas,Urban lease legislation does not apply to leas...
3,Law 7565/1986 - Brazilian Aeronautical Code,Art. 42,The legislation on urban leases does not apply...,B4,Use of airport areas,Urban lease legislation does not apply to leas...
4,Law 7565/1986 - Brazilian Aeronautical Code,Art. 156 par 1,Crew members are duly qualified people who exe...,A2,Crew nationality,When a national company operates a national or...
...,...,...,...,...,...,...
748,Resolution 140/2010 by National Civil Aviation...,Art. 12,"ANAC may, at any time, conduct audits, request...",,,
749,Resolution 140/2010 by National Civil Aviation...,Art. 13,The non-fulfillment of the obligations establi...,,,
750,Resolution 140/2010 by National Civil Aviation...,Art. 14,This Resolution replaces the dispositions of a...,,,
751,Resolution 140/2010 by National Civil Aviation...,Art. 15 item I,This Resolution goes into effect on July 1st 2...,,,


#### b. OECD annotations

In [23]:
path = "data/legislation/"
oecd_files = glob(os.path.join(path, "oecd_annotations*"))
oecd_files

['data/legislation/oecd_annotations_mx.xlsx',
 'data/legislation/oecd_annotations_br_hp.xlsx',
 'data/legislation/oecd_annotations_br_ca.xlsx']

In [24]:
dfs = []
for file in oecd_files:
    df = pd.read_excel(file, index_col=0)
    df = df[df['No. and title of Regulation'].notna()]
    dfs.append(df)

In [25]:
text_labels_oecd = pd.concat(dfs, axis=0, ignore_index=True)

In [26]:
text_labels_oecd

Unnamed: 0,No. and title of Regulation,Article,Brief description of the potential obstacle,Toolkit question,Policy Objective,Harm to competition,Thematic category,Keyword,Recommendation,Recommendations
0,No law addresses this issue yet,,Mexico currently has no law regulating which b...,D1,A 2016 analysis by ProPublica showed that doct...,Despite the existance of CETIFARMA's Ethics Co...,,,,
1,Reglamento de Insumos para la Salud,117,The pharmaceutical retalier registers in a con...,C2,The objective of the provision is likey to ass...,Pharmaceutical companies are interested in mon...,,,,
2,Reglamento de la Ley General de Salud en Mater...,64,Precriptions of a medicine by a doctor must co...,C2,The law does not specify any particular object...,"Theoretically, pharmacies could collect this d...",,,,
3,Acuerdo por el que se determinan los lineamien...,Second,"For prescriptions of antibiotics, pharmacies m...",C2,The law does not specify any particular object...,"Theoretically, pharmacies could collect this d...",,,,
4,Reglamento de Insumos para la Salud,31,"When prescribing a medicine, doctors can eithe...","D1, D2",To protect the Mexican population against sani...,Consumers are forced to buy the branded medici...,,,,
...,...,...,...,...,...,...,...,...,...,...
718,Resolution 18/2006 by National Petroleum Agenc...,Art. 11,An independent reseller must only purchase avi...,A3,[In the recital] The objective of the resoluti...,Although the provision may limit suppliers’ ab...,Sale of fuel,Fuel,,No recommendation.
719,Resolution 18/2006 by National Petroleum Agenc...,Art. 12,An independent reseller can only sale aviation...,A3,[In the recital] The objective of the resoluti...,Although the provision may limit suppliers’ ab...,Sale of fuel,Fuel,,No recommendation.
720,Resolution 18/2006 by National Petroleum Agenc...,Art. 13,Tied (branding) or independent resellers must ...,A4,[In the recital] The objective of the resoluti...,Although the provision may limit suppliers’ ab...,Sale of fuel,Fuel,,No recommendation.
721,Resolution 279/2013 by National Civil Aviation...,"Appendix, 2.1",Only a legal entity holding a certificate (OE-...,A2,The need for a certificate issued by ANAC for ...,Airports must have employees with specific qua...,Safety,Safety,,No recommendation.


#### c. Cleaning and merging

In [27]:
text_oecd_clean = (
    text_labels_oecd
    .assign(Category=text_labels_oecd['Toolkit question'].str.split('/|,')).explode('Category') # unique row per category
    .loc[:, ["No. and title of Regulation", "Article", "Brief description of the potential obstacle", "Category"]] # subset 
    .rename(columns={"No. and title of Regulation": "Law", "Article": "Paragraph", "Brief description of the potential obstacle": "Text"}) #
)

In [28]:
text_oecd_clean['Category'].unique()

array(['D1', 'C2', ' D2', 'A3', 'A4', ' A4', 'A2', ' A3', 'B1', ' C1', '',
       'B4', ' B4', 'A1', ' B3', ' A5', 'A5', 'B2', ' B2', 'B3', 'C1',
       'Grandfather rights', 'A3 ', ' Administrative burden',
       'Administrative burden', ' administrative burden', 'A4 ',
       'A4 Administrative burden', 'administrative burden',
       'Administrative Burden'], dtype=object)

In [29]:
text_oecd_clean = text_oecd_clean[~text_oecd_clean['Law'].isin(text_labels_manual['Law'].unique())] # remove laws in manually annotated file

In [30]:
text_labels = pd.concat([text_oecd_clean, text_labels_manual]).dropna(axis=1).reset_index(drop=True) # merge files 

In [31]:
text_labels['text_clean'] = preprocess_corpus_keep_stop_words(text_labels['Text'])
text_labels['text_clean'] = [stem_lemmatize(text) for text in text_labels['text_clean']]

In [32]:
text_labels.to_csv("data/01 legal_texts_with_labels_stopwords.csv")

#### d. Grouped labels

In [4]:
#text_labels = pd.read_csv("data/01 legal_texts_with_labels_stopwords.csv")

In [33]:
text_labels.Category.value_counts()

None                        663
A3                          406
A4                          193
A2                          191
A5                           72
B4                           58
B1                           40
A1                           37
Administrative burden        27
C2                           20
 A5                          18
 B4                          14
 administrative burden        7
B2                            7
B3                            7
 C1                           5
D1                            4
 A4                           4
C1                            4
A3                            3
 D2                           2
Administrative Burden         1
administrative burden         1
A4 Administrative burden      1
A4                            1
                              1
 Administrative burden        1
 B2                           1
 A3                           1
 B3                           1
Grandfather rights            1
Name: Ca

In [34]:
text_labels_grouped = text_labels
text_labels_grouped['Category'] = text_labels_grouped['Category'].apply(lambda x: ''.join(filter(str.isalpha, x))) #keep letter categories

In [35]:
text_labels_grouped.Category.value_counts()

A                        926
None                     663
B                        128
C                         29
Administrativeburden      28
administrativeburden       8
D                          6
                           1
Grandfatherrights          1
AAdministrativeburden      1
AdministrativeBurden       1
Name: Category, dtype: int64

In [36]:
categories = ['A', 'B', 'C', 'None']

#text_labels_grouped = text_labels_grouped[text_labels_grouped['Category'].isin(categories)].drop_duplicates()
text_labels_grouped['Category_New'] = np.where(text_labels_grouped['Category'].isin(categories),
                                               text_labels_grouped['Category'], 'Others') #add "others" category

text_labels_grouped.Category_New.value_counts()

A         926
None      663
B         128
Others     46
C          29
Name: Category_New, dtype: int64

In [37]:
text_labels_grouped.to_csv("data/01 legal_texts_with_labels_grouped.csv")