# Multi-label Legal Text Classification for CIA

## Data Collection

### III. Legal Texts with Labels

In [2]:
# main
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import openpyxl
import os 
import re
from langdetect import detect, DetectorFactory
from deep_translator import GoogleTranslator
from functions.source_parsing import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/janinedevera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janinedevera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
os.chdir("../..")
os.getcwd()

'/Users/janinedevera/Documents/School/MDS 2021-2023/Thesis/multilabel-legal-text-classification-CIA'

#### a. Manual annotations

In [14]:
# manual annotation
text_labels_manual = pd.read_excel("data/legislation/Brazil/manual_annotations.xlsx")

In [15]:
text_labels_manual

Unnamed: 0,Law,Paragraph,Text,Category,Theme,Description
0,Law 7565/1986 - Brazilian Aeronautical Code,Art. 40,The use of airport areas by air service provid...,A1,Exemption from bid,"The use of airport areas by air carriers, as w..."
1,Law 7565/1986 - Brazilian Aeronautical Code,Art. 40,The use of airport areas by air service provid...,A3,Exemption from bid,"The use of airport areas by air carriers, as w..."
2,Law 7565/1986 - Brazilian Aeronautical Code,Art. 42,The legislation on urban leases does not apply...,A4,Use of airport areas,Urban lease legislation does not apply to leas...
3,Law 7565/1986 - Brazilian Aeronautical Code,Art. 42,The legislation on urban leases does not apply...,B4,Use of airport areas,Urban lease legislation does not apply to leas...
4,Law 7565/1986 - Brazilian Aeronautical Code,Art. 156 par 1,Crew members are duly qualified people who exe...,A2,Crew nationality,When a national company operates a national or...
...,...,...,...,...,...,...
748,Resolution 140/2010 by National Civil Aviation...,Art. 12,"ANAC may, at any time, conduct audits, request...",,,
749,Resolution 140/2010 by National Civil Aviation...,Art. 13,The non-fulfillment of the obligations establi...,,,
750,Resolution 140/2010 by National Civil Aviation...,Art. 14,This Resolution replaces the dispositions of a...,,,
751,Resolution 140/2010 by National Civil Aviation...,Art. 15 item I,This Resolution goes into effect on July 1st 2...,,,


#### b. OECD annotations

In [16]:
path = "data/legislation/Brazil/"
oecd_files = glob(os.path.join(path, "oecd_annotations*"))
oecd_files

['data/legislation/Brazil/oecd_annotations_ca.xlsx',
 'data/legislation/Brazil/oecd_annotations_hp.xlsx']

In [17]:
dfs = []
for file in oecd_files:
    df = pd.read_excel(file)
    df = df[df['No. and title of Regulation'].notna()]
    dfs.append(df)

In [18]:
text_labels_oecd = pd.concat(dfs, axis=0, ignore_index=True)

In [8]:
text_labels_oecd

Unnamed: 0,No.,No. and title of Regulation,Article,Thematic category,Brief description of the potential obstacle,Toolkit question,Keyword,Policy Objective,Harm to competition,Recommendations,Recommendation
0,1,Law 7565/1986 - Brazilian Aeronautical Code,Art. 40,Direct contract,"The use of airport areas by air carriers, as w...",A1/A3,Exemption from bid,There is no official recital on the objective....,Exempting use of airport areas from public ten...,No recommendation.,
1,2,Law 7565/1986 - Brazilian Aeronautical Code,Art. 42,Use of airport areas,Urban lease legislation does not apply to leas...,A4/B4,Use of airport areas,There is no official recital on the objective....,Imposing a different regime for leasing airpor...,No recommendation.,
2,3,Law 7565/1986 - Brazilian Aeronautical Code,Art. 156 par 1,Crew nationality,When a national company operates a national or...,A2/A3,Crew nationality,Although there is no official recital on the o...,"On the one hand, although requiring a professi...",Brazilian authorities should consider relaxing...,
3,4,Law 7565/1986 - Brazilian Aeronautical Code,Art. 156 par 3,Crew nationality,Foreign flight attendants may be employed for ...,A3,Crew nationality,Although there is no official recital on the o...,Although less restrictive than the rule for do...,Brazilian authorities should consider relaxing...,
4,5,Law 7565/1986 - Brazilian Aeronautical Code,Art. 158,Crew nationality,"Only if there are no Brazilian crew members, f...",A2/A3,Crew nationality,Although there is no official recital on the o...,Brazilian legislation allows foreigners to be ...,Brazilian authorities should consider relaxing...,
...,...,...,...,...,...,...,...,...,...,...,...
544,173,Ordinance 137/2016 by Navy - tax for the use o...,104,Tax for the use of the lighthouses,Foreign ships that demand the use of Brazilian...,A3/A4,Foreign vessel,The objective of the provision is to establish...,The provision increases entry costs for foreig...,,No recommendation.
545,174,Ordinance 137/2016 by Navy (NORMAN 31) - tax f...,107,Tax for the use of the lighthouses,"Ships owned or leased by national companies, w...",A3/A4,Foreign vessel,There is no official recital on the objective....,The provision increases entry costs for foreig...,,No recommendation.
546,175,Resolution 72/2022 (Former Normative Resolutio...,Art.5,Storage,Services not covered by the Box Rate and stora...,C2,Box rate,The segregation and delivery service fee (SSE)...,Such an unclear legal framework may enable por...,,Brazilian authorities should address the lack ...
547,176,Resolution 72/2022 (Former Normative Resolutio...,Art.5 par 1,Storage,"ANTAQ, in case of conflict, may arbitrate the ...",B1,Box rate,The segregation and delivery service fee (SSE)...,Such an unclear legal framework may enable por...,,Brazilian authorities should address the lack ...


#### c. Cleaning and merging

In [19]:
text_oecd_clean = (
    text_labels_oecd
    .assign(Category=text_labels_oecd['Toolkit question'].str.split('/')).explode('Category') # unique row per category
    .loc[:, ["No. and title of Regulation", "Article", "Brief description of the potential obstacle", "Category"]] # subset 
    .rename(columns={"No. and title of Regulation": "Law", "Article": "Paragraph", "Brief description of the potential obstacle": "Text"}) #
)

In [20]:
text_oecd_clean = text_oecd_clean[~text_oecd_clean['Law'].isin(text_labels_manual['Law'].unique())] # remove laws in manually annotated file

In [27]:
text_labels = pd.concat([text_oecd_clean, text_labels_manual]).dropna(axis=1).reset_index(drop=True) # merge files 

In [29]:
text_labels['text_clean'] = preprocess_corpus_keep_stop_words(text_labels['Text'])
text_labels['text_clean'] = [stem_lemmatize(text) for text in text_labels['text_clean']]

In [32]:
text_labels.to_csv("data/01 legal_texts_with_labels_stopwords.csv")

#### d. Grouped labels

In [5]:
text_labels.Category.value_counts()

None                        663
A3                          390
A4                          168
A2                          151
B4                           52
A1                           35
B1                           33
Administrative burden        27
A5                           16
C2                           15
 administrative burden        7
B3                            4
A3                            3
administrative burden         1
Grandfather rights            1
 Administrative burden        1
 A4                           1
A4                            1
A4 Administrative burden      1
D1                            1
Administrative Burden         1
Name: Category, dtype: int64

In [6]:
text_labels_grouped = text_labels
text_labels_grouped['Category'] = text_labels_grouped['Category'].apply(lambda x: ''.join(filter(str.isalpha, x)))

In [7]:
categories = ['A', 'B', 'C', 'D', 'None']
text_labels_grouped = text_labels_grouped[text_labels_grouped['Category'].isin(categories)]
text_labels_grouped.Category.value_counts()

A       765
None    663
B        89
C        15
D         1
Name: Category, dtype: int64

In [8]:
text_labels_grouped.to_csv("data/01 legal_texts_with_labels_grouped.csv")