In [30]:
%pip install -qU FlagEmbedding~=1.2.11 pymilvus pymilvus[model]

Note: you may need to restart the kernel to use updated packages.


### Definition Dataset exploration

In [2]:
import polars as pl
from pathlib import Path
import pickle

#pl.Config.set_tbl_rows(200)
pl.Config.set_fmt_str_lengths(300)

polars.config.Config

In [3]:
dataset_path = Path('../data/definitions_corpus/definitions.parquet')

df = pl.read_parquet(dataset_path)
df.select(pl.len())

len
u32
16522


In [11]:
df

id,definition_text,def_n,label,dataset,document_id,references,frbr_work,frbr_expression
u32,str,str,str,str,str,list[str],str,str
0,"""obstacles to trade: means any trade practice adopted or maintained by a third country in respect of which international trade rules establish a right of action; such a right of action exists when international trade rules either prohibit a practice outright, or give another party affected by the pra…","""#def_1""","""#obstaclesToTrade""","""EurLex""","""32015R1843.xml""",[],"""/akn/eu/act/regulation/2015-10-06/1843/!main""","""/akn/eu/act/regulation/2015-10-06/1843/eng@/!main"""
1,"""injury: means any material injury which an obstacle to trade causes or threatens to cause, in respect of a product or service, to a Union industry, on the market of the Union;""","""#def_2""","""#injury""","""EurLex""","""32015R1843.xml""",[],"""/akn/eu/act/regulation/2015-10-06/1843/!main""","""/akn/eu/act/regulation/2015-10-06/1843/eng@/!main"""
2,"""adverse trade effects: means the adverse effects which an obstacle to trade causes or threatens to cause, in respect of a product or service, to Union enterprises, on the market of any third country, and which have a material impact on the economy of the Union or of a region of the Union, or on a se…","""#def_3""","""#adverseTradeEffects""","""EurLex""","""32015R1843.xml""",[],"""/akn/eu/act/regulation/2015-10-06/1843/!main""","""/akn/eu/act/regulation/2015-10-06/1843/eng@/!main"""
3,"""Union industry: means either: all Union producers or providers: of products or services identical or similar to the product or service which is the subject of an obstacle to trade; of products or services competing directly with that product or service; or who are consumers or processors of the prod…","""#def_4""","""#UnionIndustry""","""EurLex""","""32015R1843.xml""",[],"""/akn/eu/act/regulation/2015-10-06/1843/!main""","""/akn/eu/act/regulation/2015-10-06/1843/eng@/!main"""
4,"""Union enterprise: means a company or firm formed in accordance with the law of a Member State and having its registered office, central administration or principal place of business within the Union, that is directly concerned by the production of goods or the provision of services which are the sub…","""#def_5""","""#UnionEnterprise""","""EurLex""","""32015R1843.xml""",[],"""/akn/eu/act/regulation/2015-10-06/1843/!main""","""/akn/eu/act/regulation/2015-10-06/1843/eng@/!main"""
…,…,…,…,…,…,…,…,…
16517,"""contratto di sponsorizzazione: il contratto in forza del quale un ente sportivo, a fronte del pagamento di un corrispettivo, concede a un soggetto il diritto di associare le proprie attività commerciali ai segni distintivi e alle attività sportive e alle attività di impresa del medesimo ente, al sol…","""#def_8""","""#contrattoDiSponsorizzazione""","""PDL""","""18PDL0040890_PD.xml""",[],"""/akn/it/bill/propostaDiLegge/2018-12-11/1438/!main""","""/akn/it/bill/propostaDiLegge/2018-12-11/1438/ita@/!main"""
16518,"""rappresentanti di interessi: i soggetti che rappresentano presso i decisori pubblici, come definiti allalettera b), direttamente o indirettamente, interessi leciti di rilevanza non generale, anche di natura non economica, al fine di promuovere l'avvio di processi decisionali pubblici o di incidere s…","""#def_1""","""#rappresentantiDiInteressi""","""PDL""","""19PDL0004650_PD.xml""","[""#item_b""]","""/akn/it/bill/propostaDiLegge/2022-10-13/308/!main""","""/akn/it/bill/propostaDiLegge/2022-10-13/308/ita@/!main"""
16519,"""decisori pubblici: i membri del Parlamento e del Governo; i presidenti, gli assessori e i consiglieri regionali, i presidenti e i consiglieri delle province e delle città metropolitane, i sindaci, gli assessori e i consiglieri comunali dei comuni con popolazione pari o superiore a 100.000 abitanti, …","""#def_2""","""#decisoriPubblici""","""PDL""","""19PDL0004650_PD.xml""",[],"""/akn/it/bill/propostaDiLegge/2022-10-13/308/!main""","""/akn/it/bill/propostaDiLegge/2022-10-13/308/ita@/!main"""
16520,"""attività di rappresentanza di interessi: ogni attività, non sollecitata da un decisore pubblico, finalizzata alla rappresentanza di interessi leciti di rilevanza non generale nell'ambito di processi decisionali pubblici, svolta professionalmente dai rappresentanti di interessi attraverso la presenta…","""#def_3""","""#attivitàDiRappresentanzaDiInteressi""","""PDL""","""19PDL0004650_PD.xml""",[],"""/akn/it/bill/propostaDiLegge/2022-10-13/308/!main""","""/akn/it/bill/propostaDiLegge/2022-10-13/308/ita@/!main"""


In [15]:

# 1. Count how many definitions exist per term (def_n)
definitions_per_term = df.group_by('label').agg([
    pl.col('def_n').n_unique().alias('unique_definitions'),
    pl.col('definition_text').count().alias('total_definitions')
])

# 2. Count how many unique terms (labels) exist
unique_terms = df['label'].n_unique()

# 3. Frequency of terms with multiple definitions
multiple_definitions = df.group_by('label').agg([
    pl.col('def_n').n_unique().alias('unique_definitions')
]).filter(pl.col('unique_definitions') > 1)

# 4. Count the number of documents per term
definitions_per_document = df.group_by('label').agg([
    pl.col('document_id').n_unique().alias('unique_documents')
])

# 5. References analysis
definitions_with_references = df.filter(pl.col('references').is_not_null())

# 7. Aggregate data and calculate the length distribution

# Display the results
print("Definitions per term:\n", definitions_per_term)
print("Unique terms count:", unique_terms)
print("Terms with multiple definitions:\n", multiple_definitions)
print("Definitions per document:\n", definitions_per_document)
print("Definitions with references:\n", definitions_with_references)


Definitions per term:
 shape: (10_844, 3)
┌───────────────────────────────────────────────┬────────────────────┬───────────────────┐
│ label                                         ┆ unique_definitions ┆ total_definitions │
│ ---                                           ┆ ---                ┆ ---               │
│ str                                           ┆ u32                ┆ u32               │
╞═══════════════════════════════════════════════╪════════════════════╪═══════════════════╡
│ #runway-holdingPosition                       ┆ 1                  ┆ 1                 │
│ #thirdCountryManufacturer                     ┆ 1                  ┆ 1                 │
│ #leCollezioniFaunisticheDiQualsiasiTipo       ┆ 1                  ┆ 1                 │
│ #groupLiquidityRiskAssessmentReport           ┆ 1                  ┆ 1                 │
│ #alimentiDiOrigineAnimale                     ┆ 1                  ┆ 1                 │
│ …                                             

In [16]:
# 1. Basic counts
total_definitions = df.height
unique_terms = df.select(pl.col("label")).unique().height
definitions_per_term = df.group_by("label").count()

# 2. Definition text length analysis
df = df.with_columns(
    pl.col("definition_text").str.len_chars().alias("def_length")
)
length_stats = df.select([
    pl.col("def_length").mean().alias("avg_length"),
    pl.col("def_length").median().alias("median_length"),
    pl.col("def_length").min().alias("min_length"),
    pl.col("def_length").max().alias("max_length")
])

# 3. Inconsistencies across datasets
inconsistent_terms = (
    df.group_by("label")
    .agg(pl.col("definition_text").n_unique().alias("unique_defs"))
    .filter(pl.col("unique_defs") > 1)
)

# 4. Definitions per document
definitions_per_doc = df.group_by("document_id").count()

# 5. Shared terms across datasets
shared_terms = (
    df.group_by("label")
    .agg(pl.col("dataset").n_unique().alias("datasets_count"))
    .filter(pl.col("datasets_count") > 1)
)

# Display results
print(f"Total definitions: {total_definitions}")
print(f"Unique terms: {unique_terms}")
print(f"Average definition length stats: {length_stats}")
print(f"Inconsistent terms: {inconsistent_terms}")
print(f"Definitions per document: {definitions_per_doc}")
print(f"Shared terms across datasets: {shared_terms}")


Total definitions: 16522
Unique terms: 10844
Average definition length stats: shape: (1, 4)
┌────────────┬───────────────┬────────────┬────────────┐
│ avg_length ┆ median_length ┆ min_length ┆ max_length │
│ ---        ┆ ---           ┆ ---        ┆ ---        │
│ f64        ┆ f64           ┆ u32        ┆ u32        │
╞════════════╪═══════════════╪════════════╪════════════╡
│ 231.028871 ┆ 180.0         ┆ 13         ┆ 4512       │
└────────────┴───────────────┴────────────┴────────────┘
Inconsistent terms: shape: (2_080, 2)
┌──────────────────────────────────────────┬─────────────┐
│ label                                    ┆ unique_defs │
│ ---                                      ┆ ---         │
│ str                                      ┆ u32         │
╞══════════════════════════════════════════╪═════════════╡
│ #gasDay                                  ┆ 2           │
│ #subsidiaryUndertaking                   ┆ 4           │
│ #soggettiCheOperanoNelSettoreDellaSalute ┆ 2           │

  definitions_per_term = df.group_by("label").count()
  definitions_per_doc = df.group_by("document_id").count()


In [17]:
term_definition_counts = (
    df.group_by("label")
    .agg(pl.col("definition_text").n_unique().alias("unique_def_count"))
)

# Filter terms with more than one definition
terms_with_multiple_defs = term_definition_counts.filter(pl.col("unique_def_count") > 1)

# Calculate frequency
total_terms = term_definition_counts.height
multiple_def_terms = terms_with_multiple_defs.height
frequency = (multiple_def_terms / total_terms) * 100

# Display results
print(f"Total Terms: {total_terms}")
print(f"Terms with Multiple Definitions: {multiple_def_terms}")
print(f"Frequency of Terms with Multiple Definitions: {frequency:.2f}%")


Total Terms: 10844
Terms with Multiple Definitions: 2080
Frequency of Terms with Multiple Definitions: 19.18%


In [9]:
df.group_by('dataset').agg(pl.len())

dataset,len
str,u32
"""EurLex""",11354
"""PDL""",413
"""Normattiva""",4585


In [37]:
df.group_by('label').agg(pl.len()).sort(pl.col('len'), descending=True).head(12)

label,len
str,u32
"""#competentAuthority""",61
"""#placingOnTheMarket""",47
"""#manufacturer""",46
"""#makingAvailableOnTheMarket""",36
"""#quota""",36
…,…
"""#Ministero""",32
"""#""",32
"""#authorisedRepresentative""",28
"""#withdrawal""",26


In [20]:
df.filter(pl.col('label') == '#competentAuthority').select('definition_text').unique()

definition_text
str
"""competent authority: means the central authority of a Member State competent for the organisation of official controls or any other authority to which that competence has been conferred, including the competent authority referred to in point (h) of Article 2 of Directive 2009/156/EC"""
"""competent authority: means a competent authority as defined in point (5) of Article 3(1) of Directive (EU) 2019/2034"""
"""competent authority: means the central authority or authorities of a Member State, or, where applicable, of a third country, responsible for the organisation of official controls and of other official activities, or any other authority to which that responsibility has been conferred, in accordance w…"
"""competent authority: means the authority designated in accordance with Article 3(1);"""
"""competent authority: means a competent authority as defined in Article 4(1)(40) of Regulation (EU) No 575/2013"""
…
"""competent authority: means an authority designated as competent by a Member State in accordance with Article 21."""
"""competent authority: means a governmental authority or authorities designated by a Member State or a third country as responsible for ship recycling facilities, within a specified geographical area or an area of expertise, relating to all operations within the jurisdiction of that state;"""
"""competent authority: means a national governmental authority or a national regulatory authority designated by a Member State to ensure the implementation of the measures provided for in this Regulation;"""
"""competent authority: means the competent authority referred to in the legislation referred to in point (8) of this Article, the competent authority referred to in Article 10(5) or the authority designated by each Member State in accordance with Article 22;"""


In [4]:
df.group_by(pl.col('definition_text').str.split(':').list[0]).agg('id')

definition_text,id
str,list[u32]
"""Member State of enforcement""","[3883, 4191, … 9125]"
"""dangerous substance""",[5163]
"""stato del dispositivo""",[12523]
"""small volume manufacturers""",[8221]
"""presidente""","[11717, 11926, … 13660]"
…,…
"""combination boiler""",[2890]
"""packed goods""",[4086]
"""elenco degli operatori aerei""",[14717]
"""number of passengers""",[5695]


In [12]:
df.filter(pl.col('id').is_in([7603, 10379, 11200]))

id,definition_text,def_n,dataset,document_id,references,frbr_work,frbr_expression,defs_with_refs
u32,str,str,str,str,list[str],str,str,str
7603,"""tax authorities: means the national authorities in the Member State responsible for applying Council Directive 2006/112/EC Council Directive 2006/112/EC of 28 November 2006 on the common system of value added tax ( OJ L 347, 11.12.2006, p. 1 ). ;""","""#def_p""","""EurLex""","""32019R2152.xml""","[""/akn/eu/act/directive/2006/112/!main"", ""/akn/eu/act/directive/2006/112/!main"", ""/akn/eu/documentCollection/L/gu/2006-12-11/347/!main#eop_1""]","""/akn/eu/act/regulation/2019-11-27/2152/!main""","""/akn/eu/act/regulation/2019-11-27/2152/eng@/!main""","""tax authorities: means the national authorities in the Member State responsible for applying Council Directive 2006/112/EC Council Directive 2006/112/EC of 28 November 2006 on the common system of value added tax ( OJ L 347, 11.12.2006, p. 1 ). ;"""
10379,"""tax authorities: means public authorities and other bodies which are responsible for taxation or tax-related activities;""","""#def_2""","""EurLex""","""32021R0847.xml""",[],"""/akn/eu/act/regulation/2021-05-20/847/!main""","""/akn/eu/act/regulation/2021-05-20/847/eng@/!main""","""tax authorities: means public authorities and other bodies which are responsible for taxation or tax-related activities;"""
11200,"""tax authorities: means the public authorities and other bodies in the participating countries which are responsible for administering taxation or tax-related activities;""","""#def_1""","""EurLex""","""32013R1286.xml""",[],"""/akn/eu/act/regulation/2013-12-11/1286/!main""","""/akn/eu/act/regulation/2013-12-11/1286/eng@/!main""","""tax authorities: means the public authorities and other bodies in the participating countries which are responsible for administering taxation or tax-related activities;"""


In [5]:
full_def_df = df.with_columns(
    pl.when(
        pl.col("definendum").is_null() | pl.col("definiens").is_null()
    ).then(
        pl.col("full_definition")
    ).otherwise(
        pl.concat_str(
            [
                pl.col("definendum"),
                pl.col("definiens"),
            ],
            separator=": ",
        )
    ).alias("joined_definition")
)

In [6]:
full_def_df.select(pl.len())

len
u32
16525


In [20]:
import matplotlib.pyplot as plt


(
    df
    .sort(pl.col('definition_text').str.len_chars(), descending=True)
    .with_columns(
        pl.col('definition_text').str.len_chars().alias('length')
    )
)


id,definition_text,def_n,dataset,document_id,references,frbr_work,frbr_expression,defs_with_refs,length
u32,str,str,str,str,list[str],str,str,str,u32
14535,"""persona: 1) una persona fisica; 2) una persona giuridica o dove la normativa vigente lo preveda, un'associazione di persone alla quale e' riconosciuta la capacita' di compiere atti giuridici, ma che e' priva di personalita' giuridica; 3) qualsiasi altro istituto giuridico di qualunque natura e forma…","""#def_8""","""Normattiva""","""20140317_14G00038_VIGENZA_20230326.xml""","[""/akn/it/act/decretoDelPresidenteDellaRepubblica/stato/1973-09-29/600/!main#art_31ter-com1"", ""/akn/it/act/legge/stato/2014-12-23/190/!main#art_1-com37"", … ""/akn/eu/act/directive/2011/16/!main""]","""/akn/it/act/decretoLegislativo/stato/2014-03-04/29/!main""","""/akn/it/act/decretoLegislativo/stato/2014-03-04/29/ita@2023-03-26/!main""","""persona: 1) una persona fisica; 2) una persona giuridica o dove la normativa vigente lo preveda, un'associazione di persone alla quale e' riconosciuta la capacita' di compiere atti giuridici, ma che e' priva di personalita' giuridica; 3) qualsiasi altro istituto giuridico di qualunque natura e forma…",4169
12033,"""asportazione di materiale a bacino vuoto: l'operazione di sfangamento o sghiaiamento che utilizza macchine per il movimento e per la rimozione del materiale sedimentato; i)«asportazione di materiale a bacino pieno»: l'operazione di sfangamento o sghiaiamento che utilizza sistemi di pompaggio o di dr…","""#def_8""","""Normattiva""","""20230110_23G00002_ORIGINALE.xml""","[""/akn/it/act/decretoLegislativo/stato/1998-03-31/112/!main#art_89-com1-letb"", ""/akn/it/act/decretoLegislativo/stato/1998/112/!main#art_91-com1"", … ""/akn/it/act/decretoDelPresidenteDellaRepubblica/stato/1991-01-24/85/!main""]","""/akn/it/act/decreto/ministeroInfrastruttureMobilitaSostenibili/2022-10-12/205/!main""","""/akn/it/act/decreto/ministeroInfrastruttureMobilitaSostenibili/2022-10-12/205/ita@2023-01-25/!main""","""asportazione di materiale a bacino vuoto: l'operazione di sfangamento o sghiaiamento che utilizza macchine per il movimento e per la rimozione del materiale sedimentato; i)«asportazione di materiale a bacino pieno»: l'operazione di sfangamento o sghiaiamento che utilizza sistemi di pompaggio o di dr…",3702
12592,"""pianta da frutto: una pianta che e' destinata, dopo la commercializzazione, ad essere piantata o trapiantata""; ee) «pianta in fruttificazione» una pianta moltiplicata da una pianta madre e coltivata per la produzione di frutta, al fine di consentire la verifica dell'identita' varietale di tale piant…","""#def_25""","""Normattiva""","""20210225_21G00023_VIGENZA_20221104.xml""","[""/akn/it/act/legge/stato/2019-10-04/117/!main#art_11""]","""/akn/it/act/decretoLegislativo/stato/2021-02-02/18/!main""","""/akn/it/act/decretoLegislativo/stato/2021-02-02/18/ita@2022-11-04/!main""","""pianta da frutto: una pianta che e' destinata, dopo la commercializzazione, ad essere piantata o trapiantata""; ee) «pianta in fruttificazione» una pianta moltiplicata da una pianta madre e coltivata per la produzione di frutta, al fine di consentire la verifica dell'identita' varietale di tale piant…",2696
13922,"""del presente comma: se utilizzati conformemente a tali allegati per il trasporto di gas della classe 2, esclusi i gas o gli oggetti con codici di classificazione contenenti le cifre 6 e 7, nonche' per il trasporto delle sostanze pericolose di altre classi indicate nell'allegato I del presente decret…","""#def_2""","""Normattiva""","""20120615_012G0099_ORIGINALE.xml""","[""/akn/eu/act/directive/2008/68/!main"", ""/akn/eu/act/directive/2008/68/!main"", … ""/akn/eu/act/directive/2008/68/!main""]","""/akn/it/act/decretoLegislativo/stato/2012-06-12/78/!main""","""/akn/it/act/decretoLegislativo/stato/2012-06-12/78/ita@2012-06-16/!main""","""del presente comma: se utilizzati conformemente a tali allegati per il trasporto di gas della classe 2, esclusi i gas o gli oggetti con codici di classificazione contenenti le cifre 6 e 7, nonche' per il trasporto delle sostanze pericolose di altre classi indicate nell'allegato I del presente decret…",2220
14577,"""violazioni: comportamenti, atti od omissioni che ledono l'interesse pubblico o l'integrita' dell'amministrazione pubblica o dell'ente privato e che consistono in: 1) illeciti amministrativi, contabili, civili o penali che non rientrano nei numeri 3), 4), 5) e 6); 2) condotte illecite rilevanti ai se…","""#def_1""","""Normattiva""","""20230315_23G00032_ORIGINALE.xml""","[""/akn/it/act/decretoLegislativo/stato/2001-06-08/231/!main"", ""/akn/eu/act/directive/2019/1937/!main""]","""/akn/it/act/decretoLegislativo/stato/2023-03-10/24/!main""","""/akn/it/act/decretoLegislativo/stato/2023-03-10/24/ita@2023-03-30/!main""","""violazioni: comportamenti, atti od omissioni che ledono l'interesse pubblico o l'integrita' dell'amministrazione pubblica o dell'ente privato e che consistono in: 1) illeciti amministrativi, contabili, civili o penali che non rientrano nei numeri 3), 4), 5) e 6); 2) condotte illecite rilevanti ai se…",2206
…,…,…,…,…,…,…,…,…,…
7967,"""exporter: means""","""#def_19""","""EurLex""","""32015R2446.xml""",[],"""/akn/eu/act/regulation/2015-07-28/2446/!main""","""/akn/eu/act/regulation/2015-07-28/2446/eng@/!main""","""exporter: means""",15
8007,"""exporter: means""","""#def_19""","""EurLex""","""32015R2446.xml""",[],"""/akn/eu/act/regulation/2015-07-28/2446/!main""","""/akn/eu/act/regulation/2015-07-28/2446/eng@/!main""","""exporter: means""",15
1129,"""EU AIF: means:""","""#def_k""","""EurLex""","""32011L0061.xml""",[],"""/akn/eu/act/directive/2011-06-08/61/!main""","""/akn/eu/act/directive/2011-06-08/61/eng@/!main""","""EU AIF: means:""",14
8402,"""crisis: means:""","""#def_21""","""EurLex""","""32018R1046.xml""",[],"""/akn/eu/act/regulation/2018-07-18/1046/!main""","""/akn/eu/act/regulation/2018-07-18/1046/eng@/!main""","""crisis: means:""",14


In [34]:
# drop the longest definitions

full_def_df = full_def_df.filter(pl.col('full_definition').str.len_chars() < 5000)

defs = full_def_df['joined_definition'].to_list()

In [35]:
with open('definitions_list.pkl', 'wb') as f:
    pickle.dump(defs, f)

In [13]:
full_def_df.select(
    pl.col('joined_definition'),
    pl.col('provenance'),
    pl.col('document'),
    pl.col('references'),
).with_columns(
    pl.col('references').map_elements(eval, return_dtype=pl.List(pl.String)),
).with_columns(
    pl.col('references').list.len().alias('ref_len'),
).sort(pl.col('ref_len'), descending=True).head()

joined_definition,provenance,document,references,ref_len
str,str,str,list[str],u32
"""fishing licence: means a licence as defined in point (9) of Article 4 of Council Regulation (EC) No 1224/2009 Council Regulation (EC) No 1224/2009 of 20 November 2009 establishing a Community control system for ensuring compliance with the rules of the common fisheries policy, amending Regulations (…","""EurLex""","""32013R1380.xml""","[""/akn/eu/act/regulation/ep/2009/1224/~art_4"", ""/akn/eu/act/regulation/ep/2009/1224/"", … ""/akn/eu/documentCollection/L/gu/2009-12-22/343/!main#eop_1""]",19
"""supervised entity: means any of the following: a credit institution as defined in point (1) of Article 4(1) of Regulation (EU) No 575/2013 of the European Parliament and of the Council an investment firm as defined in point (1) of Article 4(1) of Directive 2014/65/EU; an insurance undertaking as def…","""EurLex""","""32016R1011.xml""","[""/akn/eu/act/regulation/ep/2013/575/~art_4(1)"", ""/akn/eu/act/regulation/ep/2013/575/"", … ""/akn/eu/act/regulation/ep/2012/648/""]",19
"""operator: means a natural or legal person as defined in Article 4(19) of Council Regulation (EC) No 1224/2009 Council Regulation (EC) No 1224/2009 of 20 November 2009 establishing a Union control system for ensuring compliance with the rules of the common fisheries policy, amending Regulations (EC) …","""EurLex""","""32017R0218.xml""","[""/akn/eu/act/regulation/ep/2009/1224/~art_4(19)"", ""/akn/eu/act/regulation/ep/2009/1224/"", … ""/akn/eu/documentCollection/L/gu/2009-12-22/343/!main#eop_1""]",19
"""control and inspection: means any measures taken by Member States, in particular pursuant to Articles 5, 11, 71, 91 and 117 and Title VII of Council Regulation (EC) No 1224/2009 Council Regulation (EC) No 1224/2009 of 20 November 2009 establishing a Union control system for ensuring compliance with …","""EurLex""","""32019R0473.xml""","[""/akn/eu/act/regulation/ep/2009/1224/"", ""/akn/eu/act/regulation/ep/2009/1224/"", … ""/akn/eu/documentCollection/L/gu/2009-12-22/343/!main#eop_1""]",18
"""Union funds: means the European Structural and Investment Funds referred to in Article 1 of Regulation (EU) No 223/2014 of the European Parliament and of the Council Regulation (EU) No 223/2014 of the European Parliament and of the Council of 11 March 2014 , on the Fund for European Aid to the Most …","""EurLex""","""32017R0825.xml""","[""/akn/eu/act/regulation/ep/2014/223/~art_1"", ""/akn/eu/act/regulation/ep/2014/223/"", … ""/akn/eu/documentCollection/L/gu/2014-05-20/150/!main#eop_143""]",17


---

In [10]:
with open('definitions_list.pkl', 'wb') as f:
    pickle.dump(defs, f)

### Store embeddings in MilvusDB

In [2]:
from pymilvus import MilvusClient, connections, utility, FieldSchema, CollectionSchema, DataType, Collection
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
import pickle

In [3]:
# setup embedding model

ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
dense_dim = ef.dim["dense"]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
#defs_embeddings = ef(defs)
with open('../data/def_embeddings_refs.pkl', 'rb') as f:
    defs_embeddings = pickle.load(f)

df = pl.read_parquet('../data/definitions/definitions_with_refs.parquet')

def_list = df["defs_with_refs"].to_list()


In [None]:
final_df = full_def_df.select(
    pl.col('joined_definition').alias('definition_text'),
    pl.col('provenance').alias('dataset'),
    pl.col('document').alias('document_id'),
    pl.col('references'),
).with_columns(
    pl.col('references').map_elements(eval, return_dtype=pl.List(pl.String)),
).with_row_index('id')

final_df.head()

In [6]:
#MILVUS_URL = "https://localhost:19530"
MILVUS_URL = "/home/leo/Desktop/dhdk/Master thesis/.project/LegalDefAgent/vec_db/definitions_vectors_refs.db"

client = MilvusClient(
    uri=MILVUS_URL
)

connections.connect(uri=MILVUS_URL)

# Define collection schema in Milvus
fields = [
    # Use auto generated id as primary key
    FieldSchema(name="id", dtype=DataType.INT64,
                is_primary=True, auto_id=True, max_length=100),
    # Store the original text to retrieve based on semantically distance
    FieldSchema(name="definition_text", dtype=DataType.VARCHAR, max_length=5000),
    FieldSchema(name="dataset", dtype=DataType.VARCHAR, max_length=10),
    FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=40),
    #FieldSchema(name="references", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_capacity=20),
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR,
                dim=dense_dim),

]

schema = CollectionSchema(fields, "Definitions embeddings")

COLLECTION_NAME = "Definitions"
if utility.has_collection(COLLECTION_NAME):
    Collection(COLLECTION_NAME).drop()
collection = Collection(COLLECTION_NAME, schema, consistency_level="Strong")


#if client.has_collection(collection_name="Definitions"):
    #client.drop_collection(collection_name="Definitions")
#else:
    #client.create_collection(
        #collection_name="Definitions",
        #dimension=1024,
        #consistency_level="Strong",
        #schema=schema
    #)

In [7]:
client.get_collection_stats(COLLECTION_NAME)

{'row_count': 0}

In [None]:
# To make vector search efficient, we need to create indices for the vector fields
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type":"IP"}
collection.create_index("sparse_vector", sparse_index)
dense_index = {"index_type": "FLAT", "metric_type": "COSINE"}
collection.create_index("dense_vector", dense_index)
collection.load()

2025-01-02 18:09:01,378 [ERROR][handler]: RPC error: [create_index], <MilvusException: (code=65535, message=metric type BM25 not found or not supported, supported: IP: )>, <Time:{'RPC start': '2025-01-02 18:09:01.374512', 'RPC error': '2025-01-02 18:09:01.378459'}> (decorators.py:140)


MilvusException: <MilvusException: (code=65535, message=metric type BM25 not found or not supported, supported: IP: )>

In [18]:
collection.indexes[0]

<pymilvus.orm.index.Index at 0x7fc0d86e95b0>

In [19]:
# For efficiency, we insert 50 records in each small batch
for i in range(0, len(def_list), 50):
    batched_entities = [
        #defs_list[i : i + 50],
        df[i: i + 50, 'definition_text'],
        df[i: i + 50, 'dataset'],
        df[i: i + 50, 'document_id'],
        #final_df[i: i + 50, 'references'],
        defs_embeddings["sparse"][i : i + 50],
        defs_embeddings["dense"][i : i + 50],
    ]
    collection.insert(batched_entities)
print("Number of entities inserted:", collection.num_entities)

Number of entities inserted: 16352


In [7]:
MILVUS_URL = "/home/leo/Desktop/dhdk/Master thesis/.project/LegalDefAgent/vec_db/definitions_vectors.db"
COLLECTION_NAME = "Definitions"

client = MilvusClient(
    uri=MILVUS_URL
)

connections.connect(uri=MILVUS_URL)

client.get_collection_stats(COLLECTION_NAME)

collection = Collection(COLLECTION_NAME)

In [22]:
# test query
results = collection.query(expr="", output_fields=["definition_text", 'document_id'], limit=4)

results

data: ["{'id': 455037346697969664, 'definition_text': 'obstacles to trade: means any trade practice adopted or maintained by a third country in respect of which international trade rules establish a right of action; such a right of action exists when international trade rules either prohibit a practice outright, or give another party affected by the practice a right to seek elimination of the effect of the practice in question;', 'document_id': '32015R1843.xml'}", "{'id': 455037346697969665, 'definition_text': 'injury: means any material injury which an obstacle to trade causes or threatens to cause, in respect of a product or service, to a Union industry, on the market of the Union;', 'document_id': '32015R1843.xml'}", "{'id': 455037346697969666, 'definition_text': 'adverse trade effects: means the adverse effects which an obstacle to trade causes or threatens to cause, in respect of a product or service, to Union enterprises, on the market of any third country, and which have a mater

In [12]:
from LegalDefAgent.src.retriever.vector_store import setup_vectorstore

vector_store = setup_vectorstore(milvusdb_uri=MILVUS_URL)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [17]:
vector_store.similarity_search('dog')

[Document(metadata={'dataset': 'EurLex', 'def_n': '#def_10', 'document_id': '32020R0689.xml', 'id': 4284, 'references': []}, page_content='dog: means a kept animal of the Canis lupus species;'),
 Document(metadata={'dataset': 'EurLex', 'def_n': '#def_1', 'document_id': '32019R2035.xml', 'id': 2307, 'references': []}, page_content='dog: means a kept animal of the Canis lupus species;'),
 Document(metadata={'dataset': 'EurLex', 'def_n': '#def_9', 'document_id': '32013R0576.xml', 'id': 3843, 'references': []}, page_content='documentary check: means verification of the identification document accompanying the pet animal;'),
 Document(metadata={'dataset': 'EurLex', 'def_n': '#def_1', 'document_id': '32016R0429.xml', 'id': 1455, 'references': []}, page_content='animals: means vertebrate and invertebrate animals;')]

### Test Retrieval

In [1]:
from pymilvus import connections, Collection, MilvusClient

#MILVUS_URL = "../vec_db/definitions_vectors.db"
MILVUS_URL = "/home/leo/Desktop/dhdk/Master thesis/.project/LegalDefAgent/vec_db/definitions_vectors.db"

connections.connect(
  uri=MILVUS_URL
)

collection = Collection("Definitions")

print([index.params for index in collection.indexes])

[{'index_type': 'FLAT', 'metric_type': 'COSINE', 'dim': '1024'}, {'index_type': 'SPARSE_INVERTED_INDEX', 'metric_type': 'IP'}]


In [2]:
from milvus_model.hybrid import BGEM3EmbeddingFunction

ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [24]:
connections.list_connections()

[('default', <pymilvus.client.grpc_handler.GrpcHandler at 0x7f2debf30ec0>),
 ('0364657d9ce64635b4de7ed82188296c',
  <pymilvus.client.grpc_handler.GrpcHandler at 0x7f2ec4f01400>),
 ('68e3b9d5874f4a1eb3b1a79a5172129d',
  <pymilvus.client.grpc_handler.GrpcHandler at 0x7f2e09b6faa0>)]

In [22]:
connections.remove_connection('default')

In [7]:
from pymilvus import (
    AnnSearchRequest,
    WeightedRanker,
)


def dense_search(col, query_dense_embedding, limit=10):
    search_params = {"metric_type": "COSINE", "params": {}}
    res = col.search(
        [query_dense_embedding],
        anns_field="dense_vector",
        limit=limit,
        output_fields=["definition_text"],
        param=search_params,
    )[0]
    return [hit.get("definition_text") for hit in res]


def sparse_search(col, query_sparse_embedding, limit=10):
    search_params = {
        "metric_type": "IP",
        "params": {},
    }
    res = col.search(
        [query_sparse_embedding],
        anns_field="sparse_vector",
        limit=limit,
        output_fields=["definition_text"],
        param=search_params,
    )[0]
    return [hit.get("definition_text") for hit in res]


def hybrid_search(
    col,
    query_dense_embedding,
    query_sparse_embedding,
    sparse_weight=1.0,
    dense_weight=1.0,
    limit=10,
):
    dense_search_params = {"metric_type": "COSINE", "params": {}}
    dense_req = AnnSearchRequest(
        [query_dense_embedding], "dense_vector", dense_search_params, limit=limit
    )
    sparse_search_params = {"metric_type": "IP", "params": {}}
    sparse_req = AnnSearchRequest(
        [query_sparse_embedding], "sparse_vector", sparse_search_params, limit=limit
    )
    rerank = WeightedRanker(sparse_weight, dense_weight)
    res = col.hybrid_search(
        [sparse_req, dense_req], rerank=rerank, limit=limit, output_fields=["definition_text"]
    )[0]
    return [hit.get("definition_text") for hit in res]


In [8]:
def doc_text_formatting(ef, query, docs):
    tokenizer = ef.model.tokenizer
    query_tokens_ids = tokenizer.encode(query, return_offsets_mapping=True)
    query_tokens = tokenizer.convert_ids_to_tokens(query_tokens_ids)
    formatted_texts = []

    for doc in docs:
        ldx = 0
        landmarks = []
        encoding = tokenizer.encode_plus(doc, return_offsets_mapping=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])[1:-1]
        offsets = encoding["offset_mapping"][1:-1]
        for token, (start, end) in zip(tokens, offsets):
            if token in query_tokens:
                if len(landmarks) != 0 and start == landmarks[-1]:
                    landmarks[-1] = end
                else:
                    landmarks.append(start)
                    landmarks.append(end)
        close = False
        formatted_text = ""
        for i, c in enumerate(doc):
            if ldx == len(landmarks):
                pass
            elif i == landmarks[ldx]:
                if close:
                    formatted_text += "</span>"
                else:
                    formatted_text += "<span style='color:red'>"
                close = not close
                ldx = ldx + 1
            formatted_text += c
        if close is True:
            formatted_text += "</span>"
        formatted_texts.append(formatted_text)
    return formatted_texts

In [9]:
from IPython.display import Markdown, display

query = input("Enter your search query: ")
print("Query: " + query)

query_embeddings = ef([query])

dense_results = dense_search(collection, query_embeddings["dense"][0], limit=5)
sparse_results = sparse_search(collection, query_embeddings["sparse"]._getrow(0), limit=5)
hybrid_results = hybrid_search(
    collection,
    query_embeddings["dense"][0],
    query_embeddings["sparse"]._getrow(0),
    sparse_weight=0.7,
    dense_weight=1.0,
    limit=5
)

display(Markdown("**Dense Search Results:**"))
formatted_results = doc_text_formatting(ef, query, dense_results)
for result in dense_results:
    display(Markdown(result))

display(Markdown("\n**Sparse Search Results:**"))
formatted_results = doc_text_formatting(ef, query, sparse_results)
for result in formatted_results:
    display(Markdown(result))

display(Markdown("\n**Hybrid Search Results:**"))
formatted_results = doc_text_formatting(ef, query, hybrid_results)
for result in formatted_results:
    display(Markdown(result))

Query: dog


**Dense Search Results:**

dog: means a kept animal of the Canis lupus species;

dog: means a kept animal of the Canis lupus species;

dog: means a kept animal of the Canis lupus species;

documentary check: means verification of the identification document accompanying the pet animal;

animals: means vertebrate and invertebrate animals;


**Sparse Search Results:**

<span style='color:red'>dog</span>: means a kept animal of the Canis lupus species;

<span style='color:red'>dog</span>: means a kept animal of the Canis lupus species;

<span style='color:red'>dog</span>: means a kept animal of the Canis lupus species;

assembly centre of<span style='color:red'> dog</span>s: cats and ferrets" means an establishment where those animals of the same health status are assembled from more than one establishment;

other carnivores: means animals of the species belonging to the order Carnivora other than<span style='color:red'> dog</span>s, cats and ferrets;


**Hybrid Search Results:**

<span style='color:red'>dog</span>: means a kept animal of the Canis lupus species;

<span style='color:red'>dog</span>: means a kept animal of the Canis lupus species;

<span style='color:red'>dog</span>: means a kept animal of the Canis lupus species;

documentary check: means verification of the identification document accompanying the pet animal;

animals: means vertebrate and invertebrate animals;

---

In [None]:
from LegalDefAgent.src.retriever.vector_store import setup_retriever, BGEMilvusSparseEmbeddings

retriever = setup_retriever()

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [33]:
from LegalDefAgent.src.retriever.vector_store import setup_retriever, BGEMilvusSparseEmbeddings
s = BGEMilvusSparseEmbeddings()


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [40]:
s.embed_query('dog')

query_embeddings = ef(['dog'])

dense_results = dense_search(collection, query_embeddings["dense"][0], limit=5)
sparse_results = sparse_search(collection, query_embeddings["sparse"]._getrow(0), limit=5)

search_params = {
    "metric_type": "IP",
    "params": {},
}
res = collection.search(
    [query_embeddings["sparse"]],
    anns_field="sparse_vector",
    limit=7,
    output_fields=["definition_text"],
    param=search_params,
)[0]

res

["id: 8263, distance: 0.0928601399064064, entity: {'definition_text': 'dog: means a kept animal of the Canis lupus species;'}", "id: 5008, distance: 0.0928601399064064, entity: {'definition_text': 'dog: means a kept animal of the Canis lupus species;'}", "id: 2772, distance: 0.0928601399064064, entity: {'definition_text': 'dog: means a kept animal of the Canis lupus species;'}", 'id: 2777, distance: 0.061868853867053986, entity: {\'definition_text\': \'assembly centre of dogs: cats and ferrets" means an establishment where those animals of the same health status are assembled from more than one establishment;\'}', "id: 8266, distance: 0.03597733750939369, entity: {'definition_text': 'other carnivores: means animals of the species belonging to the order Carnivora other than dogs, cats and ferrets;'}", "id: 441, distance: 0.031396761536598206, entity: {'definition_text': 'limited market: means a market for one of the following medicinal product types: veterinary medicinal products for th

In [54]:
t1 = s.embed_query('cat')

t2 = ef(['dog'])['sparse']

t2.data

array([0.30564666])

In [15]:
retriever.invoke('dog')

2025-01-03 13:13:38,607 [ERROR][handler]: RPC error: [hybrid_search], <MilvusException: (code=2000, message=Assert "static_cast<int>(field_meta.get_data_type()) == static_cast<int>(info.type())" at /workspace/milvus-lite/thirdparty/milvus/internal/core/src/query/Plan.cpp:48
 => vector type must be the same, field sparse_vector - type VECTOR_SPARSE_FLOAT, search info type VECTOR_FLOAT: segcore error)>, <Time:{'RPC start': '2025-01-03 13:13:38.588631', 'RPC error': '2025-01-03 13:13:38.607084'}> (decorators.py:140)


Assert "static_cast<int>(field_meta.get_data_type()) == static_cast<int>(info.type())" at /workspace/milvus-lite/thirdparty/milvus/internal/core/src/query/Plan.cpp:48
 => vector type must be the same, field sparse_vector - type VECTOR_SPARSE_FLOAT, search info type VECTOR_FLOAT
Assert "static_cast<int>(field_meta.get_data_type()) == static_cast<int>(info.type())" at /workspace/milvus-lite/thirdparty/milvus/internal/core/src/query/Plan.cpp:48
 => vector type must be the same, field sparse_vector - type VECTOR_SPARSE_FLOAT, search info type VECTOR_FLOAT
Assert "static_cast<int>(field_meta.get_data_type()) == static_cast<int>(info.type())" at /workspace/milvus-lite/thirdparty/milvus/internal/core/src/query/Plan.cpp:48
 => vector type must be the same, field sparse_vector - type VECTOR_SPARSE_FLOAT, search info type VECTOR_FLOAT
Assert "static_cast<int>(field_meta.get_data_type()) == static_cast<int>(info.type())" at /workspace/milvus-lite/thirdparty/milvus/internal/core/src/query/Plan.cpp

MilvusException: <MilvusException: (code=2000, message=Assert "static_cast<int>(field_meta.get_data_type()) == static_cast<int>(info.type())" at /workspace/milvus-lite/thirdparty/milvus/internal/core/src/query/Plan.cpp:48
 => vector type must be the same, field sparse_vector - type VECTOR_SPARSE_FLOAT, search info type VECTOR_FLOAT: segcore error)>