# Data Preprocessing

In [76]:
import numpy as np
import pandas as pd
from datasets import Dataset

## Configs

In [92]:
# Input file names (looked up inside the local data/ folder) and text options.
jobs_file: str = "isco_08.xlsx"                # Excel workbook with the ISCO 08 job definitions
patents_file: str = "patent_descriptions.csv"  # delimited text file with the patent records
text_to_lower: bool = False                    # True -> lower-case the text before embedding

## Data

In [93]:
# Resolve both input files inside the local data/ folder.
jobs_path = 'data/' + jobs_file
patents_path = 'data/' + patents_file

jobs_df = pd.read_excel(jobs_path)
# The patent file is pipe-delimited. Rows with an unexpected number of fields
# are skipped and reported as warnings instead of aborting the whole read.
patent_list = pd.read_csv(patents_path, sep='|', on_bad_lines='warn')

Skipping line 69112: expected 11 fields, saw 201



In [94]:
# Preview the patent table as loaded from disk.
patent_list

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link,description
0,US-9530091-B2,"Methods, architecture, and apparatus for imple...","Numenta, Inc.","Jeffrey Hawkins, Dileep George",2004-12-10,2012-04-03,2016-12-27,2016-12-27,https://patents.google.com/patent/US9530091B2/en,https://patentimages.storage.googleapis.com/d3...,This is a continuation of co-pending U.S. pate...
1,US-7493295-B2,"Method, system and computer program for develo...",Francisco J. Ayala,Francisco J. Ayala,2003-01-17,2005-03-31,2009-02-17,2009-02-17,https://patents.google.com/patent/US7493295B2/en,https://patentimages.storage.googleapis.com/47...,The present application is a continuation-in-p...
2,US-7089218-B1,Method for inclusion of psychological temperam...,"Neuric Technologies, Llc",Thomas A. Visel,2004-01-06,2005-06-16,2006-08-08,2006-08-08,https://patents.google.com/patent/US7089218B1/en,https://patentimages.storage.googleapis.com/33...,This is a continuation application of U.S. Ser...
3,US-7454388-B2,Device for the autonomous bootstrapping of use...,Thaler Stephen L,Stephen L. Thaler,2005-05-07,2006-05-08,2008-11-18,2008-11-18,https://patents.google.com/patent/US7454388B2/en,https://patentimages.storage.googleapis.com/df...,This application claims the priority of provis...
4,US-10417563-B1,Intelligent control with hierarchical stacked ...,"Michael Lamport Commons, Mitzi Sturgeon White","Michael Lamport Commons, Mitzi Sturgeon White",2002-09-30,2017-04-07,2019-09-17,2019-09-17,https://patents.google.com/patent/US10417563B1/en,https://patentimages.storage.googleapis.com/c1...,The present application is a Continuation of U...
...,...,...,...,...,...,...,...,...,...,...,...
69794,NL-2032693-B1,Monitoring a vessel,D M B Degroote Beheer B V,Maarten Bob Degroote Dirk,2022-08-05,2022-08-05,2024-02-09,2024-02-09,https://patents.google.com/patent/NL2032693B1/en,,Monitoring a vessel The invention relates to a...
69795,LU-501574-B1,System and method for constructing molecular r...,Univ Northeastern,Xiangying Meng,2022-03-02,2022-03-02,2022-09-01,2022-09-01,https://patents.google.com/patent/LU501574B1/en,,SYSTEM AND METHOD FOR CONSTRUCTING MOLECULAR R...
69796,US-11784749-B2,Polar coding Reed-Muller node optimization usi...,Qualcomm Incorporated,"Erman KOKEN, Gabi SARKIS, Hobin Kim, Hari Sank...",2022-02-14,2022-02-14,2023-10-10,2023-10-10,https://patents.google.com/patent/US11784749B2/en,https://patentimages.storage.googleapis.com/69...,The present disclosure relates generally to co...
69797,US-11719778-B1,Multicontrast synthetic late gadolinium enhanc...,"Regents Of The University Of Michigan, Case We...","Jesse Hamilton, Imran Rashid, Nicole SEIBERLIC...",2022-01-31,2022-01-31,2023-08-08,2023-08-08,https://patents.google.com/patent/US11719778B1/en,https://patentimages.storage.googleapis.com/04...,The invention generally relates to magnetic re...


Create some additional features

In [95]:
# Use the grant date wherever the priority date is missing.
patent_list["priority date"] = patent_list["priority date"].fillna(patent_list["grant date"])
# The first two characters of the id are kept as the country code (e.g. "US"),
# the first four characters of the (string) priority date as the year.
patent_list["country"] = patent_list["id"].str.slice(0, 2)
patent_list["year"] = patent_list["priority date"].str.slice(0, 4)

Filter out patents whose description is missing or too short to be useful (30 characters or fewer)

In [96]:
# Keep only patents whose description is longer than 30 characters; a missing
# description compares as False and is dropped as well.
has_description = patent_list["description"].str.len() > 30
# reset_index() renumbers the rows from 0 and keeps the previous row labels
# in a new "index" column.
patent_list = patent_list.loc[has_description].reset_index()

Collapse level 3 and level 4 descriptions

In [97]:
# Work only on the two most detailed levels (3 and 4). The explicit copy lets
# the code column be rewritten below without a SettingWithCopyWarning.
jobs_levels = jobs_df[jobs_df['Level'] >= 3].copy()

# An ISCO 08 code has as many digits as its level. Codes parsed as numbers
# lose their leading zeros (e.g. 0110 -> 110), so left-pad every code to its
# level's width BEFORE deriving the parent group. Otherwise the first three
# characters of such a level-4 code do not match any level-3 code and its
# definition is silently dropped by the inner merge below.
jobs_levels['ISCO 08 Code'] = [
    str(code).zfill(int(level))
    for code, level in zip(jobs_levels['ISCO 08 Code'], jobs_levels['Level'])
]

# Code and title of every level-3 group; used to label the collapsed rows.
jobs_copy = jobs_levels.loc[jobs_levels['Level'] == 3, ["ISCO 08 Code", "Title EN"]]

# Level-3 rows keep their own code; deeper rows are re-keyed to their level-3
# parent, i.e. the first three characters of their code.
jobs_levels['ISCO 08 Code'] = jobs_levels['ISCO 08 Code'].where(
    jobs_levels['Level'] == 3,
    jobs_levels['ISCO 08 Code'].str.slice(0, 3),
)

# One row per level-3 code: the definitions of the group and of all its
# children joined with a space, in their original row order, then labelled
# with the level-3 title.
jobs_df = (
    jobs_levels
    .groupby("ISCO 08 Code")["Definition"]
    .apply(' '.join)
    .reset_index()
    .merge(jobs_copy, how="inner", on="ISCO 08 Code")
)
jobs_df = jobs_df[["ISCO 08 Code", "Title EN", "Definition"]]

Create text features starting from descriptions

In [98]:
# Build the text column that is fed to the embedding model.
# The column must be called "Text" (capital T): the truncation cell and the
# similarity loop further down read patent_list["Text"] / jobs_df["Text"], so
# a lower-case "text" column makes them fail with a KeyError on a fresh run.
jobs_text = jobs_df['Definition']
patent_text = patent_list['description']

if text_to_lower:
    # Optional case folding, controlled from the config cell.
    jobs_text = jobs_text.str.lower()
    patent_text = patent_text.str.lower()

jobs_df['Text'] = jobs_text
patent_list['Text'] = patent_text

In [99]:
# Preview the patent table after filtering and feature creation.
patent_list

Unnamed: 0,index,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link,description,country,year,text
0,0,US-9530091-B2,"Methods, architecture, and apparatus for imple...","Numenta, Inc.","Jeffrey Hawkins, Dileep George",2004-12-10,2012-04-03,2016-12-27,2016-12-27,https://patents.google.com/patent/US9530091B2/en,https://patentimages.storage.googleapis.com/d3...,This is a continuation of co-pending U.S. pate...,US,2004,This is a continuation of co-pending U.S. pate...
1,1,US-7493295-B2,"Method, system and computer program for develo...",Francisco J. Ayala,Francisco J. Ayala,2003-01-17,2005-03-31,2009-02-17,2009-02-17,https://patents.google.com/patent/US7493295B2/en,https://patentimages.storage.googleapis.com/47...,The present application is a continuation-in-p...,US,2003,The present application is a continuation-in-p...
2,2,US-7089218-B1,Method for inclusion of psychological temperam...,"Neuric Technologies, Llc",Thomas A. Visel,2004-01-06,2005-06-16,2006-08-08,2006-08-08,https://patents.google.com/patent/US7089218B1/en,https://patentimages.storage.googleapis.com/33...,This is a continuation application of U.S. Ser...,US,2004,This is a continuation application of U.S. Ser...
3,3,US-7454388-B2,Device for the autonomous bootstrapping of use...,Thaler Stephen L,Stephen L. Thaler,2005-05-07,2006-05-08,2008-11-18,2008-11-18,https://patents.google.com/patent/US7454388B2/en,https://patentimages.storage.googleapis.com/df...,This application claims the priority of provis...,US,2005,This application claims the priority of provis...
4,4,US-10417563-B1,Intelligent control with hierarchical stacked ...,"Michael Lamport Commons, Mitzi Sturgeon White","Michael Lamport Commons, Mitzi Sturgeon White",2002-09-30,2017-04-07,2019-09-17,2019-09-17,https://patents.google.com/patent/US10417563B1/en,https://patentimages.storage.googleapis.com/c1...,The present application is a Continuation of U...,US,2002,The present application is a Continuation of U...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69794,69794,NL-2032693-B1,Monitoring a vessel,D M B Degroote Beheer B V,Maarten Bob Degroote Dirk,2022-08-05,2022-08-05,2024-02-09,2024-02-09,https://patents.google.com/patent/NL2032693B1/en,,Monitoring a vessel The invention relates to a...,NL,2022,Monitoring a vessel The invention relates to a...
69795,69795,LU-501574-B1,System and method for constructing molecular r...,Univ Northeastern,Xiangying Meng,2022-03-02,2022-03-02,2022-09-01,2022-09-01,https://patents.google.com/patent/LU501574B1/en,,SYSTEM AND METHOD FOR CONSTRUCTING MOLECULAR R...,LU,2022,SYSTEM AND METHOD FOR CONSTRUCTING MOLECULAR R...
69796,69796,US-11784749-B2,Polar coding Reed-Muller node optimization usi...,Qualcomm Incorporated,"Erman KOKEN, Gabi SARKIS, Hobin Kim, Hari Sank...",2022-02-14,2022-02-14,2023-10-10,2023-10-10,https://patents.google.com/patent/US11784749B2/en,https://patentimages.storage.googleapis.com/69...,The present disclosure relates generally to co...,US,2022,The present disclosure relates generally to co...
69797,69797,US-11719778-B1,Multicontrast synthetic late gadolinium enhanc...,"Regents Of The University Of Michigan, Case We...","Jesse Hamilton, Imran Rashid, Nicole SEIBERLIC...",2022-01-31,2022-01-31,2023-08-08,2023-08-08,https://patents.google.com/patent/US11719778B1/en,https://patentimages.storage.googleapis.com/04...,The invention generally relates to magnetic re...,US,2022,The invention generally relates to magnetic re...


In [88]:
# Wrap the preprocessed patent table in a `datasets.Dataset`.
patent_dataset = Dataset.from_pandas(patent_list)
# The upload is left disabled; uncomment to publish it as a private dataset.
#patent_dataset.push_to_hub('istat-ai/patents_dataset', private=True)

In [89]:
# Show the dataset's feature names and row count.
patent_dataset

Dataset({
    features: ['index', 'id', 'title', 'assignee', 'inventor/author', 'priority date', 'filing/creation date', 'publication date', 'grant date', 'result link', 'representative figure link', 'description', 'country', 'year', 'Text'],
    num_rows: 64737
})

<hr>

## Embedding Model

In [None]:
from FlagEmbedding import BGEM3FlagModel
from sklearn.metrics.pairwise import cosine_similarity

Load the model

In [None]:
# Instantiate the FlagEmbedding BGE-M3 wrapper for the 'BAAI/bge-m3' checkpoint
# (presumably fetched on first use, then read from the local cache; confirm).
BGE_model = BGEM3FlagModel('BAAI/bge-m3')

Truncate each text to its first 5,000 characters before encoding. This prevents out-of-memory errors.

In [38]:
# Only the first `max_text_chars` characters of each text are embedded.
max_text_chars = 5000
for frame in (patent_list, jobs_df):
    frame["Text_limit"] = frame["Text"].str.slice(stop=max_text_chars)

Map the texts to the embedding space

In [39]:
# Plain Python lists of the truncated texts, in row order.
patent_texts_list = patent_list["Text_limit"].tolist()
jobs_desc_list = jobs_df["Text_limit"].tolist()

# Request only the dense vectors; sparse and ColBERT outputs are switched off.
dense_only = dict(return_dense=True, return_sparse=False, return_colbert_vecs=False)

patent_embeddings = BGE_model.encode(patent_texts_list, **dense_only)
jobs_embeddings = BGE_model.encode(jobs_desc_list, **dense_only)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

## Results
Now we can extract similarity scores for each patent

In [60]:
# Build one result row per (patent, job group) pair:
#   [id, title, assignee, inventor/author, priority date, country, year,
#    ISCO 08 code, job title, dense similarity]
# Rows are emitted patent by patent, and within a patent in jobs_df row order.

# Dense embeddings, one row per encoded text, in the same row order as
# patent_list / jobs_df (that is how the texts were passed to encode()).
patent_vecs = patent_embeddings['dense_vecs']
job_vecs = jobs_embeddings['dense_vecs']

# All pairwise dot products in one matrix product:
# similarity[i, j] = <patent i, job j>. This replaces one dot product per pair
# inside a nested iterrows() loop; values agree up to floating-point rounding.
similarity = patent_vecs @ job_vecs.T

# Job columns are the same for every patent, so extract them once.
job_codes = jobs_df["ISCO 08 Code"].tolist()
job_titles = jobs_df["Title EN"].tolist()

patent_meta_cols = ["id", "title", "assignee", "inventor/author", "priority date", "country", "year"]
patent_rows = patent_list[patent_meta_cols + ["Text"]].itertuples(index=False, name=None)

result = []
# enumerate() gives the row POSITION, which is what indexes the embedding
# matrices; this stays correct even if a frame's index labels are not 0..n-1.
for patent_pos, (*patent_meta, patent_text) in enumerate(patent_rows):
    if len(patent_text) > 0:
        result.extend(
            [*patent_meta, job_code, job_title, score]
            for job_code, job_title, score in zip(job_codes, job_titles, similarity[patent_pos])
        )
    else:
        # Patent without text: keep a single placeholder row. The score is
        # NaN rather than '' so that rounding it later does not raise.
        result.append([*patent_meta, '', '', float('nan')])

# Only dense scores are produced here; sparse (lexical) and ColBERT scores
# would need the corresponding outputs to be requested from encode().

Create a dataframe to store results

In [61]:
# Column labels, in the order in which each result row was filled. The DENS_*
# columns hold the matched job code, job title and dense similarity score.
result_columns = [
    "id", "title", "assignee", "inventor", "date", "country", "year",
    "DENS_CODICE", "DENS_TITOLO", "DENS_SIM",
]
result_df = pd.DataFrame(data=result, columns=result_columns)

In [62]:
def _to_one_decimal(score):
    """Round a single similarity score to one decimal place."""
    return round(score, 1)


# Bucket the raw scores into 0.1-wide similarity classes.
result_df["SIMILARITY"] = result_df["DENS_SIM"].apply(_to_one_decimal)

In [64]:
# Count result rows per (year, country, job title) and similarity bucket,
# with "Totals" margins for both rows and columns.
similarity_counts = result_df.pivot_table(
    index=["year", "country", "DENS_TITOLO"],
    columns="SIMILARITY",
    values="id",
    aggfunc="count",
    margins=True,
    margins_name="Totals",
)
# Combinations that never occur come out as NaN; report them as zero.
result_df_pivot = similarity_counts.fillna(0)

In [None]:
# Write the pivot table to output.csv in the current working directory.
result_df_pivot.to_csv('output.csv')