In [1]:
from transformers import BertForSequenceClassification, BertTokenizer
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity, paired_distances
from scipy import spatial
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import json

In [2]:
# One-time environment setup: uncomment and run these if the packages are not installed yet.
# !pip install torch
# !pip install transformers
# !pip install sentence_transformers

Collecting torch
  Using cached torch-1.10.2-cp39-cp39-manylinux1_x86_64.whl (881.9 MB)
Installing collected packages: torch
Successfully installed torch-1.10.2
Collecting transformers
  Using cached transformers-4.16.2-py3-none-any.whl (3.5 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Collecting tqdm>=4.27
  Using cached tqdm-4.63.0-py2.py3-none-any.whl (76 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Collecting tokenizers!=0.11.3,>=0.10.1
  Using cached tokenizers-0.11.6-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
Collecting regex!=2019.12.17
  Using cached regex-2022.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (763 kB)
Collecting filelock
  Using cached filelock-3.6.0-py3-none-any.whl (10.0 kB)
Collecting joblib
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Installing collected packages: tqdm, regex, joblib, filelock, tokenizers, 

In [2]:
# Load the tokenizer and the model from the same pretrained checkpoint
MODEL_NAME = 'anferico/bert-for-patents'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_split(text, length):
    """Split ``text`` into consecutive chunks of at most ``length`` words.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces inside each chunk.

    Args:
        text: The string to split.
        length: Maximum number of words per chunk; must be positive.

    Returns:
        The chunk strings in their original order. A text without any words
        yields ``[""]`` so callers always receive at least one chunk.

    Raises:
        ValueError: If ``length`` is not positive.
    """
    if length <= 0:
        raise ValueError("length must be a positive number of words")

    # Tokenize once instead of re-splitting the whole text on every iteration.
    words = text.split()
    chunks = [" ".join(words[start:start + length])
              for start in range(0, len(words), length)]

    # Stepping by `length` never produces a trailing empty chunk when the word
    # count is an exact multiple of `length`; only a wordless text needs the
    # single empty placeholder.
    return chunks or [""]


def cls_pooling(model_output, attention_mask):
    """Pool by taking position 0 along the second axis of the first model output.

    ``attention_mask`` is accepted for signature compatibility but is not
    used by this pooling strategy.
    """
    first_output = model_output[0]
    return first_output[:, 0]

def get_sim_score(text1,text2,tokenizer,model):
    """Return the highest chunk-to-chunk cosine similarity between two texts.

    Both inputs are converted with ``str`` and cut into 300-word chunks by
    ``get_split``. All chunks are tokenized and encoded in one batch and
    pooled with ``cls_pooling``. The result is the largest cosine similarity
    over every (chunk of text1, chunk of text2) pair, and it is never lower
    than 0.
    """
    chunk_words = 300
    chunks_a = get_split(str(text1), chunk_words)
    chunks_b = get_split(str(text2), chunk_words)
    boundary = len(chunks_a)  # rows before this index belong to text1

    # a single tokenizer/model pass over the chunks of both texts
    batch = tokenizer(chunks_a + chunks_b, max_length=512, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**batch)
    vectors = cls_pooling(output, batch['attention_mask'])

    vecs_a = vectors[:boundary]
    vecs_b = vectors[boundary:]
    pair_scores = [1 - spatial.distance.cosine(vec_a, vec_b)
                   for vec_a in vecs_a
                   for vec_b in vecs_b]

    # the leading 0 is the floor: it is returned when no pair scores above it
    return max([0] + pair_scores)


def generate_vec_for_one(patent,tokenizer,model):
    """Encode every entry of one patent dictionary into chunk vectors.

    Each value of ``patent`` is converted with ``str`` and split into
    300-word chunks by ``get_split``. The chunks of all entries are tokenized
    and encoded in a single batch, pooled with ``cls_pooling``, and the
    resulting rows are handed back to the key they came from.

    Returns:
        A dict mapping each key of ``patent`` to the slice of pooled vectors
        produced for that key's chunks (one row per chunk).
    """
    all_chunks = []
    chunk_counts = []  # (key, number of chunks) in the order the keys were visited
    for field, raw_value in patent.items():
        pieces = get_split(str(raw_value), 300)
        all_chunks.extend(pieces)
        chunk_counts.append((field, len(pieces)))

    # encode every chunk of the patent in one pass
    batch = tokenizer(all_chunks, max_length=512, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**batch)
    pooled = cls_pooling(output, batch['attention_mask'])

    # hand each key its own consecutive rows; get_split always returns at
    # least one chunk, so every key owns at least one row
    vec_by_field = {}
    offset = 0
    for field, count in chunk_counts:
        vec_by_field[field] = pooled[offset:offset + count]
        offset += count

    return vec_by_field


def compare_two_patents(plaintiff_no,defendant_no,data_dict,tokenizer,model):
    """Score every (plaintiff part, defendant part) combination of two patents.

    Args:
        plaintiff_no: Key of the plaintiff patent in ``data_dict``.
        defendant_no: Key of the defendant patent in ``data_dict``.
        data_dict: Mapping from patent number to that patent's dictionary of
            parts.
        tokenizer: Tokenizer passed through to ``generate_vec_for_one``.
        model: Model passed through to ``generate_vec_for_one``.

    Returns:
        A DataFrame with the columns ``Plaintiff_Patent``,
        ``Defendant_Patent``, ``Plaintiff_Category``, ``Defendant_Category``
        and ``Similarity_Score``; one row per combination of parts. The score
        is the largest cosine similarity between any chunk vector of the
        plaintiff part and any chunk vector of the defendant part, with a
        floor of 0.

    Raises:
        KeyError: If either patent number is missing from ``data_dict``.
    """
    score_dict = {'Plaintiff_Patent':[],'Defendant_Patent':[],'Plaintiff_Category':[],'Defendant_Category':[],'Similarity_Score':[]}
    plaintiff = data_dict[plaintiff_no]
    defendant = data_dict[defendant_no]

    # Encode each patent with the helper that exists in this notebook; the
    # previously called `generate_vec` is not defined anywhere, so the
    # function failed with NameError on a fresh kernel.
    p_vec = generate_vec_for_one(plaintiff, tokenizer, model)
    d_vec = generate_vec_for_one(defendant, tokenizer, model)

    # generate the df
    for keyp, valuep in p_vec.items():
        for keyd, valued in d_vec.items():

            # keep the maximum similarity over all chunk pairs (never below 0)
            sim_score = 0
            for vec1 in valuep:
                for vec2 in valued:
                    cosine_sim = 1 - spatial.distance.cosine(vec1, vec2)
                    if cosine_sim > sim_score:
                        sim_score = cosine_sim

            score_dict['Plaintiff_Patent'].append(plaintiff_no)
            score_dict['Defendant_Patent'].append(defendant_no)
            score_dict['Plaintiff_Category'].append(keyp)
            score_dict['Defendant_Category'].append(keyd)
            score_dict['Similarity_Score'].append(sim_score)

    score_df = pd.DataFrame(score_dict)
    return score_df

def compare_two_patents_less_memo(plaintiff_no,defendant_no,data_dict):
    """Score every combination of parts of two patents, one pair at a time.

    Each (plaintiff part, defendant part) pair is scored by its own
    ``get_sim_score`` call, which uses less memory but takes much longer than
    encoding a whole patent at once. The ``tokenizer`` and ``model`` used here
    are the module-level names, not parameters.

    Returns:
        A DataFrame with the columns ``Plaintiff_Patent``,
        ``Defendant_Patent``, ``Plaintiff_Category``, ``Defendant_Category``
        and ``Similarity_Score``.
    """
    plaintiff = data_dict[plaintiff_no]
    defendant = data_dict[defendant_no]

    # (plaintiff key, defendant key, score) for each combination, in nested-loop order
    scored_pairs = [
        (keyp, keyd, get_sim_score(valuep, valued, tokenizer, model))
        for keyp, valuep in plaintiff.items()
        for keyd, valued in defendant.items()
    ]

    pair_count = len(scored_pairs)
    return pd.DataFrame({
        'Plaintiff_Patent': [plaintiff_no] * pair_count,
        'Defendant_Patent': [defendant_no] * pair_count,
        'Plaintiff_Category': [entry[0] for entry in scored_pairs],
        'Defendant_Category': [entry[1] for entry in scored_pairs],
        'Similarity_Score': [entry[2] for entry in scored_pairs],
    })

def compare_two_patents_with_dict(plaintiff_no,defendant_no,data_dict,tokenizer,model):
    """Score every combination of parts of two patents, reusing cached vectors.

    Vectors for each patent are looked up in the module-level
    ``patent_vec_dict``; on a miss they are computed with
    ``generate_vec_for_one`` and stored there under the patent number.

    Returns:
        A DataFrame with the columns ``Plaintiff_Patent``,
        ``Defendant_Patent``, ``Plaintiff_Category``, ``Defendant_Category``
        and ``Similarity_Score``. Each score is the largest cosine similarity
        between any chunk vector of the plaintiff part and any chunk vector of
        the defendant part, and is never lower than 0.
    """
    plaintiff = data_dict[plaintiff_no]
    defendant = data_dict[defendant_no]

    def _vectors_for(patent_no, patent):
        # compute on a cache miss and remember the result in patent_vec_dict
        if patent_no not in patent_vec_dict:
            patent_vec_dict[patent_no] = generate_vec_for_one(patent, tokenizer, model)
        return patent_vec_dict[patent_no]

    p_vec = _vectors_for(plaintiff_no, plaintiff)
    d_vec = _vectors_for(defendant_no, defendant)

    columns = {'Plaintiff_Patent':[],'Defendant_Patent':[],'Plaintiff_Category':[],'Defendant_Category':[],'Similarity_Score':[]}
    for keyp, chunks_p in p_vec.items():
        for keyd, chunks_d in d_vec.items():
            chunk_scores = [1 - spatial.distance.cosine(vec_p, vec_d)
                            for vec_p in chunks_p
                            for vec_d in chunks_d]
            # the leading 0 is the floor used when no chunk pair scores above it
            best_score = max([0] + chunk_scores)

            columns['Plaintiff_Patent'].append(plaintiff_no)
            columns['Defendant_Patent'].append(defendant_no)
            columns['Plaintiff_Category'].append(keyp)
            columns['Defendant_Category'].append(keyd)
            columns['Similarity_Score'].append(best_score)

    return pd.DataFrame(columns)

In [4]:
# Read the patent data dictionary from its JSON file
with open("patent_data.json", mode='r', encoding='UTF-8') as json_file:
    patent_data = json.load(json_file)

In [5]:
# Load the table of plaintiff and defendant patent numbers
# (the scoring loop reads its 'plaintiff_patent' and 'defendant_patent' columns).
df_train_no = pd.read_csv('Simplified_training_data_new.csv')

In [6]:
# Keep the 20 pairs in rows 350-369 of the training table.
# Label-based .loc is used instead of the positional [350:370] so that
# re-running this cell selects the same rows again rather than slicing the
# already-sliced frame down to nothing. This relies on the default integer
# index that read_csv produces.
df_train_no = df_train_no.loc[350:369]
df_train_no

Unnamed: 0,plaintiff_patent,defendant_patent,Label
350,US6027520A,US7976648B1,True
351,US6027520A,US6582448B1,True
352,US6027520A,US6855161B2,True
353,US6027520A,US6569184B2,True
354,US6027520A,US6746469B2,True
355,US6027520A,US6551341B2,True
356,US6027520A,US6575996B1,True
357,US6027520A,US7344549B2,True
358,US6027520A,US7942892B2,True
359,US6027520A,US20090105644A1,True


In [7]:
# Dict that stores the computed vectors per patent number; it is read and
# filled by compare_two_patents_with_dict so a patent already seen is not
# encoded again.
patent_vec_dict = {}

In [8]:
# Template for the final output dataframe (used as-is when there are no pairs)
score_dict = {'Plaintiff_Patent':[],'Defendant_Patent':[],'Plaintiff_Category':[],'Defendant_Category':[],'Similarity_Score':[]}

# Calculate the similarity dataframe for each pair, collect the frames in a
# list and concatenate once at the end. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, and appending inside the loop copies the
# accumulated frame on every iteration.
pair_frames = []
pairs = zip(df_train_no['plaintiff_patent'], df_train_no['defendant_patent'])
for n, (plaintiff_no, defendant_no) in enumerate(pairs, start=1):
    print('Now start the ',n,' th pair calculation.')
    pair_frames.append(compare_two_patents_with_dict(plaintiff_no,defendant_no,patent_data,tokenizer,model))
    print(n,'th has finished.')

# pd.concat raises on an empty list, so fall back to the empty template frame
if pair_frames:
    score_df = pd.concat(pair_frames, ignore_index=True)
else:
    score_df = pd.DataFrame(score_dict)


Now start the  1  th pair calculation.
1 th has finished.
Now start the  2  th pair calculation.
2 th has finished.
Now start the  3  th pair calculation.
3 th has finished.
Now start the  4  th pair calculation.
4 th has finished.
Now start the  5  th pair calculation.
5 th has finished.
Now start the  6  th pair calculation.
6 th has finished.
Now start the  7  th pair calculation.
7 th has finished.
Now start the  8  th pair calculation.
8 th has finished.
Now start the  9  th pair calculation.
9 th has finished.
Now start the  10  th pair calculation.
10 th has finished.
Now start the  11  th pair calculation.
11 th has finished.
Now start the  12  th pair calculation.
12 th has finished.
Now start the  13  th pair calculation.
13 th has finished.
Now start the  14  th pair calculation.
14 th has finished.
Now start the  15  th pair calculation.
15 th has finished.
Now start the  16  th pair calculation.
16 th has finished.
Now start the  17  th pair calculation.
17 th has finished

In [22]:
# Display the combined similarity table built by the scoring loop.
score_df

Unnamed: 0,Plaintiff_Patent,Defendant_Patent,Plaintiff_Category,Defendant_Category,Similarity_Score
0,US4762129B1,US7909811B2,Abstract,Abstract,0.271154
1,US4762129B1,US7909811B2,Abstract,Claim_Set_1,0.200510
2,US4762129B1,US7909811B2,Abstract,Claim_Set_2,0.259635
3,US4762129B1,US7909811B2,Abstract,Claim_Set_3,0.231864
4,US4762129B1,US7909811B2,Abstract,Figures_Desc,0.256694
...,...,...,...,...,...
668,US6413289B2,US20070084169A1,Disclosure,Claim_Set_1,0.764612
669,US6413289B2,US20070084169A1,Disclosure,Claim_Set_2,0.778324
670,US6413289B2,US20070084169A1,Disclosure,Claim_Set_3,0.783793
671,US6413289B2,US20070084169A1,Disclosure,Figures_Desc,0.875504


In [9]:
# Save the score dataframe to CSV. The "350-369" in the file name appears to
# mirror the row range sliced from the training pairs — keep the two in sync.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if that column is not wanted when the file is read back.
score_df.to_csv('Pairs_Part_Scores350-369.csv')

In [39]:
# Size probe: len() of the 'Figures_Desc' entry of patent US8216209B2
# (a character count if that entry is a string — TODO confirm).
len(patent_data['US8216209B2']['Figures_Desc'])

99316

In [107]:
# This part removes the similarity rows that involve the Abstract of a patent whose Abstract is a float ('nan') instead of text
import numpy as np

def get_useless_set(patent_data):
    """Return the keys of ``patent_data`` whose 'Abstract' entry is a float.

    The check is on the exact type ``float``; every entry is expected to have
    an 'Abstract' key.
    """
    return {
        patent_no
        for patent_no, fields in patent_data.items()
        if type(fields['Abstract']) is float
    }

def clean_abstract(df,useless_set):
    """Drop the rows that compare the Abstract of a patent in ``useless_set``.

    A row is removed when its plaintiff patent is in ``useless_set`` and its
    plaintiff category is 'Abstract', or when the same holds for the
    defendant columns.

    Args:
        df: DataFrame with the columns ``Plaintiff_Patent``,
            ``Plaintiff_Category``, ``Defendant_Patent`` and
            ``Defendant_Category``.
        useless_set: Collection of patent numbers whose Abstract rows are to
            be discarded.

    Returns:
        A new DataFrame holding only the rows that are kept; ``df`` itself is
        not modified.
    """
    # Whole-column boolean masks replace the per-row df.iloc lookups. They
    # also work for an empty frame, where inverting an empty float array
    # used to raise TypeError.
    plaintiff_hit = df['Plaintiff_Patent'].isin(useless_set) & (df['Plaintiff_Category'] == 'Abstract')
    defendant_hit = df['Defendant_Patent'].isin(useless_set) & (df['Defendant_Category'] == 'Abstract')

    df_new = df[~(plaintiff_hit | defendant_hit)]

    return df_new
    

In [108]:
# Load a previously saved score table, collect the patents whose Abstract is
# a float in patent_data, and drop the rows that compare such an Abstract.
df_needtoclean = pd.read_csv('Pairs_Part_Scores0-999.csv')
useless_set = get_useless_set(patent_data)
df_clean = clean_abstract(df_needtoclean,useless_set)

In [110]:
# Display the cleaned score table.
df_clean

Unnamed: 0.1,Unnamed: 0,Plaintiff_Patent,Defendant_Patent,Plaintiff_Category,Defendant_Category,Similarity_Score
0,0,US5897576A,WO2012063215A2,Abstract,Abstract,0.664770
1,1,US5897576A,WO2012063215A2,Abstract,Disclosure,0.735784
2,2,US5897576A,WO2012063215A2,Claim_Set_1,Abstract,0.607359
3,3,US5897576A,WO2012063215A2,Claim_Set_1,Disclosure,0.754643
4,4,US5897576A,WO2012063215A2,Claim_Set_2,Abstract,0.587054
...,...,...,...,...,...,...
29338,1808,US5707392A,US20050235725A1,Disclosure,Claim_Set_2,0.791960
29339,1809,US5707392A,US20050235725A1,Disclosure,Claim_Set_3,0.744056
29340,1810,US5707392A,US20050235725A1,Disclosure,Claim_Set_4,0.756714
29341,1811,US5707392A,US20050235725A1,Disclosure,Figures_Desc,0.795352


In [112]:
# Save the cleaned score table.
# NOTE(review): to_csv writes the index as one more unnamed column by default;
# pass index=False if that column is not wanted.
df_clean.to_csv('test_train_data.csv')