In [2]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# CPC Classification Preprocessing

In [3]:
df_cpc_y02 = pd.read_csv('/mnt/hdd01/patentsview/CPC Classification/CPCTitleList202308/cpc-section-Y_20230801.txt', sep='\t', header=None, names=['cpc_classification','sequence', 'title'])

In [4]:
for index, row in tqdm(df_cpc_y02.iterrows()):
    # Check length of cpc_classification in order to assign sequences
    if len(row['cpc_classification']) == 1:
        df_cpc_y02.loc[index, 'sequence'] = -3
    elif len(row['cpc_classification']) == 3:
        df_cpc_y02.loc[index, 'sequence'] = -2
    elif len(row['cpc_classification']) == 4:
        df_cpc_y02.loc[index, 'sequence'] = -1


15523it [00:00, 34729.51it/s]


In [None]:
# Delete all rows where cpc_classification does not contain "Y02"
df_cpc_y02 = df_cpc_y02[df_cpc_y02['cpc_classification'].str.contains('Y02')]

In [None]:
df_cpc_y02['full_title'] = df_cpc_y02['title']

In [None]:
df_cpc_y02.to_csv('/home/thiesen/Documents/Cleantech_Concepts/df_cpc_y02.csv')

Adjusted the full titles without code, as hard to handle all cases in the code.
Whenever the title starts with lower case, the full title is added by looking at the superior group.

# CPC Classification Processing

In [5]:
df_cpc_y02 = pd.read_json('/mnt/hdd01/patentsview/CPC Classification/df_cpc_y02_cleantech.json')

In [None]:
# df_cpc_y02['title_lower'] = df_cpc_y02['title'].str.lower()

In [3]:
import yake

# Create a list of all the titles
titles_lower = df_cpc_y02['title_lower'].tolist()

# Specify custom parameters
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.25
deduplication_algo = "seqm"
windowSize = 5
numOfKeywords = 10

# Initialize YAKE model
kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                    dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords,
                                    features=None)

# Extract keywords from titles
yake_keywords_titles_lower = []
for title in tqdm(titles_lower):
    keywords = kw_extractor.extract_keywords(title)
    temp_keyword_list = []
    for kw in keywords:
        temp_keyword_list.append(kw)
    yake_keywords_titles_lower.append(temp_keyword_list)

# Append the YAKE keywords to the dataframe
df_cpc_y02['yake_keywords_titles_lower'] = yake_keywords_titles_lower

100%|██████████| 340/340 [00:00<00:00, 347.69it/s]


In [8]:
min_yake_conf = 0.05
df_cpc_y02['yake_keywords_titles_filtered'] = [[] for _ in range(len(df_cpc_y02))]

# Iterate over rows in dataframe
for index,row in tqdm(df_cpc_y02.iterrows()):
    # Check if 'keywords_yake' column is not a list
    if not isinstance(row['yake_keywords_titles_lower'], list):
        continue
    # Check if 'keywords_yake' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['yake_keywords_titles_lower']):
        continue
    # Iterate over keywords in 'keywords_yake' column
    else:
        for keyword in row['yake_keywords_titles_lower']:
            # Check if keyword[1] is greater than or equal to min_yake_conf
            if keyword[1] <= min_yake_conf:
                # Append keyword to 'keywords_yake_filtered' column
                df_cpc_y02.loc[index, 'yake_keywords_titles_filtered'].append(keyword[0])

340it [00:00, 9615.62it/s]


In [9]:
import nltk
from nltk.corpus import stopwords

# Download stopwords
# nltk.download('stopwords')

# Generate copy of df_keywords_list_unique_pruned
df_cpc_y02 = df_cpc_y02.copy()

# Delete all stopwords from 'keyword_yake' column
df_cpc_y02.loc[:, 'yake_keywords_titles_filtered'] = df_cpc_y02['yake_keywords_titles_filtered'].progress_apply(
    lambda x: '' if x in stopwords.words('english') else x
)

# Remove empty rows
df_cpc_y02 = df_cpc_y02[df_cpc_y02['yake_keywords_titles_filtered'] != '']

# Reset index
df_cpc_y02.reset_index(drop=True, inplace=True)

100%|██████████| 340/340 [00:00<00:00, 18334.11it/s]


In [10]:
import spacy

nlp = spacy.load('en_core_web_lg')

# POS tagging of 'full_title'
df_cpc_y02['pos_tags_full_title'] = df_cpc_y02['full_title'].progress_apply(lambda x: [(token.text, token.pos_) for token in nlp(x)])

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 340/340 [00:01<00:00, 301.93it/s]


In [11]:
def filter_noun_phrases(keywords, pos_tags):
    valid_keywords = []
    for keyword in keywords:
        # Split the keyword into words
        keyword_words = keyword.split()
        # Check if at least one word in the keyword string is a noun or proper noun
        if any(any(word == tag[0] and tag[1] in ['NOUN', 'PRON', 'PROPN'] for tag in pos_tags) for word in keyword_words):
            valid_keywords.append(keyword)
    return valid_keywords

# Apply the function to filter the keywords
df_cpc_y02['yake_keywords_titles_pos'] = df_cpc_y02.progress_apply(lambda row: filter_noun_phrases(row['yake_keywords_titles_filtered'], row['pos_tags_full_title']), axis=1)

100%|██████████| 340/340 [00:00<00:00, 54297.26it/s]


In [12]:
# Generate list of all classifications with exactly 4 characters
cpc_subclass = df_cpc_y02[df_cpc_y02['cpc_classification'].str.len() == 4]['cpc_classification'].tolist()

df_cpc_y02_subclass = pd.DataFrame(columns=['cpc_subclass', 'yake_keywords'])
# Set cpc_subclass to 'cpc_subclass' column
df_cpc_y02_subclass['cpc_subclass'] = cpc_subclass

In [16]:
# Initialize 'yake_keywords' column as empty string if it's not already a string
df_cpc_y02_subclass['yake_keywords'] = df_cpc_y02_subclass['yake_keywords'].apply(lambda x: '' if pd.isnull(x) else x)

for index, row in tqdm(df_cpc_y02_subclass.iterrows()):
    # For each row in df_cpc_y02, check every row in df_cpc_y02_subclass
    for _, sub_row in df_cpc_y02.iterrows():
        # Check if 'cpc_classification' contains string in 'cpc_subclass' column
        if row['cpc_subclass'] in sub_row['cpc_classification']:
            # Convert the list of keywords to a string and append to 'yake_keywords' column
            keywords_string = ', '.join(sub_row['yake_keywords_titles_pos'])
            df_cpc_y02_subclass.at[index, 'yake_keywords'] += keywords_string + ', '

# Cast cells of 'yake_keywords' column to lists
df_cpc_y02_subclass['yake_keywords'] = df_cpc_y02_subclass['yake_keywords'].str.split(', ')

8it [00:00, 92.77it/s]


In [17]:
# Generate list of all keywords
keyword_list = sum(df_cpc_y02_subclass['yake_keywords'].tolist(), [])

# Generate list of unique keywords
unique_keywords = list(set(keyword_list))

In [20]:
# Extract titles from nested lists in column 'yake_keywords_titles_lower'
df_cpc_y02['yake_keywords_titles_lower_kw'] = df_cpc_y02['yake_keywords_titles_lower'].progress_apply(lambda x: [keyword[0] for keyword in x])

100%|██████████| 340/340 [00:00<00:00, 203548.87it/s]


In [21]:
df_keyword = pd.DataFrame(columns=['keyword', 'cpc_subclass'])

# Set keyword to 'keyword' column
df_keyword['keyword'] = unique_keywords

# Get the confidence of the keyword
def get_confidence(keyword):
    for index, row in df_cpc_y02.iterrows():
        for yake_keyword in row['yake_keywords_titles_lower']:
            if yake_keyword[0] == keyword:
                return yake_keyword[1]
    return None  # return None if the keyword is not found

df_keyword['yake_confidence'] = df_keyword['keyword'].progress_apply(get_confidence)

# Initialize 'cpc_subclass' and 'cpc_classification' column as empty string if it's not already a string
df_keyword['cpc_subclass'] = df_keyword['cpc_subclass'].apply(lambda x: '' if pd.isnull(x) else x)

# Initialize 'cpc_classification' column
df_keyword['cpc_classification'] = ''
df_keyword['cpc_classification'] = df_keyword['cpc_classification'].apply(lambda x: '' if pd.isnull(x) else x)

for index,row in tqdm(df_keyword.iterrows()):
    # For each row in df_cpc_y02, check every row in df_cpc_y02_subclass
    for _, sub_row in df_cpc_y02.iterrows():
        # Check if 'yake_keywords' contains string in 'keyword' column
        if row['keyword'] in sub_row['yake_keywords_titles_lower_kw']:
            # Append 'cpc_subclass' and 'cpc_classification' to 'cpc_subclass' or 'cpc_classification' column
            # if sub_row['cpc_subclass'] not in df_keyword.at[index, 'cpc_subclass']:
            #     df_keyword.at[index, 'cpc_subclass'] += sub_row['cpc_subclass'] + ', '
            if sub_row['cpc_classification'] not in df_keyword.at[index, 'cpc_classification']:
                df_keyword.at[index, 'cpc_classification'] += sub_row['cpc_classification'] + ', '
    # Remove trailing comma and whitespace
    df_keyword.at[index, 'cpc_subclass'] = df_keyword.at[index, 'cpc_subclass'][:-2]
    df_keyword.at[index, 'cpc_classification'] = df_keyword.at[index, 'cpc_classification'][:-2]

# Cast cells of 'cpc_subclass' column to lists
# df_keyword['cpc_subclass'] = df_keyword['cpc_subclass'].str.split(', ')
df_keyword['cpc_classification'] = df_keyword['cpc_classification'].str.split(', ')

100%|██████████| 507/507 [00:02<00:00, 248.05it/s]
507it [00:04, 114.73it/s]


In [22]:
# Delete rows where 'keyword' is empty
df_keyword = df_keyword[df_keyword['keyword'] != '']
# Reset index
df_keyword.reset_index(drop=True, inplace=True)

In [None]:
# Delete all rows where n_gram is <=2 words - CURRENTLY NOT IMPLEMENTED
# df_keyword = df_keyword[df_keyword['keyword'].str.count(' ') >= 1]

In [24]:
df_keyword_cpc_explode = df_keyword.explode('cpc_subclass')
# Reset index
df_keyword_cpc_explode = df_keyword_cpc_explode.reset_index(drop=True)

In [10]:
df_keyword.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keyword_y02_classification_embeddings_processed.json', orient='records')

# Keyword Embedding

In [3]:
df_keyword = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keyword_y02_classification_embeddings_processed.json')

In [5]:
from sentence_transformers import SentenceTransformer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
model_climatebert = SentenceTransformer('climatebert/distilroberta-base-climate-f')
model_bertforpatents = SentenceTransformer('anferico/bert-for-patents')
model_patentsberta = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f. Creating a new one with MEAN pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at /home/thiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/anferico_bert-for-patents. Creating a new one with MEAN pooling.


In [7]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU available: {}".format(torch.cuda.get_device_name(0)))
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU available: NVIDIA RTX A4500


In [8]:
df_keyword['keyword_patentsberta_embedding'] = df_keyword['keyword'].progress_apply(lambda x: model_patentsberta.encode(x))
df_keyword['keyword_climatebert_embedding'] = df_keyword['keyword'].progress_apply(lambda x: model_climatebert.encode(x))
df_keyword['keyword_bertforpatents_embedding'] = df_keyword['keyword'].progress_apply(lambda x: model_bertforpatents.encode(x))

100%|██████████| 506/506 [00:04<00:00, 122.41it/s]
100%|██████████| 506/506 [00:02<00:00, 239.74it/s]
100%|██████████| 506/506 [00:06<00:00, 73.95it/s]


In [9]:
df_keyword['cpc_subclass'] = df_keyword['cpc_classification'].progress_apply(lambda x: x[-1][:4] if x and len(x[-1]) >= 4 else None)

100%|██████████| 506/506 [00:00<00:00, 941996.37it/s]


# Dimensionality Reduction

In [None]:
import numpy as np
from openTSNE import TSNE
from sklearn.model_selection import train_test_split

In [None]:
# Generate train/test split
patentsberta_x_train, patentsberta_x_test = train_test_split(df_keyword['keyword_patentsberta_embedding'].tolist(), test_size=0.2, random_state=42)
patentsberta_x_train_np = np.array(patentsberta_x_train)
patentsberta_x_test_np = np.array(patentsberta_x_test)

In [None]:
tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    n_jobs=8,
    random_state=42,
    n_iter=1000,
    verbose=True,
)

In [None]:
# Perform t-SNE on training data
patentsberta_embedding_train = tsne.fit(patentsberta_x_train_np)

In [None]:
patentsberta_embedding_test = patentsberta_embedding_train.transform(patentsberta_x_test_np)

In [None]:
patentsberta_embedding = np.concatenate((patentsberta_embedding_train, patentsberta_embedding_test), axis=0)
df_keyword['keyword_patentsberta_embedding_tsne'] = patentsberta_embedding.tolist()
df_keyword['keyword_patentsberta_embedding_tsne_x'] = df_keyword['keyword_patentsberta_embedding_tsne'].apply(lambda x: x[0])
df_keyword['keyword_patentsberta_embedding_tsne_y'] = df_keyword['keyword_patentsberta_embedding_tsne'].apply(lambda x: x[1])

# Plotting

In [None]:
df_keyword = pd.read_json('/mnt/hdd01/patentsview/CPC Classification/df_keyword_y02_postprocessed.json')

In [None]:
df_keyword.head()

In [None]:
import plotly.express as px

In [None]:
fig_patentsberta = px.scatter(df_keyword, x="keyword_bertforpatents_embedding_tsne_x", y="keyword_bertforpatents_embedding_tsne_y", hover_name="keyword", color="solar", title="PatentsBERTa t-SNE")
fig_patentsberta.show()