In [2]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# CPC Classification Preprocessing

In [2]:
df_cpc_y02 = pd.read_csv('/mnt/hdd01/patentsview/CPC Classification/CPCTitleList202308/cpc-section-Y_20230801.txt', sep='\t', header=None, names=['cpc_classification','sequence', 'title'])

In [3]:
for index, row in tqdm(df_cpc_y02.iterrows()):
    # Check length of cpc_classification in order to assign sequences
    if len(row['cpc_classification']) == 1:
        df_cpc_y02.loc[index, 'sequence'] = -3
    elif len(row['cpc_classification']) == 3:
        df_cpc_y02.loc[index, 'sequence'] = -2
    elif len(row['cpc_classification']) == 4:
        df_cpc_y02.loc[index, 'sequence'] = -1


15523it [00:00, 36469.37it/s]


In [4]:
# Delete all rows where cpc_classification does not contain "Y02"
df_cpc_y02 = df_cpc_y02[df_cpc_y02['cpc_classification'].str.contains('Y02')]

In [7]:
df_cpc_y02['full_title'] = df_cpc_y02['title']

In [9]:
df_cpc_y02.to_csv('/home/thiesen/Documents/Cleantech_Concepts/df_cpc_y02.csv')

Adjusted the full titles without code, as hard to handle all cases in the code.
Whenever the title starts with lower case, the full title is adjusted accordingly.

# CPC Classification Processing

In [154]:
df_cpc_y02 = pd.read_csv('/home/thiesen/Documents/Cleantech_Concepts/df_cpc_y02.csv')

In [155]:
df_cpc_y02['title_lower'] = df_cpc_y02['title'].str.lower()

In [156]:
import yake

# Create a list of all the titles
titles_lower = df_cpc_y02['title_lower'].tolist()

# Specify custom parameters
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.25
deduplication_algo = "seqm"
windowSize = 5
numOfKeywords = 10

# Initialize YAKE model
kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                    dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords,
                                    features=None)

# Extract keywords from titles
yake_keywords_titles_lower = []
for title in tqdm(titles_lower):
    keywords = kw_extractor.extract_keywords(title)
    temp_keyword_list = []
    for kw in keywords:
        temp_keyword_list.append(kw)
    yake_keywords_titles_lower.append(temp_keyword_list)

# Append the YAKE keywords to the dataframe
df_cpc_y02['yake_keywords_titles_lower'] = yake_keywords_titles_lower

100%|██████████| 340/340 [00:01<00:00, 337.99it/s]


In [157]:
min_yake_conf = 0.2
df_cpc_y02['yake_keywords_titles_filtered'] = [[] for _ in range(len(df_cpc_y02))]

# Iterate over rows in dataframe
for index,row in tqdm(df_cpc_y02.iterrows()):
    # Check if 'keywords_yake' column is not a list
    if not isinstance(row['yake_keywords_titles_lower'], list):
        continue
    # Check if 'keywords_yake' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['yake_keywords_titles_lower']):
        continue
    # Iterate over keywords in 'keywords_yake' column
    else:
        for keyword in row['yake_keywords_titles_lower']:
            # Check if keyword[1] is greater than or equal to min_yake_conf
            if keyword[1] <= min_yake_conf:
                # Append keyword to 'keywords_yake_filtered' column
                df_cpc_y02.loc[index, 'yake_keywords_titles_filtered'].append(keyword[0])

340it [00:00, 19737.63it/s]


In [158]:
import nltk
from nltk.corpus import stopwords

# Download stopwords
# nltk.download('stopwords')

# Generate copy of dataframe
df_cpc_y02 = df_cpc_y02.copy()

# Delete all stopwords from 'yake_keywords_titles_filtered' column
df_cpc_y02.loc[:, 'yake_keywords_titles_filtered'] = df_cpc_y02['yake_keywords_titles_filtered'].progress_apply(
    lambda x: [word for word in x if word not in (stopwords.words('english'))]
)

100%|██████████| 340/340 [00:00<00:00, 6048.41it/s]


In [160]:
import spacy

nlp = spacy.load('en_core_web_lg')

# POS tagging of 'full_title'
df_cpc_y02['pos_tags_full_title'] = df_cpc_y02['full_title'].progress_apply(lambda x: [(token.text, token.pos_) for token in nlp(x)])

100%|██████████| 340/340 [00:01<00:00, 299.63it/s]


In [161]:
def filter_noun_phrases(keywords, pos_tags):
    valid_keywords = []
    for keyword in keywords:
        # Split the keyword into words
        keyword_words = keyword.split()
        # Check if at least one word in the keyword string is a noun or proper noun
        if any(any(word == tag[0] and tag[1] in ['NOUN', 'PRON', 'PROPN'] for tag in pos_tags) for word in keyword_words):
            valid_keywords.append(keyword)
    return valid_keywords

# Apply the function to filter the keywords
df_cpc_y02['yake_keywords_titles_pos'] = df_cpc_y02.progress_apply(lambda row: filter_noun_phrases(row['yake_keywords_titles_filtered'], row['pos_tags_full_title']), axis=1)

100%|██████████| 340/340 [00:00<00:00, 74984.93it/s]


In [163]:
# Generate list of all classifications with exactly 4 characters
cpc_subclass = df_cpc_y02[df_cpc_y02['cpc_classification'].str.len() == 4]['cpc_classification'].tolist()

df_cpc_y02_subclass = pd.DataFrame(columns=['cpc_subclass', 'yake_keywords'])
# Set cpc_subclass to 'cpc_subclass' column
df_cpc_y02_subclass['cpc_subclass'] = cpc_subclass

In [187]:
# Initialize 'yake_keywords' column as empty string if it's not already a string
df_cpc_y02_subclass['yake_keywords'] = df_cpc_y02_subclass['yake_keywords'].apply(lambda x: '' if pd.isnull(x) else x)

for index, row in tqdm(df_cpc_y02_subclass.iterrows()):
    # For each row in df_cpc_y02, check every row in df_cpc_y02_subclass
    for _, sub_row in df_cpc_y02.iterrows():
        # Check if 'cpc_classification' contains string in 'cpc_subclass' column
        if row['cpc_subclass'] in sub_row['cpc_classification']:
            # Convert the list of keywords to a string and append to 'yake_keywords' column
            keywords_string = ', '.join(sub_row['yake_keywords_titles_pos'])
            df_cpc_y02_subclass.at[index, 'yake_keywords'] += keywords_string + ', '

# Cast cells of 'yake_keywords' column to lists
df_cpc_y02_subclass['yake_keywords'] = df_cpc_y02_subclass['yake_keywords'].str.split(', ')

8it [00:00, 83.81it/s]


In [190]:
# Generate list of all keywords
keyword_list = sum(df_cpc_y02_subclass['yake_keywords'].tolist(), [])

# Generate list of unique keywords
unique_keywords = list(set(keyword_list))

In [224]:
df_keyword = pd.DataFrame(columns=['keyword', 'cpc_subclass'])
# Set keyword to 'keyword' column
df_keyword['keyword'] = keyword_list

# Get the confidence of the keyword
def get_confidence(keyword):
    for index, row in df_cpc_y02.iterrows():
        for yake_keyword in row['yake_keywords_titles_lower']:
            if yake_keyword[0] == keyword:
                return yake_keyword[1]
    return None  # return None if the keyword is not found

df_keyword['yake_confidence'] = df_keyword['keyword'].progress_apply(get_confidence)

# Initialize 'cpc_subclass' column as empty string if it's not already a string
df_keyword['cpc_subclass'] = df_keyword['cpc_subclass'].apply(lambda x: '' if pd.isnull(x) else x)


for index,row in tqdm(df_keyword.iterrows()):
    # For each row in df_cpc_y02, check every row in df_cpc_y02_subclass
    for _, sub_row in df_cpc_y02_subclass.iterrows():
        # Check if 'yake_keywords' contains string in 'keyword' column
        if row['keyword'] in sub_row['yake_keywords']:
            # Append 'cpc_subclass' to 'cpc_subclass' column
            if sub_row['cpc_subclass'] not in df_keyword.at[index, 'cpc_subclass']:
                df_keyword.at[index, 'cpc_subclass'] += sub_row['cpc_subclass'] + ', '
    # Remove trailing comma and whitespace
    df_keyword.at[index, 'cpc_subclass'] = df_keyword.at[index, 'cpc_subclass'][:-2]

# Cast cells of 'cpc_subclass' column to lists
df_keyword['cpc_subclass'] = df_keyword['cpc_subclass'].str.split(', ')

100%|██████████| 818/818 [00:02<00:00, 277.24it/s] 
818it [00:00, 3006.75it/s]


In [217]:
# Delete all rows where n_gram is <=2 words
df_keyword = df_keyword[df_keyword['keyword'].str.count(' ') >= 1]

In [226]:
df_keyword_cpc_explode = df_keyword.explode('cpc_subclass')
# Reset index
df_keyword_cpc_explode = df_keyword_cpc_explode.reset_index(drop=True)

In [234]:
df_keyword.to_json('/mnt/hdd01/patentsview/CPC Classification/df_keyword_y02_postprocessed.json', orient='records')

# Keyword Embedding

In [200]:
from sentence_transformers import SentenceTransformer
import torch

In [201]:
# model_climatebert = SentenceTransformer('climatebert/distilroberta-base-climate-f')
# model_bertforpatents = SentenceTransformer('anferico/bert-for-patents')
model_patentsberta = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

In [202]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU available: {}".format(torch.cuda.get_device_name(0)))
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU available: NVIDIA RTX A4500


In [203]:
df_keyword['keyword_patentsberta_embedding'] = df_keyword['keyword'].progress_apply(lambda x: model_patentsberta.encode(x))

100%|██████████| 818/818 [00:06<00:00, 132.15it/s]


# Dimensionality Reduction

In [129]:
import numpy as np
from openTSNE import TSNE
from sklearn.model_selection import train_test_split

In [139]:
# Generate train/test split
patentsberta_x_train, patentsberta_x_test = train_test_split(df_keyword['keyword_patentsberta_embedding'].tolist(), test_size=0.2, random_state=42)
patentsberta_x_train_np = np.array(patentsberta_x_train)
patentsberta_x_test_np = np.array(patentsberta_x_test)

In [131]:
tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    n_jobs=8,
    random_state=42,
    n_iter=1000,
    verbose=True,
)

In [140]:
# Perform t-SNE on training data
patentsberta_embedding_train = tsne.fit(patentsberta_x_train_np)

--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_iter=1000, n_jobs=8, random_state=42,
     verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using exact search using euclidean distance...
   --> Time elapsed: 0.02 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.00 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.03 seconds
===> Running optimization with exaggeration=12.00, lr=45.00 for 250 iterations...
Iteration   50, KL divergence 0.3761, 50 iterations in 0.1219 sec
Iteration  100, KL divergence -2.9699, 50 iterations in 0.0399 sec
Iteration  150, KL divergence 2.6295, 50 iterations in 0.0422 sec
Iteration  200, KL divergence 2.6295, 50 iterations in 0.0431 sec
Iteration  250, KL divergence 2.6295, 50 iterations in 0.0350 sec
   --> Time elapsed: 0.28 seconds
===> Running optimization with exaggerat

In [141]:
patentsberta_embedding_test = patentsberta_embedding_train.transform(patentsberta_x_test_np)

===> Finding 15 nearest neighbors in existing embedding using exact search...
   --> Time elapsed: 0.01 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 1025.1242, 50 iterations in 0.0564 sec
Iteration  100, KL divergence 1002.3553, 50 iterations in 0.0569 sec
Iteration  150, KL divergence 993.3766, 50 iterations in 0.0590 sec
Iteration  200, KL divergence 992.2317, 50 iterations in 0.0294 sec
Iteration  250, KL divergence 989.5238, 50 iterations in 0.0353 sec
   --> Time elapsed: 0.24 seconds


In [142]:
patentsberta_embedding = np.concatenate((patentsberta_embedding_train, patentsberta_embedding_test), axis=0)
df_keyword['keyword_patentsberta_embedding_tsne'] = patentsberta_embedding.tolist()
df_keyword['keyword_patentsberta_embedding_tsne_x'] = df_keyword['keyword_patentsberta_embedding_tsne'].apply(lambda x: x[0])
df_keyword['keyword_patentsberta_embedding_tsne_y'] = df_keyword['keyword_patentsberta_embedding_tsne'].apply(lambda x: x[1])

# Plotting

In [143]:
import plotly.express as px

In [149]:
fig_patentsberta = px.scatter(df_keyword, x="keyword_bertforpatents_embedding_tsne_x", y="keyword_bertforpatents_embedding_tsne_y", hover_name="keyword", color="solar", title="PatentsBERTa t-SNE")
fig_patentsberta.show()

In [151]:
df_keyword.columns

Index(['keyword', 'cpc_subclass', 'keyword_patentsberta_embedding',
       'keyword_patentsberta_embedding_tsne',
       'keyword_patentsberta_embedding_tsne_x',
       'keyword_patentsberta_embedding_tsne_y',
       'keyword_bertforpatents_embedding',
       'keyword_bertforpatents_embedding_tsne',
       'keyword_bertforpatents_embedding_tsne_x',
       'keyword_bertforpatents_embedding_tsne_y', 'solar'],
      dtype='object')