In [None]:
import re
import pandas as pd
import unicodedata
import multiprocessing as mp
from wordtrie import WordTrie
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Register tqdm with pandas so Series/DataFrame objects gain `progress_apply`
tqdm.pandas()
# Module-level WordNet lemmatizer instance, used by clean_and_lemmatize below
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_and_lemmatize(text):
    """
    Normalize, clean, and lemmatize the input text.

    Processing steps, in order:
      1. NFKD-decompose the text and drop every character without an ASCII form.
      2. Remove bracketed spans ([...], (...), {...}) and http(s) URLs.
      3. Drop every character other than letters, digits, '-', space and
         . , ; ! ? and then lowercase and strip the result.
      4. Lemmatize each whitespace-separated token with the module-level
         lemmatizer and re-join the tokens with single spaces.

    :param text: A string containing the text to be processed. Any non-string
                 value (e.g. None or a NaN float from a missing cell) is
                 treated as empty text.
    :return: A string representing the processed text; "" for non-string input.
    """
    # Missing values would otherwise raise TypeError in unicodedata.normalize,
    # which aborts a whole pool.imap run over a DataFrame column.
    if not isinstance(text, str):
        return ""

    # Normalize the text with unicodedata
    text = unicodedata.normalize("NFKD", text).encode('ASCII', 'ignore').decode('utf-8')

    # Remove bracketed spans first, then URLs, then non-whitelisted characters;
    # finally convert to lowercase and trim surrounding whitespace
    text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
    text = re.sub(r"https?:\/\/\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9- .,;!?]", "", text).lower().strip()

    # Lemmatize each whitespace-separated token.
    # NOTE(review): punctuation kept by the filter above stays attached to its
    # token (e.g. "cells,"), so such tokens reach the lemmatizer with the
    # punctuation included -- confirm this is intended.
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    return lemmatized_text

# Prepare Cleantech Data

In [None]:
### Prepare Cleantech Data
# Input locations and parameters used to build the cleantech keyword list.

# Co-occurrence matrix (CSV). Despite the plural name this is a single file path;
# the commented-out lines are the directory-glob variant.
# co_occurrence_dir = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/'
# co_occurrence_files = glob.glob(co_occurrence_dir + '*.csv')
co_occurrence_files = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel_ids_semantic_similarity_02.csv'

# Keyword similarity results (JSON); likewise a single file path.
# similarity_dir = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/'
# similarity_files = glob.glob(similarity_dir + '*.json')
similarity_files = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/df_keyword_titles_cosine_similarity_radius_025_neighbors_100_noun_chunks.json'

# Co-occurrence threshold candidates.
# NOTE: this binds a list; the data-loading cell re-assigns co_occurrence_threshold
# to a scalar before it is used in a comparison.
# co_occurrence_threshold = [0.01, 0.025, 0.05, 0.1, 0.15]
co_occurrence_threshold = [0.01]

In [None]:
# Load the co-occurrence matrix and drop rows that contain no values at all
df_cleantech_cooccurrence = pd.read_csv(co_occurrence_files, index_col=0)
df_cleantech_cooccurrence.dropna(how='all', inplace=True)

df_cleantech_similarity = pd.read_json(similarity_files)

# Co-Occurrence Threshold
# The configuration cell binds this name to a list; the comparison below needs a scalar.
co_occurrence_threshold = 0.01

# Boolean mask of cells at or above the threshold. The vectorized comparison
# replaces an element-wise applymap; NaN cells compare as False.
mask = df_cleantech_cooccurrence >= co_occurrence_threshold

# Matrix with every below-threshold cell replaced by NaN
filtered_co_occurrence_df = df_cleantech_cooccurrence[mask]

# Keywords = columns with at least one cell at or above the threshold
co_occurrence_list = mask.columns[mask.any()].tolist()

# Processing similarity data: the source keywords plus every neighbour keyword
similarity_series = pd.concat([df_cleantech_similarity['keyword_yake_lemma'], df_cleantech_similarity['keywords_keyword_yake_bertforpatents_embedding'].explode()], ignore_index=True)
# explode() emits NaN for empty lists; drop missing entries so the str() conversion
# below cannot turn them into bogus 'nan' / 'None' keywords.
similarity_list = similarity_series.dropna().drop_duplicates().tolist()

# Combine both sources, stringify, and deduplicate after stringification.
# Sorting makes the keyword order reproducible across kernel restarts.
cleantech_list = sorted({str(keyword) for keyword in co_occurrence_list + similarity_list})

# Single-column frame holding the final keyword list
df_cleantech = pd.DataFrame(cleantech_list, columns=['keyword_yake_lemma'])
# df_cleantech['cleantech'] = 1

# Release the large inputs; only df_cleantech is needed from here on
del df_cleantech_cooccurrence
del df_cleantech_similarity
del co_occurrence_list
del similarity_list

In [None]:
# Build WordTrie
def make_wordtrie(keyword_list):
    """
    Build a WordTrie from the string entries of ``keyword_list``.

    Each string keyword is added with a running integer (0, 1, 2, ...) as its
    value; entries that are not ``str`` are skipped. The number of added
    keywords is printed.

    :param keyword_list: Iterable of keywords, or None.
    :return: The populated WordTrie, or None when ``keyword_list`` is None.
    """
    word_trie = WordTrie()
    if keyword_list is None:
        return None

    added = 0
    string_keywords = (entry for entry in keyword_list if isinstance(entry, str))
    # enumerate from 1 so `added` ends up as the count; the stored value is the 0-based position
    for added, entry in enumerate(string_keywords, start=1):
        word_trie.add(entry, added - 1)

    print(f"Added {added} keywords to trie")
    return word_trie

# Build the trie from every keyword in df_cleantech['keyword_yake_lemma'];
# it is used by all search cells below
cleantech_trie = make_wordtrie(df_cleantech['keyword_yake_lemma'].tolist())

# Prepare Trie Counts

## USPTO - Cleantech

In [None]:
# Load the claims and prefix every patent id with 'us-'
g_uspto_cleantech = pd.read_json('/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/g_patent_claims_fulltext_cleantech.json')
g_uspto_cleantech['patent_id'] = 'us-' + g_uspto_cleantech['patent_id'].astype(str)

# Clean and lemmatize the 'claim_fulltext' column on at most 6 worker processes.
# The with-block shuts the workers down even when a task raises.
with mp.Pool(min(mp.cpu_count(), 6)) as pool:
    # imap yields results in input order, so they line up with the DataFrame rows;
    # list() consumes every result before the pool is torn down
    results = list(tqdm(
        pool.imap(clean_and_lemmatize, g_uspto_cleantech['claim_fulltext']),
        total=g_uspto_cleantech.shape[0],
    ))

g_uspto_cleantech['claim_fulltext'] = results

In [None]:
# Search each cleaned claim text for keywords stored in the trie
g_uspto_cleantech["trie"] = g_uspto_cleantech["claim_fulltext"].progress_apply(
    lambda claim_text: cleantech_trie.search(claim_text, return_nodes=True)
)
# Join the first element of every match into one space-separated phrase
# (presumably the matched word sequence -- confirm against WordTrie.search);
# rows without any match become None
g_uspto_cleantech["trie"] = g_uspto_cleantech["trie"].progress_apply(
    lambda matches: None if len(matches) == 0 else [' '.join(match[0]) for match in matches]
)

In [None]:
# Keep only the id and the matched keywords
g_uspto_cleantech = g_uspto_cleantech.loc[:, ['patent_id', 'trie']]

In [None]:
# Persist the result. to_csv also writes the DataFrame index as the first column,
# and list values in 'trie' are stored as their string representation.
g_uspto_cleantech.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_uspto_cleantech_trie.csv')

## USPTO - Non Cleantech

In [10]:
# Load the claims and prefix every patent id with 'us-'
g_uspto_non_cleantech = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/g_uspto_non_cleantech_claims_fulltext.json')
g_uspto_non_cleantech['patent_id'] = 'us-' + g_uspto_non_cleantech['patent_id'].astype(str)

# Clean and lemmatize the 'claim_fulltext' column on at most 6 worker processes.
# The with-block shuts the workers down even when a task raises.
with mp.Pool(min(mp.cpu_count(), 6)) as pool:
    # imap yields results in input order, so they line up with the DataFrame rows;
    # list() consumes every result before the pool is torn down
    results = list(tqdm(
        pool.imap(clean_and_lemmatize, g_uspto_non_cleantech['claim_fulltext']),
        total=g_uspto_non_cleantech.shape[0],
    ))

g_uspto_non_cleantech['claim_fulltext'] = results

100%|██████████| 599997/599997 [03:10<00:00, 3155.46it/s]


In [11]:
# Search each cleaned claim text for keywords stored in the trie
g_uspto_non_cleantech["trie"] = g_uspto_non_cleantech["claim_fulltext"].progress_apply(
    lambda claim_text: cleantech_trie.search(claim_text, return_nodes=True)
)
# Join the first element of every match into one space-separated phrase
# (presumably the matched word sequence -- confirm against WordTrie.search);
# rows without any match become None
g_uspto_non_cleantech["trie"] = g_uspto_non_cleantech["trie"].progress_apply(
    lambda matches: None if len(matches) == 0 else [' '.join(match[0]) for match in matches]
)

100%|██████████| 599997/599997 [04:29<00:00, 2226.40it/s]
100%|██████████| 599997/599997 [00:13<00:00, 43362.17it/s]


In [12]:
# Keep only the id and the matched keywords
g_uspto_non_cleantech = g_uspto_non_cleantech.loc[:, ['patent_id', 'trie']]

In [13]:
# Persist the result. to_csv also writes the DataFrame index as the first column,
# and list values in 'trie' are stored as their string representation.
g_uspto_non_cleantech.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_uspto_non_cleantech_trie.csv')

## EPO - Cleantech

In [14]:
# Load the data and build the 'ep-'-prefixed id
g_epo_cleantech = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot_cleaned.json')
g_epo_cleantech['patent_id'] = 'ep-' + g_epo_cleantech['publn_nr'].astype(str)
# The export below keeps 'publn_nr', so store the prefixed id there as well;
# this matches the EPO non-cleantech section, which prefixes 'publn_nr' directly.
g_epo_cleantech['publn_nr'] = g_epo_cleantech['patent_id']

# Delete all rows where 'cleaned_claims' is not str. The copy makes the filtered
# frame independent, so the column assignment below does not write to a slice.
g_epo_cleantech = g_epo_cleantech[g_epo_cleantech['cleaned_claims'].apply(lambda x: isinstance(x, str))].copy()

# Clean and lemmatize the 'cleaned_claims' column on at most 6 worker processes.
# The with-block shuts the workers down even when a task raises.
with mp.Pool(min(mp.cpu_count(), 6)) as pool:
    # imap yields results in input order, so they line up with the DataFrame rows;
    # list() consumes every result before the pool is torn down
    results = list(tqdm(
        pool.imap(clean_and_lemmatize, g_epo_cleantech['cleaned_claims']),
        total=g_epo_cleantech.shape[0],
    ))

g_epo_cleantech['cleaned_claims'] = results

100%|██████████| 179597/179597 [00:31<00:00, 5761.08it/s]


In [15]:
# Search each cleaned claim text for keywords stored in the trie
g_epo_cleantech["trie"] = g_epo_cleantech["cleaned_claims"].progress_apply(
    lambda claim_text: cleantech_trie.search(claim_text, return_nodes=True)
)
# Join the first element of every match into one space-separated phrase
# (presumably the matched word sequence -- confirm against WordTrie.search);
# rows without any match become None
g_epo_cleantech["trie"] = g_epo_cleantech["trie"].progress_apply(
    lambda matches: None if len(matches) == 0 else [' '.join(match[0]) for match in matches]
)

100%|██████████| 179597/179597 [00:33<00:00, 5397.99it/s]
100%|██████████| 179597/179597 [00:02<00:00, 65138.64it/s]


In [16]:
# Keep only the 'publn_nr' id column and the matched keywords in 'trie'
g_epo_cleantech = g_epo_cleantech.loc[:, ['publn_nr', 'trie']]

In [17]:
# Persist the result. to_csv also writes the DataFrame index as the first column,
# and list values in 'trie' are stored as their string representation.
g_epo_cleantech.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_epo_cleantech_trie.csv')

## EPO - Non Cleantech

In [18]:
# Load the data and prefix every publication number with 'ep-'
g_epo_non_cleantech = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_epo_non_cleantech_text_data_pivot_claims_cleaned.json')
g_epo_non_cleantech['publn_nr'] = 'ep-' + g_epo_non_cleantech['publn_nr'].astype(str)

# Clean and lemmatize the 'cleaned_claims' column on at most 6 worker processes.
# The with-block shuts the workers down even when a task raises.
with mp.Pool(min(mp.cpu_count(), 6)) as pool:
    # imap yields results in input order, so they line up with the DataFrame rows;
    # list() consumes every result before the pool is torn down
    results = list(tqdm(
        pool.imap(clean_and_lemmatize, g_epo_non_cleantech['cleaned_claims']),
        total=g_epo_non_cleantech.shape[0],
    ))

g_epo_non_cleantech['cleaned_claims'] = results

100%|██████████| 181920/181920 [00:33<00:00, 5511.48it/s]


In [19]:
# Search each cleaned claim text for keywords stored in the trie
g_epo_non_cleantech["trie"] = g_epo_non_cleantech["cleaned_claims"].progress_apply(
    lambda claim_text: cleantech_trie.search(claim_text, return_nodes=True)
)
# Join the first element of every match into one space-separated phrase
# (presumably the matched word sequence -- confirm against WordTrie.search);
# rows without any match become None
g_epo_non_cleantech["trie"] = g_epo_non_cleantech["trie"].progress_apply(
    lambda matches: None if len(matches) == 0 else [' '.join(match[0]) for match in matches]
)

100%|██████████| 181920/181920 [01:15<00:00, 2404.83it/s]
100%|██████████| 181920/181920 [00:03<00:00, 55564.01it/s]


In [20]:
# Keep only the id and the matched keywords
g_epo_non_cleantech = g_epo_non_cleantech.loc[:, ['publn_nr', 'trie']]

In [21]:
# Persist the result. to_csv also writes the DataFrame index as the first column,
# and list values in 'trie' are stored as their string representation.
g_epo_non_cleantech.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_epo_non_cleantech_trie.csv')

## REL - Cleantech

In [22]:
# Read the records and rewrite 'oaid' as a 'rel-'-prefixed string id
g_rel_cleantech = (
    pd.read_json('/mnt/hdd01/patentsview/Reliance on Science - Cleantech Patents/df_oaid_cleantech_yake_noun_chunks.json')
    .assign(oaid=lambda frame: 'rel-' + frame['oaid'].astype(str))
)

In [23]:
# Clean and lemmatize the 'abstract' column on at most 6 worker processes.
# The with-block shuts the workers down even when a task raises.
with mp.Pool(min(mp.cpu_count(), 6)) as pool:
    # imap yields results in input order, so they line up with the DataFrame rows;
    # list() consumes every result before the pool is torn down
    results = list(tqdm(
        pool.imap(clean_and_lemmatize, g_rel_cleantech['abstract']),
        total=g_rel_cleantech.shape[0],
    ))

g_rel_cleantech['abstract'] = results

100%|██████████| 623364/623364 [00:50<00:00, 12380.58it/s]


In [24]:
# Search each cleaned abstract for keywords stored in the trie
g_rel_cleantech["trie"] = g_rel_cleantech["abstract"].progress_apply(
    lambda abstract_text: cleantech_trie.search(abstract_text, return_nodes=True)
)
# Join the first element of every match into one space-separated phrase
# (presumably the matched word sequence -- confirm against WordTrie.search);
# rows without any match become None
g_rel_cleantech["trie"] = g_rel_cleantech["trie"].progress_apply(
    lambda matches: None if len(matches) == 0 else [' '.join(match[0]) for match in matches]
)

100%|██████████| 623364/623364 [01:27<00:00, 7116.44it/s] 
100%|██████████| 623364/623364 [00:04<00:00, 135185.05it/s]


In [25]:
# Keep only the id and the matched keywords
g_rel_cleantech = g_rel_cleantech.loc[:, ['oaid', 'trie']]

In [26]:
# Persist the result. to_csv also writes the DataFrame index as the first column,
# and list values in 'trie' are stored as their string representation.
g_rel_cleantech.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_rel_cleantech_trie.csv')

## REL - Non Cleantech

In [27]:
# Load the data and prefix every 'oaid' with 'rel-'
g_rel_non_cleantech = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_oaids_non_cleantech.json')
g_rel_non_cleantech['oaid'] = 'rel-' + g_rel_non_cleantech['oaid'].astype(str)

# Clean and lemmatize the 'abstract' column on at most 6 worker processes.
# The with-block shuts the workers down even when a task raises.
with mp.Pool(min(mp.cpu_count(), 6)) as pool:
    # imap yields results in input order, so they line up with the DataFrame rows;
    # list() consumes every result before the pool is torn down
    results = list(tqdm(
        pool.imap(clean_and_lemmatize, g_rel_non_cleantech['abstract']),
        total=g_rel_non_cleantech.shape[0],
    ))

g_rel_non_cleantech['abstract'] = results

100%|██████████| 611441/611441 [00:48<00:00, 12578.07it/s]


In [28]:
# Search each cleaned abstract for keywords stored in the trie
g_rel_non_cleantech["trie"] = g_rel_non_cleantech["abstract"].progress_apply(
    lambda abstract_text: cleantech_trie.search(abstract_text, return_nodes=True)
)
# Join the first element of every match into one space-separated phrase
# (presumably the matched word sequence -- confirm against WordTrie.search);
# rows without any match become None
g_rel_non_cleantech["trie"] = g_rel_non_cleantech["trie"].progress_apply(
    lambda matches: None if len(matches) == 0 else [' '.join(match[0]) for match in matches]
)

100%|██████████| 611441/611441 [01:32<00:00, 6604.92it/s] 
100%|██████████| 611441/611441 [00:02<00:00, 211365.44it/s]


In [29]:
# Keep only the id and the matched keywords
g_rel_non_cleantech = g_rel_non_cleantech.loc[:, ['oaid', 'trie']]

In [30]:
# Persist the result. to_csv also writes the DataFrame index as the first column,
# and list values in 'trie' are stored as their string representation.
g_rel_non_cleantech.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_rel_non_cleantech_trie.csv')