In [None]:
import pandas as pd
import re
from wordtrie import WordTrie
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the keyword table; the per-source trie match counts are accumulated onto it below.
df_cleantech_keywords = pd.read_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cleantech_keywords_similarity_015_co_occurrence_01.json'
)

In [None]:
# Add the two running match counters, both starting at zero.
df_cleantech_keywords = df_cleantech_keywords.assign(
    cleantech_trie_count=0,
    non_cleantech_trie_count=0,
)

In [None]:
# Build WordTrie
def make_wordtrie(keyword_list):
    """Build a WordTrie from the string entries of ``keyword_list``.

    Every string keyword is added with a running integer (0, 1, 2, ...) as its
    value, in iteration order. Non-string entries (e.g. None or NaN) are skipped
    and do not consume a value. The number of added keywords is printed.

    Args:
        keyword_list: Iterable of candidate keywords, or None.

    Returns:
        The populated WordTrie, or None when ``keyword_list`` is None.
    """
    # Check for "nothing to index" first so no trie is allocated just to be discarded.
    if keyword_list is None:
        return None

    trie = WordTrie()
    added = 0
    for keyword in keyword_list:
        # Only real strings can be indexed; missing values arrive as None/float NaN.
        if isinstance(keyword, str):
            trie.add(keyword, added)
            added += 1
    print(f"Added {added} keywords to trie")
    return trie

# Build one trie per corpus family from the 'keyword_yake_lemma' column.
# NOTE(review): both tries are built from the same column and so have identical
# contents — confirm that this is intended.
cleantech_trie = make_wordtrie(df_cleantech_keywords['keyword_yake_lemma'].to_list())
non_cleantech_trie = make_wordtrie(df_cleantech_keywords['keyword_yake_lemma'].to_list())

# Cleantech Data

## USPTO

In [None]:
# Load the USPTO cleantech claims.
df_cleantech_uspto = pd.read_json(
    '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/g_patent_claims_cleantech_yake.json'
)
# Search every claim text in the trie. The first element of each returned item is
# joined with spaces into a phrase; a text with no matches gets None instead of [].
df_cleantech_uspto["trie"] = df_cleantech_uspto["claim_fulltext"].apply(
    lambda text: [' '.join(hit[0]) for hit in cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per matched phrase; reset_index() keeps the source row label in an 'index' column.
df_cleantech_trie_explode = df_cleantech_uspto["trie"].explode().to_frame().reset_index()
# Count the exploded rows per phrase (rows without a match have a null 'trie' and drop out).
df_cleantech_trie_count = (
    df_cleantech_trie_explode
    .groupby('trie')
    .agg(cleantech_trie_count=('index', 'count'))
    .reset_index()
)

In [None]:
# Fold the USPTO counts into the running 'cleantech_trie_count' total:
#   1. left-join the per-phrase counts onto the keyword table (keyword_yake_lemma == trie);
#      the incoming count column gets the '_from_trie_count' suffix,
#   2. treat keywords without any match as 0 and add them to the running total,
#   3. drop the helper columns brought in by the join.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        cleantech_trie_count=lambda merged: (
            merged['cleantech_trie_count']
            + merged['cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Release the USPTO intermediates; only df_cleantech_keywords is needed from here on.
del df_cleantech_trie_count, df_cleantech_trie_explode, df_cleantech_uspto

## EPO

In [None]:
# Load the EPO cleantech text data.
df_cleantech_epo = pd.read_json(
    '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot_cleaned.json'
)
# Search every claim text in the trie. The first element of each returned item is
# joined with spaces into a phrase; a text with no matches gets None instead of [].
df_cleantech_epo["trie"] = df_cleantech_epo["claim_fulltext"].apply(
    lambda text: [' '.join(hit[0]) for hit in cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per matched phrase; reset_index() keeps the source row label in an 'index' column.
df_cleantech_trie_explode = df_cleantech_epo["trie"].explode().to_frame().reset_index()
# Count the exploded rows per phrase (rows without a match have a null 'trie' and drop out).
df_cleantech_trie_count = (
    df_cleantech_trie_explode
    .groupby('trie')
    .agg(cleantech_trie_count=('index', 'count'))
    .reset_index()
)

In [None]:
# Fold the EPO counts into the running 'cleantech_trie_count' total:
#   1. left-join the per-phrase counts onto the keyword table (keyword_yake_lemma == trie);
#      the incoming count column gets the '_from_trie_count' suffix,
#   2. treat keywords without any match as 0 and add them to the running total,
#   3. drop the helper columns brought in by the join.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        cleantech_trie_count=lambda merged: (
            merged['cleantech_trie_count']
            + merged['cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Release the EPO intermediates; only df_cleantech_keywords is needed from here on.
del df_cleantech_trie_count, df_cleantech_trie_explode, df_cleantech_epo

## Reliance on Science

In [None]:
# Load the two reliance-on-science tables (EPO side, then USPTO side).
df_cleantech_epo_rel = pd.read_json(
    '/mnt/hdd01/PATSTAT Working Directory/Reliance on Science/cleantech_epo_rel_on_science_abstract.json'
)
df_cleantech_uspto_rel = pd.read_json(
    '/mnt/hdd01/patentsview/Reliance on Science - Cleantech Patents/df_oaid_Cleantech_Y02_individual_works.json'
)

In [None]:
# Outer-join both reliance-on-science tables on 'oaid' and build a single 'abstract'
# column: the EPO-side abstract when present, otherwise the USPTO-side one.
df_cleantech_rel = (
    df_cleantech_epo_rel
    .merge(df_cleantech_uspto_rel, how='outer', left_on='oaid', right_on='oaid')
    .assign(abstract=lambda merged: merged['abstract_x'].fillna(merged['abstract_y']))
)
# Search every abstract in the trie. The first element of each returned item is
# joined with spaces into a phrase; a text with no matches gets None instead of [].
# NOTE(review): an abstract missing on both sides stays NaN and is passed to
# search() unchanged — confirm the trie tolerates non-string input.
df_cleantech_rel['trie'] = df_cleantech_rel['abstract'].apply(
    lambda text: [' '.join(hit[0]) for hit in cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per matched phrase; reset_index() keeps the source row label in an 'index' column.
df_cleantech_trie_explode = df_cleantech_rel["trie"].explode().to_frame().reset_index()
# Count the exploded rows per phrase (rows without a match have a null 'trie' and drop out).
df_cleantech_trie_count = (
    df_cleantech_trie_explode
    .groupby('trie')
    .agg(cleantech_trie_count=('index', 'count'))
    .reset_index()
)

In [None]:
# Fold the reliance-on-science counts into the running 'cleantech_trie_count' total:
#   1. left-join the per-phrase counts onto the keyword table (keyword_yake_lemma == trie);
#      the incoming count column gets the '_from_trie_count' suffix,
#   2. treat keywords without any match as 0 and add them to the running total,
#   3. drop the helper columns brought in by the join.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        cleantech_trie_count=lambda merged: (
            merged['cleantech_trie_count']
            + merged['cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Release the reliance-on-science intermediates; only df_cleantech_keywords is needed from here on.
del (
    df_cleantech_trie_count,
    df_cleantech_trie_explode,
    df_cleantech_epo_rel,
    df_cleantech_uspto_rel,
    df_cleantech_rel,
)

# Non Cleantech Data