In [None]:
import pandas as pd
import numpy as np
from wordtrie import WordTrie
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import Pool
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Function to lemmatize a single text
def lemmatize_single_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str or any
        The text to lemmatize. Non-string values (e.g. ``None`` or NaN for a
        missing document) are accepted and passed through untouched.

    Returns
    -------
    str or any
        The tokens of ``text`` after ``WordNetLemmatizer.lemmatize``, joined by
        single spaces; or ``text`` itself when it is not a string or when
        lemmatization fails.
    """
    # Missing / non-text values are returned as-is instead of relying on an
    # exception from ``.split()`` to detect them.
    if not isinstance(text, str):
        return text
    lemmatizer = WordNetLemmatizer()
    try:
        return ' '.join(lemmatizer.lemmatize(w) for w in text.split())
    except Exception:
        # Best effort: keep the raw text if the lemmatizer fails. Catching
        # ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so worker processes can
        # still be interrupted.
        return text

# Lemmatize a whole collection of texts in parallel
def lemmatize_text(series, cores=12):
    """Run ``lemmatize_single_text`` over every element of ``series`` in a process pool.

    Parameters
    ----------
    series : iterable
        The texts to lemmatize (e.g. a DataFrame column).
    cores : int, default 12
        Number of worker processes handed to ``multiprocessing.Pool``.

    Returns
    -------
    list
        The result of ``Pool.map``: one lemmatized entry per input element,
        in input order.
    """
    with Pool(processes=cores) as workers:
        lemmatized = workers.map(lemmatize_single_text, series)
        return lemmatized

In [None]:
# Load the keyword table; the cells below read its 'keyword_yake_lemma' column
df_cleantech_keywords = pd.read_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cleantech_keywords_similarity_015_co_occurrence_01.json'
)

In [None]:
# Start both running match counters at zero; each corpus processed below adds its counts to them
df_cleantech_keywords = df_cleantech_keywords.assign(
    cleantech_trie_count=0,
    non_cleantech_trie_count=0,
)

In [None]:
# Build WordTrie
def make_wordtrie(keyword_list):
    """Create a ``WordTrie`` holding every string entry of ``keyword_list``.

    Parameters
    ----------
    keyword_list : iterable or None
        Candidate keywords. Entries that are not ``str`` are skipped.

    Returns
    -------
    WordTrie or None
        ``None`` when ``keyword_list`` is ``None``; otherwise the trie, in which
        each added keyword is stored with its running position (0, 1, 2, ...)
        among the added strings as value.

    Prints the number of keywords that were added.
    """
    trie = WordTrie()
    if keyword_list is None:
        return None
    string_keywords = [entry for entry in keyword_list if isinstance(entry, str)]
    for position, entry in enumerate(string_keywords):
        trie.add(entry, position)
    i = len(string_keywords)
    print(f"Added {i} keywords to trie")
    return trie

# Build the two tries used for matching the cleantech and the non-cleantech corpora.
# Both are built from the same 'keyword_yake_lemma' column, i.e. the same keywords are
# searched for in both corpora.
# NOTE(review): the two tries have identical content; if WordTrie.search does not
# modify the trie, one instance could serve both corpora - confirm before merging them.
cleantech_trie = make_wordtrie(df_cleantech_keywords['keyword_yake_lemma'].tolist())
non_cleantech_trie = make_wordtrie(df_cleantech_keywords['keyword_yake_lemma'].tolist())

# Cleantech Data

## USPTO

In [None]:
# Load the cleantech USPTO claims
df_cleantech_uspto = pd.read_json(
    '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/g_patent_claims_cleantech_yake.json'
)
# Overwrite claim_fulltext with its lemmatized version
df_cleantech_uspto['claim_fulltext'] = lemmatize_text(series=df_cleantech_uspto['claim_fulltext'])

In [None]:
# Search every claim text in the cleantech trie and keep, per document, the list of matches
# as space-joined strings (first element of each returned node); documents without a match get None.
df_cleantech_uspto["trie"] = df_cleantech_uspto["claim_fulltext"].apply(
    lambda text: [' '.join(hit[0]) for hit in cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per (document, matched keyword); the former index becomes the 'index' column
df_cleantech_trie_explode = df_cleantech_uspto["trie"].explode().to_frame().reset_index()
# Number of matches per keyword in this corpus
df_cleantech_trie_count = (
    df_cleantech_trie_explode
    .groupby('trie')['index']
    .count()
    .reset_index(name='cleantech_trie_count')
)

In [None]:
# Left-join the per-keyword match counts of this corpus onto the keyword table
# (keyword_yake_lemma == trie), add them to the running cleantech_trie_count
# (keywords without a match contribute 0) and drop the two helper columns again.
# Note: running this cell a second time adds the same counts once more.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        cleantech_trie_count=lambda frame: (
            frame['cleantech_trie_count'] + frame['cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Drop the intermediates of this corpus (everything except df_cleantech_keywords) to free memory
del df_cleantech_trie_count, df_cleantech_trie_explode, df_cleantech_uspto

## EPO

In [None]:
# Load the cleantech EPO texts
df_cleantech_epo = pd.read_json(
    '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot_cleaned.json'
)
# Overwrite cleaned_claims with its lemmatized version
df_cleantech_epo['cleaned_claims'] = lemmatize_text(series=df_cleantech_epo['cleaned_claims'])

In [None]:
# 'cleaned_claims' is already lemmatized in the cell that loads df_cleantech_epo.
# The second, identical lemmatize_text pass that used to live here was removed so the
# EPO claims are lemmatized exactly once, like every other corpus in this notebook.

In [None]:
# Keep only the rows whose cleaned_claims value is a string.
# The explicit .copy() makes the filtered result an independent frame, so adding the
# "trie" column to it afterwards does not write into a slice of the unfiltered frame
# (which pandas reports as SettingWithCopyWarning).
df_cleantech_epo = df_cleantech_epo[df_cleantech_epo['cleaned_claims'].apply(lambda x: isinstance(x, str))].copy()

In [None]:
# Search every claim text in the cleantech trie and keep, per document, the list of matches
# as space-joined strings (first element of each returned node); documents without a match get None.
df_cleantech_epo["trie"] = df_cleantech_epo["cleaned_claims"].apply(
    lambda text: [' '.join(hit[0]) for hit in cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per (document, matched keyword); the former index becomes the 'index' column
df_cleantech_trie_explode = df_cleantech_epo["trie"].explode().to_frame().reset_index()
# Number of matches per keyword in this corpus
df_cleantech_trie_count = (
    df_cleantech_trie_explode
    .groupby('trie')['index']
    .count()
    .reset_index(name='cleantech_trie_count')
)

In [None]:
# Left-join the per-keyword match counts of this corpus onto the keyword table
# (keyword_yake_lemma == trie), add them to the running cleantech_trie_count
# (keywords without a match contribute 0) and drop the two helper columns again.
# Note: running this cell a second time adds the same counts once more.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        cleantech_trie_count=lambda frame: (
            frame['cleantech_trie_count'] + frame['cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Drop the intermediates of this corpus (everything except df_cleantech_keywords) to free memory
del df_cleantech_trie_count, df_cleantech_trie_explode, df_cleantech_epo

## Reliance on Science

In [None]:
# Load the two cleantech "Reliance on Science" tables (EPO-side and USPTO-side)
df_cleantech_epo_rel = pd.read_json(
    '/mnt/hdd01/PATSTAT Working Directory/Reliance on Science/cleantech_epo_rel_on_science_abstract.json'
)
df_cleantech_uspto_rel = pd.read_json(
    '/mnt/hdd01/patentsview/Reliance on Science - Cleantech Patents/df_oaid_Cleantech_Y02_individual_works.json'
)

In [None]:
# Outer-join both tables on 'oaid' so rows present in either source are kept
df_cleantech_rel = df_cleantech_epo_rel.merge(
    df_cleantech_uspto_rel,
    how='outer',
    left_on='oaid',
    right_on='oaid',
)
# Prefer the abstract of the first table and fall back to the second where it is missing
df_cleantech_rel['abstract'] = df_cleantech_rel['abstract_x'].fillna(df_cleantech_rel['abstract_y'])
# Overwrite the combined abstract with its lemmatized version
df_cleantech_rel['abstract'] = lemmatize_text(series=df_cleantech_rel['abstract'])

In [None]:
# Search every abstract in the cleantech trie and keep, per document, the list of matches
# as space-joined strings (first element of each returned node); documents without a match get None.
# NOTE(review): abstracts missing in both source tables stay non-strings here - confirm that
# WordTrie.search accepts such values.
df_cleantech_rel['trie'] = df_cleantech_rel['abstract'].apply(
    lambda text: [' '.join(hit[0]) for hit in cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per (document, matched keyword); the former index becomes the 'index' column
df_cleantech_trie_explode = df_cleantech_rel["trie"].explode().to_frame().reset_index()
# Number of matches per keyword in this corpus
df_cleantech_trie_count = (
    df_cleantech_trie_explode
    .groupby('trie')['index']
    .count()
    .reset_index(name='cleantech_trie_count')
)

In [None]:
# Left-join the per-keyword match counts of this corpus onto the keyword table
# (keyword_yake_lemma == trie), add them to the running cleantech_trie_count
# (keywords without a match contribute 0) and drop the two helper columns again.
# Note: running this cell a second time adds the same counts once more.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        cleantech_trie_count=lambda frame: (
            frame['cleantech_trie_count'] + frame['cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Drop the intermediates of this corpus (everything except df_cleantech_keywords) to free memory
del df_cleantech_trie_count, df_cleantech_trie_explode
del df_cleantech_epo_rel, df_cleantech_uspto_rel, df_cleantech_rel

# Non Cleantech Data

## USPTO

In [None]:
# Load the non-cleantech USPTO claims
df_non_cleantech_uspto = pd.read_json(
    '/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/g_uspto_non_cleantech_claims_fulltext.json'
)
# Overwrite claim_fulltext with its lemmatized version
df_non_cleantech_uspto['claim_fulltext'] = lemmatize_text(series=df_non_cleantech_uspto['claim_fulltext'])

In [None]:
# Search every claim text in the non-cleantech trie and keep, per document, the list of matches
# as space-joined strings (first element of each returned node); documents without a match get None.
df_non_cleantech_uspto["trie"] = df_non_cleantech_uspto["claim_fulltext"].apply(
    lambda text: [' '.join(hit[0]) for hit in non_cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per (document, matched keyword); the former index becomes the 'index' column
df_non_cleantech_trie_explode = df_non_cleantech_uspto["trie"].explode().to_frame().reset_index()
# Number of matches per keyword in this corpus
df_non_cleantech_trie_count = (
    df_non_cleantech_trie_explode
    .groupby('trie')['index']
    .count()
    .reset_index(name='non_cleantech_trie_count')
)

In [None]:
# Left-join the per-keyword match counts of this corpus onto the keyword table
# (keyword_yake_lemma == trie), add them to the running non_cleantech_trie_count
# (keywords without a match contribute 0) and drop the two helper columns again.
# Note: running this cell a second time adds the same counts once more.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_non_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        non_cleantech_trie_count=lambda frame: (
            frame['non_cleantech_trie_count'] + frame['non_cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['non_cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Drop the intermediates of this corpus (everything except df_cleantech_keywords) to free memory
del df_non_cleantech_trie_count, df_non_cleantech_trie_explode, df_non_cleantech_uspto

## EPO

In [None]:
# Load the non-cleantech EPO claims
df_non_cleantech_epo = pd.read_json(
    '/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_epo_non_cleantech_text_data_pivot_claims_cleaned.json'
)
# Overwrite cleaned_claims with its lemmatized version
df_non_cleantech_epo['cleaned_claims'] = lemmatize_text(series=df_non_cleantech_epo['cleaned_claims'])

In [None]:
# Search every claim text in the non-cleantech trie and keep, per document, the list of matches
# as space-joined strings (first element of each returned node); documents without a match get None.
df_non_cleantech_epo["trie"] = df_non_cleantech_epo["cleaned_claims"].apply(
    lambda text: [' '.join(hit[0]) for hit in non_cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per (document, matched keyword); the former index becomes the 'index' column
df_non_cleantech_trie_explode = df_non_cleantech_epo["trie"].explode().to_frame().reset_index()
# Number of matches per keyword in this corpus
df_non_cleantech_trie_count = (
    df_non_cleantech_trie_explode
    .groupby('trie')['index']
    .count()
    .reset_index(name='non_cleantech_trie_count')
)

In [None]:
# Left-join the per-keyword match counts of this corpus onto the keyword table
# (keyword_yake_lemma == trie), add them to the running non_cleantech_trie_count
# (keywords without a match contribute 0) and drop the two helper columns again.
# Note: running this cell a second time adds the same counts once more.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_non_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        non_cleantech_trie_count=lambda frame: (
            frame['non_cleantech_trie_count'] + frame['non_cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['non_cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Drop the intermediates of this corpus (everything except df_cleantech_keywords) to free memory
del df_non_cleantech_trie_count, df_non_cleantech_trie_explode, df_non_cleantech_epo

## Reliance on Science

In [None]:
# Load the non-cleantech "Reliance on Science" works
df_non_cleantech_rel = pd.read_json(
    '/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_oaids_non_cleantech.json'
)
# Overwrite abstract with its lemmatized version
df_non_cleantech_rel['abstract'] = lemmatize_text(series=df_non_cleantech_rel['abstract'])

In [None]:
# Search every abstract in the non-cleantech trie and keep, per document, the list of matches
# as space-joined strings (first element of each returned node); documents without a match get None.
df_non_cleantech_rel["trie"] = df_non_cleantech_rel["abstract"].apply(
    lambda text: [' '.join(hit[0]) for hit in non_cleantech_trie.search(text, return_nodes=True)] or None
)

In [None]:
# One row per (document, matched keyword); the former index becomes the 'index' column
df_non_cleantech_trie_explode = df_non_cleantech_rel["trie"].explode().to_frame().reset_index()
# Number of matches per keyword in this corpus
df_non_cleantech_trie_count = (
    df_non_cleantech_trie_explode
    .groupby('trie')['index']
    .count()
    .reset_index(name='non_cleantech_trie_count')
)

In [None]:
# Left-join the per-keyword match counts of this corpus onto the keyword table
# (keyword_yake_lemma == trie), add them to the running non_cleantech_trie_count
# (keywords without a match contribute 0) and drop the two helper columns again.
# Note: running this cell a second time adds the same counts once more.
df_cleantech_keywords = (
    df_cleantech_keywords
    .merge(
        df_non_cleantech_trie_count,
        how='left',
        left_on='keyword_yake_lemma',
        right_on='trie',
        suffixes=('', '_from_trie_count'),
    )
    .assign(
        non_cleantech_trie_count=lambda frame: (
            frame['non_cleantech_trie_count'] + frame['non_cleantech_trie_count_from_trie_count'].fillna(0)
        )
    )
    .drop(columns=['non_cleantech_trie_count_from_trie_count', 'trie'])
)

In [None]:
# Drop the intermediates of this corpus (everything except df_cleantech_keywords) to free memory
del df_non_cleantech_trie_count, df_non_cleantech_trie_explode, df_non_cleantech_rel

# Compute the KL divergence between the two distributions

In [None]:
# Column totals used as denominators when the counts are turned into relative frequencies.
# Order matters: these totals are taken from the accumulated counts as they are at this point,
# i.e. before the following cell replaces zero counts, so the replacement values are not part
# of the totals. Re-running this cell after the replacement cell yields different totals.
sum_cleantech_trie_count = df_cleantech_keywords['cleantech_trie_count'].sum()
sum_non_cleantech_trie_count = df_cleantech_keywords['non_cleantech_trie_count'].sum()

In [None]:
# Replace zero counts before the log-ratio is computed (presumably to keep it finite):
# a zero 'cleantech_trie_count' becomes 1 ...
df_cleantech_keywords['cleantech_trie_count'] = df_cleantech_keywords['cleantech_trie_count'].mask(
    df_cleantech_keywords['cleantech_trie_count'].eq(0), 1
)
# ... and a zero 'non_cleantech_trie_count' becomes 0.001
df_cleantech_keywords['non_cleantech_trie_count'] = df_cleantech_keywords['non_cleantech_trie_count'].mask(
    df_cleantech_keywords['non_cleantech_trie_count'].eq(0), 0.001
)

In [None]:
# Count zero values in cleantech_trie_count and non_cleantech_trie_count
# num_zero_cleantech_trie_count = df_cleantech_keywords[df_cleantech_keywords['cleantech_trie_count'] == 0].shape[0]
# num_zero_non_cleantech_trie_count = df_cleantech_keywords[df_cleantech_keywords['non_cleantech_trie_count'] == 0].shape[0]

In [None]:
# Per-keyword term of the KL divergence KL(P || Q): p * ln(p / q), where
#   p = cleantech_trie_count / sum_cleantech_trie_count
#   q = non_cleantech_trie_count / sum_non_cleantech_trie_count
# The arithmetic is the same as before but done on whole columns instead of a
# row-by-row apply, which is much faster on large keyword tables and also works on an empty one.
cleantech_share = df_cleantech_keywords['cleantech_trie_count'] / sum_cleantech_trie_count
non_cleantech_share = df_cleantech_keywords['non_cleantech_trie_count'] / sum_non_cleantech_trie_count
df_cleantech_keywords['kl_divergence'] = cleantech_share * np.log(cleantech_share / non_cleantech_share)

In [None]:
# Persist the keyword table, now including both match counts and the kl_divergence column
df_cleantech_keywords.to_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cleantech_keywords_similarity_015_co_occurrence_01_kl_divergence.json'
)