In [1]:
import pandas as pd

In [2]:
# Read the two input JSON files into DataFrames.
yake_claims_path = '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/Cleantech Concepts/Yake/g_patent_claims_cleantech_yake.json'
cpc_grouped_path = '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/df_patentsview_patent_cpc_grouped_cleantech.json'

df = pd.read_json(yake_claims_path)
df_cpc = pd.read_json(cpc_grouped_path)

In [4]:
# Left-join df_cpc onto df by 'patent_id' and keep only the columns used
# downstream: 'claim_fulltext' and 'keywords_yake' (from df) plus
# 'patent_title' and 'cpc' (from df_cpc).
merged_columns = ['patent_id', 'claim_fulltext', 'keywords_yake', 'patent_title', 'cpc']
df_merged = df.merge(df_cpc, how='left', on='patent_id')[merged_columns]

In [6]:
# Convert df_merged['keywords_yake'] from a comma-separated string to a list of keywords
def _split_keywords(raw_keywords):
    """Split a comma-separated keyword string into a list of keywords.

    Each fragment is stripped of surrounding whitespace and empty fragments
    are dropped, so the same phrase is not stored under several spellings
    that differ only in padding.

    Values that are already lists are returned unchanged, which lets this
    cell be re-run without raising on the second pass. Any other non-string
    value (for example a missing value) yields an empty list.
    """
    if isinstance(raw_keywords, list):
        return raw_keywords
    if not isinstance(raw_keywords, str):
        return []
    fragments = (fragment.strip() for fragment in raw_keywords.split(','))
    return [fragment for fragment in fragments if fragment]

df_merged['keywords_yake'] = df_merged['keywords_yake'].apply(_split_keywords)

In [7]:
# Sets that accumulate every distinct CPC subclass / subgroup code
cpc_subclass_keywords_set = set()
cpc_subgroup_keywords_set = set()

# Route each classification field of interest to the set that collects it
code_sets_by_field = {
    'cpc_subclass': cpc_subclass_keywords_set,
    'cpc_subgroup': cpc_subgroup_keywords_set,
}

# Each 'cpc' cell is a dict of classification dicts; walk every
# (field, code) pair and record the code when the field is tracked
for patent_cpc in df_merged['cpc']:
    for classification in patent_cpc.values():
        for field, code in classification.items():
            target_set = code_sets_by_field.get(field)
            if target_set is not None:
                target_set.add(code)

# One single-column DataFrame per code set
df_cpc_subclass_keywords = pd.DataFrame(data=list(cpc_subclass_keywords_set), columns=['cpc_subclass'])
df_cpc_subgroup_keywords = pd.DataFrame(data=list(cpc_subgroup_keywords_set), columns=['cpc_subgroup'])


In [8]:
# One empty bucket per known CPC subclass / subgroup code
subclass_keywords_dict = {subclass: [] for subclass in df_cpc_subclass_keywords['cpc_subclass']}
subgroup_keywords_dict = {subgroup: [] for subgroup in df_cpc_subgroup_keywords['cpc_subgroup']}

# Route each classification field of interest to its bucket dictionary
keyword_buckets_by_field = {
    'cpc_subclass': subclass_keywords_dict,
    'cpc_subgroup': subgroup_keywords_dict,
}

# Walk the two needed columns in lockstep instead of using iterrows(), which
# builds a full Series for every row although only two values are read.
for keywords_yake, patent_cpc in zip(df_merged['keywords_yake'], df_merged['cpc']):
    # Each 'cpc' cell is a dict of classification dicts
    for classification in patent_cpc.values():
        for field, code in classification.items():
            buckets = keyword_buckets_by_field.get(field)
            # The patent's whole keyword list is appended as ONE element, so
            # each bucket is a list of lists at this point.
            # NOTE(review): a patent with several classifications sharing a
            # code is appended once per classification - confirm intended.
            if buckets is not None and code in buckets:
                buckets[code].append(keywords_yake)

# Attach the collected keyword lists to the per-code DataFrames
df_cpc_subclass_keywords['keywords_yake'] = df_cpc_subclass_keywords['cpc_subclass'].map(subclass_keywords_dict)
df_cpc_subgroup_keywords['keywords_yake'] = df_cpc_subgroup_keywords['cpc_subgroup'].map(subgroup_keywords_dict)


In [9]:
# Sort each frame by its CPC code column and renumber the rows from 0
df_cpc_subclass_keywords = (
    df_cpc_subclass_keywords
    .sort_values(by=['cpc_subclass'])
    .reset_index(drop=True)
)
df_cpc_subgroup_keywords = (
    df_cpc_subgroup_keywords
    .sort_values(by=['cpc_subgroup'])
    .reset_index(drop=True)
)

In [10]:
# Helper that concatenates nested lists into one flat list
def flatten_list_of_lists(list_of_lists):
    """Return one list holding the items of every sub-iterable, in order."""
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

# Flatten the nested keyword lists stored in 'keywords_yake' of both frames
for keyword_frame in (df_cpc_subclass_keywords, df_cpc_subgroup_keywords):
    keyword_frame['keywords_yake'] = keyword_frame['keywords_yake'].apply(flatten_list_of_lists)

In [11]:
import nltk
from collections import Counter
from nltk.stem import WordNetLemmatizer

# Ensure the WordNet corpus needed by the lemmatizer is available.
# The download only runs when the corpus cannot be found, so a fresh
# environment can execute the notebook without hand-editing this cell.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to process the keywords
def process_keywords(keywords):
    """Lemmatize keywords and return the distinct results, most frequent first.

    Parameters
    ----------
    keywords : iterable of str
        Keywords or keyphrases; a phrase may contain several
        whitespace-separated words.

    Returns
    -------
    list of str
        Each distinct lemmatized keyword once, ordered by descending
        frequency as given by ``Counter.most_common()``.
    """
    # Lemmatize word by word: WordNetLemmatizer.lemmatize() treats its
    # argument as a single word form, so a whole multi-word phrase passed in
    # one call comes back unchanged. Re-joining with single spaces also
    # normalizes the whitespace inside and around each phrase.
    lemmatized_keywords = [
        ' '.join(lemmatizer.lemmatize(token) for token in keyword.split())
        for keyword in keywords
    ]
    # Count occurrences of each lemmatized keyword
    keyword_frequencies = Counter(lemmatized_keywords)
    # most_common() yields (keyword, count) pairs in descending count order;
    # keeping only the keyword also removes duplicates
    return [keyword for keyword, _count in keyword_frequencies.most_common()]

# Derive 'keywords_yake_desc' from 'keywords_yake' with process_keywords, for both frames
for keyword_frame in (df_cpc_subclass_keywords, df_cpc_subgroup_keywords):
    keyword_frame['keywords_yake_desc'] = keyword_frame['keywords_yake'].apply(process_keywords)


In [12]:
# Define a function to filter out keywords with less than two words.
# It has its own name because a later cell defines `filter_keywords` with
# different behavior; sharing the name would let that definition silently
# replace this one whenever this cell is re-run afterwards.
def filter_multiword_keywords(keywords):
    """Return the keywords that contain at least two whitespace-separated words."""
    return [keyword for keyword in keywords if len(keyword.split()) >= 2]

# Apply the function to the 'keywords_yake_desc' column of each DataFrame
df_cpc_subclass_keywords['keywords_yake_desc'] = df_cpc_subclass_keywords['keywords_yake_desc'].apply(filter_multiword_keywords)
df_cpc_subgroup_keywords['keywords_yake_desc'] = df_cpc_subgroup_keywords['keywords_yake_desc'].apply(filter_multiword_keywords)

In [13]:
# Helper that removes keywords mentioning 'claim'
def filter_keywords(keywords):
    """Return the keywords whose lowercased text does not contain 'claim'.

    The check is a plain substring match, so any keyword containing those
    letters in sequence (in any letter case) is removed.
    """
    kept = []
    for keyword in keywords:
        if 'claim' in keyword.lower():
            continue
        kept.append(keyword)
    return kept

# Run filter_keywords over 'keywords_yake_desc' in both frames
for keyword_frame in (df_cpc_subclass_keywords, df_cpc_subgroup_keywords):
    keyword_frame['keywords_yake_desc'] = keyword_frame['keywords_yake_desc'].apply(filter_keywords)


In [14]:
def filter_cross_row_duplicates(df, column_name):
    """Keep only the first occurrence of each keyword across the whole column.

    Rows are processed in order. A keyword is kept the first time it is met
    and dropped from every later position, whether that is later in the same
    row's list or in a following row. The column is overwritten on ``df``
    itself and nothing is returned.
    """
    already_emitted = set()  # keywords that have been kept so far

    def keep_first_occurrences(keywords_list):
        # The shared set is only mutated, never rebound
        fresh_keywords = []
        for keyword in keywords_list:
            if keyword in already_emitted:
                continue
            already_emitted.add(keyword)
            fresh_keywords.append(keyword)
        return fresh_keywords

    # Replace every row's list with its de-duplicated version
    df[column_name] = df[column_name].apply(keep_first_occurrences)

# De-duplicate 'keywords_yake_desc' across rows, separately within each frame
for keyword_frame in (df_cpc_subclass_keywords, df_cpc_subgroup_keywords):
    filter_cross_row_duplicates(keyword_frame, 'keywords_yake_desc')


In [15]:
# Add 'keywords_yake_desc_5000' to the subclass frame: the first 5000 entries of each row's list
df_cpc_subclass_keywords['keywords_yake_desc_5000'] = df_cpc_subclass_keywords['keywords_yake_desc'].map(
    lambda ranked_keywords: ranked_keywords[:5000]
)
# Add 'keywords_yake_desc_1000' to the subgroup frame: the first 1000 entries of each row's list
df_cpc_subgroup_keywords['keywords_yake_desc_1000'] = df_cpc_subgroup_keywords['keywords_yake_desc'].map(
    lambda ranked_keywords: ranked_keywords[:1000]
)

In [16]:
# Write both keyword frames to JSON, one record per row
subclass_output_path = '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/Cleantech Concepts/df_cpc_subclass_keywords_yake.json'
subgroup_output_path = '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/Cleantech Concepts/df_cpc_subgroup_keywords_yake.json'

df_cpc_subclass_keywords.to_json(subclass_output_path, orient='records')
df_cpc_subgroup_keywords.to_json(subgroup_output_path, orient='records')