In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import csr_matrix
import pandas as pd
import re
import unicodedata
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

lemmatizer = WordNetLemmatizer()
tqdm.pandas()

# Prepare Cleantech Keyword List

In [7]:
def clean_and_lemmatize(text):
    """
    Normalize, clean, and lemmatize the input text.

    :param text: A string containing the text to be processed.
    :return: A string representing the processed text.
    """
    # Normalize the text with unicodedata
    text = unicodedata.normalize("NFKD", text).encode('ASCII', 'ignore').decode('utf-8')

    # Remove URLs, brackets, and non-alphabetic characters; convert to lowercase
    text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
    text = re.sub(r"https?:\/\/\S+", "", text)
    text = re.sub(r"[^a-zA-Z- ]", " ", text).lower().strip()

    # Lemmatize each word
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    return lemmatized_text

In [3]:
### Prepare Cleantech Data
# Co-Occurrence Directory
# co_occurrence_dir = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/'
# co_occurrence_files = glob.glob(co_occurrence_dir + '*.csv')
co_occurrence_files = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel_ids_semantic_similarity_02.csv'

# Similarity Directory
# similarity_dir = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/'
# similarity_files = glob.glob(similarity_dir + '*.json')
similarity_files = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/df_keyword_titles_cosine_similarity_radius_025_neighbors_100_noun_chunks.json'

# Co-Occurrence Threshold
# co_occurrence_threshold = [0.01, 0.025, 0.05, 0.1, 0.15]
co_occurrence_threshold = [0.01]

In [4]:
# Load the data
df_cleantech_cooccurrence = pd.read_csv(co_occurrence_files, index_col=0)
df_cleantech_cooccurrence.dropna(how='all', inplace=True)

df_cleantech_similarity = pd.read_json(similarity_files)

# Co-Occurrence Threshold
co_occurrence_threshold = 0.01  # Assuming you are using a single threshold value

# Create a mask for the co-occurrence threshold
mask = df_cleantech_cooccurrence.applymap(lambda x: x >= co_occurrence_threshold)

# Apply mask to DataFrame
filtered_co_occurrence_df = df_cleantech_cooccurrence[mask]

# Extract keywords
co_occurrence_list = filtered_co_occurrence_df.columns[filtered_co_occurrence_df.any()].tolist()

# Processing similarity data
similarity_series = pd.concat([df_cleantech_similarity['keyword_yake_lemma'], df_cleantech_similarity['keywords_keyword_yake_bertforpatents_embedding'].explode()], ignore_index=True)
similarity_list = similarity_series.drop_duplicates().tolist()

# Combine and deduplicate lists
cleantech_list = list(set(co_occurrence_list + similarity_list))
cleantech_list = [str(keyword) for keyword in cleantech_list]

# # Create DataFrame
# df_cleantech = pd.DataFrame(cleantech_list, columns=['keyword_yake_lemma'])
# df_cleantech['cleantech'] = 1

del df_cleantech_cooccurrence
del df_cleantech_similarity
del co_occurrence_list
del similarity_list

  mask = df_cleantech_cooccurrence.applymap(lambda x: x >= co_occurrence_threshold)


# Compute TF-IDF Matrices

In [8]:
g_uspto_cleantech = pd.read_json('/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/g_patent_claims_fulltext_cleantech.json')
g_uspto_cleantech['patent_id'] = 'us-' + g_uspto_cleantech['patent_id'].astype(str)
g_uspto_cleantech['claim_fulltext'] = g_uspto_cleantech['claim_fulltext'].apply(clean_and_lemmatize)

In [None]:
# Create Document List for HashingVectorizer
document_list = g_uspto_cleantech['claim_fulltext'].tolist()

# Cast to string using a generator expression
document_list = (str(x) for x in document_list)

In [None]:
# Create HashingVectorizer
vectorizer_uspto_cleantech = HashingVectorizer(n_features=len(cleantech_list))

In [None]:
# Create Document Term Matrix
document_term_matrix_uspto_cleantech = vectorizer_uspto_cleantech.fit_transform(document_list)

In [None]:
# Convert the Document Term Matrix to a Compressed Sparse Column matrix
document_term_matrix_uspto_cleantech = csr_matrix(document_term_matrix_uspto_cleantech.transpose())

In [None]:
# Create DataFrame
df_uspto_cleantech = pd.DataFrame.sparse.from_spmatrix(document_term_matrix_uspto_cleantech,
                                                       columns=g_uspto_cleantech['patent_id'])

In [None]:
# Save DataFrame
df_uspto_cleantech.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/df_uspto_cleantech_hashing.csv')