In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split

## Load Cleantech Data

In [106]:
# Import Cleantech data
df_cleantech_similarity = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/df_keyword_titles_cosine_similarity_radius_025_noun_chunks.json')
df_cleantech_cooccurence = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurence Analysis/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel_ids.csv', index_col=0)

In [107]:
# Delete all rows where all values are NaN
df_cleantech_cooccurence.dropna(how='all', inplace=True)

In [119]:
## MERGE DATAFRAMES - THRESHOLD FOR CO-OCCURENCE
co_occurence_threshold = 0.15 # 0.15 # Co-Occuring at every 1/i-th occurence of core keyword
# Extract all columns where one value is above the threshold
co_occurence_list = []
# Iterate over each column in the DataFrame
for column in tqdm(df_cleantech_cooccurence.columns):
    # Check if any value in the column is greater than the threshold
    if df_cleantech_cooccurence[column].max() >= co_occurence_threshold:
        # Add the column header to the list
        co_occurence_list.append(column)
# Delete duplicates
co_occurence_list = list(set(co_occurence_list))

100%|██████████| 151044/151044 [00:01<00:00, 83659.93it/s]


In [120]:
len(co_occurence_list)

128

In [121]:
# Build list out of all columns keyword_yake_lemma and keywords from lists in column keywords_bertforpatents
similarity_list = []
for index, row in tqdm(df_cleantech_similarity.iterrows()):
    similarity_list.append(row['keyword_yake_lemma'])
    for keyword in row['keywords_bertforpatents']:
        similarity_list.append(keyword)
# Delete duplicates
similarity_list = list(set(similarity_list))

467it [00:00, 10803.21it/s]


In [122]:
cleantech_list = co_occurence_list + similarity_list
df_cleantech = pd.DataFrame(cleantech_list, columns=['keyword_yake_lemma'])
df_cleantech['cleantech'] = 1

In [123]:
len(df_cleantech)

25431

## Load Non-Cleantech Data

In [124]:
df_non_cleantech = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/uspto_epo_rel_keywords_list_non_cleantech_noun_chunks_processed_embeddings.json')
# Randomly sample len(df_cleantech) rows from df_non_cleantech
df_non_cleantech = df_non_cleantech.sample(n=len(df_cleantech), random_state=42)
df_non_cleantech['cleantech'] = 0

In [125]:
# Drop all columns except keyword_yake_lemma and cleantech
df_non_cleantech.drop(columns=df_non_cleantech.columns.difference(['keyword_yake_lemma', 'cleantech']), inplace=True)

In [115]:
df_non_cleantech.head()

Unnamed: 0,keyword_yake_lemma,cleantech
8280,aromatase cytochrome,0
114418,radial passage,0
25603,complex scene,0
143281,tracking coil,0
116376,recently proposed model,0


# Scikit-Learn Classifier Comparison

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
model_bertforpatents = SentenceTransformer('anferico/bert-for-patents')

In [None]:
# Check if GPU is available
if torch.cuda.is_available():
    # Move model to GPU
    model_bertforpatents.to('cuda')

In [None]:
df_cleantech.head()

In [126]:
df_cleantech['keyword_yake_lemma_bertforpatents_embedding'] = model_bertforpatents.encode(df_cleantech['keyword_yake_lemma'].tolist(), show_progress_bar=True).tolist()
df_non_cleantech['keyword_yake_lemma_bertforpatents_embedding'] = model_bertforpatents.encode(df_non_cleantech['keyword_yake_lemma'].tolist(), show_progress_bar=True).tolist()

Batches:   0%|          | 0/795 [00:00<?, ?it/s]

Batches: 100%|██████████| 795/795 [00:11<00:00, 71.47it/s]
Batches: 100%|██████████| 795/795 [00:14<00:00, 54.95it/s]


In [127]:
# Concatenate dataframes
df = pd.concat([df_cleantech, df_non_cleantech], ignore_index=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['keyword_yake_lemma_bertforpatents_embedding'], df['cleantech'], test_size=0.2, random_state=42)

# Convert to numpy array
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())

In [None]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# from sklearn.linear_model import SGDClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.tree import DecisionTreeClassifier

In [128]:
# Define the classifiers
classifiers = {
    "Support Vector Machine": SVC(),
    # "Stochastic Gradient Descent": SGDClassifier(max_iter=1000, tol=1e-3),
    # "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
    # "Gaussian Processes": GaussianProcessClassifier(),
    # "Naive Bayes": GaussianNB(),
    # "Decision Tree": DecisionTreeClassifier()
}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(f"Classifier: {name}")
    print(classification_report(y_test, predictions))
    print("-" * 50)

Classifier: Support Vector Machine
              precision    recall  f1-score   support

           0       0.92      0.79      0.85      5113
           1       0.82      0.93      0.87      5060

    accuracy                           0.86     10173
   macro avg       0.87      0.86      0.86     10173
weighted avg       0.87      0.86      0.86     10173

--------------------------------------------------
