In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split

## Load Cleantech Data

In [93]:
# Import Cleantech data
df_cleantech_similarity = pd.read_json('/Users/juergenthiesen/Documents/GitHub/Cleantech_Concepts-1/df_keyword_titles_cosine_similarity_radius_025_noun_chunks.json')
df_cleantech_cooccurence = pd.read_csv('/Users/juergenthiesen/Documents/GitHub/Cleantech_Concepts-1/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel.csv', index_col=0)

In [3]:
# Delete all rows where all values are NaN
df_cleantech_cooccurence.dropna(how='all', inplace=True)

In [29]:
## MERGE DATAFRAMES - THRESHOLD FOR CO-OCCURENCE
co_occurence_threshold = 0.175 # Co-Occuring at every 1/i-th occurence of core keyword
# Extract all columns where one value is above the threshold
co_occurence_list = []
# Iterate over each column in the DataFrame
for column in tqdm(df_cleantech_cooccurence.columns):
    # Check if any value in the column is greater than the threshold
    if df_cleantech_cooccurence[column].max() >= co_occurence_threshold:
        # Print the row where 'aachen' is greater than the threshold
        #print(df_cleantech_cooccurence[df_cleantech_cooccurence[column] >= co_occurence_threshold][column])
        # Add the column header to the list
        co_occurence_list.append(column)
# Delete duplicates
co_occurence_list = list(set(co_occurence_list))

100%|██████████| 151044/151044 [00:02<00:00, 60257.89it/s]


In [33]:
# Build list out of all columns keyword_yake_lemma and keywords from lists in column keywords_bertforpatents
similarity_list = []
for index, row in tqdm(df_cleantech_similarity.iterrows()):
    similarity_list.append(row['keyword_yake_lemma'])
    for keyword in row['keywords_bertforpatents']:
        similarity_list.append(keyword)
# Delete duplicates
similarity_list = list(set(similarity_list))

467it [00:00, 21996.20it/s]


In [36]:
cleantech_list = co_occurence_list + similarity_list
df_cleantech = pd.DataFrame(cleantech_list, columns=['keyword_yake_lemma'])
df_cleantech['cleantech'] = 1

## Load Non-Cleantech Data

In [45]:
df_non_cleantech = pd.read_json('/Users/juergenthiesen/Documents/GitHub/Cleantech_Concepts-1/uspto_epo_rel_keywords_list_non_cleantech_noun_chunks_processed_embeddings.json')
# Randomly sample len(df_cleantech) rows from df_non_cleantech
df_non_cleantech = df_non_cleantech.sample(len(df_cleantech), random_state=42).reset_index(drop=True)
df_non_cleantech['cleantech'] = 0

In [46]:
# Drop all columns except keyword_yake_lemma and cleantech
df_non_cleantech.drop(columns=df_non_cleantech.columns.difference(['keyword_yake_lemma', 'cleantech']), inplace=True)

In [47]:
df_non_cleantech.head()

Unnamed: 0,keyword_yake_lemma,cleantech
0,aromatase cytochrome,0
1,radial passage,0
2,complex scene,0
3,tracking coil,0
4,recently proposed model,0


# Scikit-Learn Classifier Comparison

In [53]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [54]:
model_bertforpatents = SentenceTransformer('anferico/bert-for-patents')

Downloading .gitattributes: 100%|██████████| 1.23k/1.23k [00:00<00:00, 472kB/s]
Downloading README.md: 100%|██████████| 1.56k/1.56k [00:00<00:00, 1.09MB/s]
Downloading config.json: 100%|██████████| 327/327 [00:00<00:00, 201kB/s]
Downloading model.safetensors: 100%|██████████| 1.38G/1.38G [00:14<00:00, 97.6MB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.38G/1.38G [00:54<00:00, 25.2MB/s]
Downloading vocab.txt: 100%|██████████| 329k/329k [00:00<00:00, 1.09MB/s]
No sentence-transformers model found with name /Users/juergenthiesen/.cache/torch/sentence_transformers/anferico_bert-for-patents. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/juergenthiesen/.cache/torch/sentence_transformers/anferico_bert-for-patents were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.

In [55]:
df_cleantech['keyword_yake_lemma_bertforpatents_embedding'] = df_cleantech['keyword_yake_lemma'].progress_apply(lambda x: model_bertforpatents.encode(x))
df_non_cleantech['keyword_yake_lemma_bertforpatents_embedding'] = df_non_cleantech['keyword_yake_lemma'].progress_apply(lambda x: model_bertforpatents.encode(x))

  0%|          | 114/25952 [00:10<39:11, 10.99it/s]


KeyboardInterrupt: 

In [48]:
# Concatenate dataframes - mark cleantech as 1 and non-cleantech as 0
df = pd.concat([df_cleantech, df_non_cleantech], ignore_index=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['keyword_yake_lemma'], df['cleantech'], test_size=0.2, random_state=42)

In [49]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [50]:
# Define the classifiers
classifiers = {
    "Support Vector Machine": SVC(),
    "Stochastic Gradient Descent": SGDClassifier(max_iter=1000, tol=1e-3),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
    "Gaussian Processes": GaussianProcessClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier()
}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(f"Classifier: {name}")
    print(classification_report(y_test, predictions))
    print("-" * 50)

ValueError: could not convert string to float: 'printing press'

- Optimization of the classifier parameters?