In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

tqdm.pandas()

# Build Matrices for the Sci-Kit Learn Model

## Prepare Cleantech Data

In [3]:
### Prepare Cleantech Data
co_occurrence_file = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel_ids_semantic_similarity_02.csv'
similarity_file = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/df_keyword_titles_cosine_similarity_radius_015_neighbors_100_noun_chunks.json'

df_cleantech_cooccurrence = pd.read_csv(co_occurrence_file, index_col=0)
df_cleantech_cooccurrence.dropna(how='all', inplace=True)

df_cleantech_similarity = pd.read_json(similarity_file)

# Co-Occurrence Threshold
co_occurrence_threshold = 0.1

# Create a mask for the co-occurrence threshold
mask = df_cleantech_cooccurrence >= co_occurrence_threshold

# Apply mask to DataFrame
filtered_co_occurrence_df = df_cleantech_cooccurrence[mask]

# Extract keywords
co_occurrence_list = filtered_co_occurrence_df.columns[filtered_co_occurrence_df.any()].tolist()

# Processing similarity data
similarity_series = pd.concat([df_cleantech_similarity['keyword_yake_lemma'], df_cleantech_similarity['keywords_keyword_yake_bertforpatents_embedding'].explode()], ignore_index=True)

# Drop duplicates before converting to list
similarity_list = similarity_series.drop_duplicates().tolist()

# Combine and deduplicate lists
cleantech_list = list(set(co_occurrence_list + similarity_list))
cleantech_list = [str(keyword) for keyword in cleantech_list]

# Create DataFrame
df_cleantech = pd.DataFrame(cleantech_list, columns=['keyword_yake_lemma'])

In [4]:
g_epo_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_epo_cleantech_trie.csv')
g_uspto_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_uspto_cleantech_trie.csv')
g_rel_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_rel_cleantech_trie.csv')

# Delete all rows where trie is NaN or empty
g_epo_cleantech.dropna(subset=['trie'], inplace=True)
g_uspto_cleantech.dropna(subset=['trie'], inplace=True)
g_rel_cleantech.dropna(subset=['trie'], inplace=True)

# Concatenate list of strings in trie column to a single string
g_epo_cleantech['trie'] = g_epo_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_uspto_cleantech['trie'] = g_uspto_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_rel_cleantech['trie'] = g_rel_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))

# Concatenate the three DataFrames
g_cleantech = pd.concat([g_epo_cleantech, g_uspto_cleantech, g_rel_cleantech], ignore_index=True)

100%|██████████| 179387/179387 [00:12<00:00, 14111.48it/s]
100%|██████████| 515602/515602 [01:06<00:00, 7791.12it/s] 
100%|██████████| 618506/618506 [00:17<00:00, 35136.77it/s]


## Prepare Non Cleantech Data

In [5]:
g_epo_non_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_epo_non_cleantech_trie.csv')
g_uspto_non_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_uspto_non_cleantech_trie.csv')
g_rel_non_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_rel_non_cleantech_trie.csv')

# Delete all rows where trie is NaN or empty
g_epo_non_cleantech.dropna(subset=['trie'], inplace=True)
g_uspto_non_cleantech.dropna(subset=['trie'], inplace=True)
g_rel_non_cleantech.dropna(subset=['trie'], inplace=True)

# Concatenate list of strings in trie column to a single string
g_epo_non_cleantech['trie'] = g_epo_non_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_uspto_non_cleantech['trie'] = g_uspto_non_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_rel_non_cleantech['trie'] = g_rel_non_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))

# Concatenate the three DataFrames
g_non_cleantech = pd.concat([g_epo_non_cleantech, g_uspto_non_cleantech, g_rel_non_cleantech], ignore_index=True)

100%|██████████| 179012/179012 [00:12<00:00, 14538.11it/s]
100%|██████████| 599676/599676 [01:14<00:00, 8051.56it/s] 
100%|██████████| 606015/606015 [00:16<00:00, 35975.17it/s]


# Build CountVectorizer Matrix for the Sci-Kit Learn Models

In [6]:
Vectorizer = CountVectorizer(
    vocabulary = cleantech_list,
    ngram_range = (1, 4),
    # max_df = 0.5,
    # min_df = 0.01,
    stop_words='english',
    lowercase=True,
)
# scaler = StandardScaler(with_mean=False)  # with_mean=False to support sparse matrices

g_cleantech_matrix = Vectorizer.fit_transform(g_cleantech['trie'])
# g_cleantech_matrix = scaler.fit_transform(g_cleantech_matrix)

g_non_cleantech_matrix = Vectorizer.fit_transform(g_non_cleantech['trie'])
# g_non_cleantech_matrix = scaler.fit_transform(g_non_cleantech_matrix)

# Build Model

In [7]:
# Function to train and evaluate a given model
def train_evaluate_model(model, X_train, X_test, y_train, y_test, df_cleantech):
    # Train the model
    model.fit(X_train, y_train)

    # Evaluate the model
    predictions = model.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

    feature_importance = model.coef_[0]
    feature_names = Vectorizer.get_feature_names_out()
    keywords_importance = zip(feature_names, feature_importance)
    sorted_keywords = sorted(keywords_importance, key=lambda x: x[1], reverse=True)
    df_keywords_importance = pd.DataFrame(sorted_keywords, columns=['keyword_yake_lemma', 'logistic_regression_importance'])
    df_cleantech = pd.merge(df_cleantech, df_keywords_importance, on='keyword_yake_lemma', how='left')
    
    return df_cleantech

In [8]:
# Concatenate data for train_test split
X = vstack([g_cleantech_matrix, g_non_cleantech_matrix])
y = np.concatenate((np.ones(g_cleantech_matrix.shape[0]), np.zeros(g_non_cleantech_matrix.shape[0])))

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model

In [10]:
# Train and evaluate the model
model = LogisticRegression(max_iter=1000)
df_cleantech = train_evaluate_model(model, X_train, X_test, y_train, y_test, df_cleantech)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report:
               precision    recall  f1-score   support

         0.0       0.61      0.91      0.73    276531
         1.0       0.80      0.39      0.52    263109

    accuracy                           0.66    539640
   macro avg       0.71      0.65      0.63    539640
weighted avg       0.70      0.66      0.63    539640

Confusion Matrix:
 [[251235  25296]
 [160873 102236]]


In [12]:
# Print first 25 entries sorted by importance
print(df_cleantech.sort_values(by='logistic_regression_importance', ascending=True).head(25))

              keyword_yake_lemma  logistic_regression_importance
1241              fuel cell fuel                       -1.326671
2233           fuel cell segment                       -1.124320
531            microbial biomass                       -0.873696
2333            slope efficiency                       -0.810049
8                       dyestuff                       -0.747549
1423    abiotic stress tolerance                       -0.726329
2503   sewage treatment facility                       -0.696200
1607                   adversary                       -0.668568
295   reactive power compensator                       -0.617369
997                algal biomass                       -0.580287
1439  single crystalline silicon                       -0.555670
1173                        hals                       -0.540860
930               abiotic stress                       -0.499333
1728                caprolactone                       -0.495766
13        turbine electri

In [13]:
len(df_cleantech)

2796