In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.preprocessing import StandardScaler

tqdm.pandas()

# Build Matrices for the Sci-Kit Learn Models

## Prepare Cleantech Data

In [2]:
### Prepare Cleantech Data
# Co-Occurrence Directory
# co_occurrence_dir = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/'
# co_occurrence_files = glob.glob(co_occurrence_dir + '*.csv')
co_occurrence_files = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel_ids_semantic_similarity_02.csv'

# Similarity Directory
# similarity_dir = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/'
# similarity_files = glob.glob(similarity_dir + '*.json')
similarity_files = '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/df_keyword_titles_cosine_similarity_radius_025_neighbors_100_noun_chunks.json'

# Co-Occurrence Threshold
# co_occurrence_threshold = [0.01, 0.025, 0.05, 0.1, 0.15]
co_occurrence_threshold = [0.01]

In [3]:
# Load the data
df_cleantech_cooccurrence = pd.read_csv(co_occurrence_files, index_col=0)
df_cleantech_cooccurrence.dropna(how='all', inplace=True)

df_cleantech_similarity = pd.read_json(similarity_files)

# Co-Occurrence Threshold
co_occurrence_threshold = 0.01  # Assuming you are using a single threshold value

# Create a mask for the co-occurrence threshold
mask = df_cleantech_cooccurrence.applymap(lambda x: x >= co_occurrence_threshold)

# Apply mask to DataFrame
filtered_co_occurrence_df = df_cleantech_cooccurrence[mask]

# Extract keywords
co_occurrence_list = filtered_co_occurrence_df.columns[filtered_co_occurrence_df.any()].tolist()

# Processing similarity data
similarity_series = pd.concat([df_cleantech_similarity['keyword_yake_lemma'], df_cleantech_similarity['keywords_keyword_yake_bertforpatents_embedding'].explode()], ignore_index=True)
similarity_list = similarity_series.drop_duplicates().tolist()

# Combine and deduplicate lists
cleantech_list = list(set(co_occurrence_list + similarity_list))
cleantech_list = [str(keyword) for keyword in cleantech_list]

# # Create DataFrame
df_cleantech = pd.DataFrame(cleantech_list, columns=['keyword_yake_lemma'])
# df_cleantech['cleantech'] = 1

del df_cleantech_cooccurrence
del df_cleantech_similarity
del co_occurrence_list
del similarity_list

  mask = df_cleantech_cooccurrence.applymap(lambda x: x >= co_occurrence_threshold)


In [4]:
g_epo_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_epo_cleantech_trie.csv')
g_uspto_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_uspto_cleantech_trie.csv')
g_rel_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_rel_cleantech_trie.csv')

In [5]:
# Delete all rows where trie is NaN or empty
g_epo_cleantech.dropna(subset=['trie'], inplace=True)
g_uspto_cleantech.dropna(subset=['trie'], inplace=True)
g_rel_cleantech.dropna(subset=['trie'], inplace=True)

In [6]:
# Concatenate list of strings in trie column to a single string
g_epo_cleantech['trie'] = g_epo_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_uspto_cleantech['trie'] = g_uspto_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_rel_cleantech['trie'] = g_rel_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))

100%|██████████| 179468/179468 [00:13<00:00, 12888.90it/s]
100%|██████████| 515584/515584 [01:11<00:00, 7182.33it/s]
100%|██████████| 618653/618653 [00:17<00:00, 34846.83it/s]


In [7]:
# Concatenate the three DataFrames
g_cleantech = pd.concat([g_epo_cleantech, g_uspto_cleantech, g_rel_cleantech], ignore_index=True)

## Prepare Non Cleantech Data

In [8]:
g_epo_non_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_epo_non_cleantech_trie.csv')
g_uspto_non_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_uspto_non_cleantech_trie.csv')
g_rel_non_cleantech = pd.read_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/TFIDF Matrices/g_rel_non_cleantech_trie.csv')

In [9]:
# Delete all rows where trie is NaN or empty
g_epo_non_cleantech.dropna(subset=['trie'], inplace=True)
g_uspto_non_cleantech.dropna(subset=['trie'], inplace=True)
g_rel_non_cleantech.dropna(subset=['trie'], inplace=True)

In [10]:
# Concatenate list of strings in trie column to a single string
g_epo_non_cleantech['trie'] = g_epo_non_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_uspto_non_cleantech['trie'] = g_uspto_non_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))
g_rel_non_cleantech['trie'] = g_rel_non_cleantech['trie'].progress_apply(lambda x: ' '.join(eval(x)))

100%|██████████| 179091/179091 [00:13<00:00, 13338.35it/s]
100%|██████████| 599654/599654 [01:19<00:00, 7518.83it/s] 
100%|██████████| 606097/606097 [00:16<00:00, 36052.13it/s]


In [11]:
# Concatenate the three DataFrames
g_non_cleantech = pd.concat([g_epo_non_cleantech, g_uspto_non_cleantech, g_rel_non_cleantech], ignore_index=True)

# Build CountVectorizer Matrix for the Sci-Kit Learn Models

In [12]:
Vectorizer = CountVectorizer(
    vocabulary = cleantech_list,
    ngram_range = (1, 4),
    # max_df = 0.5,
    # min_df = 0.01,
    stop_words='english',
    lowercase=True,
)
# scaler = StandardScaler(with_mean=False)  # with_mean=False to support sparse matrices

g_cleantech_matrix = Vectorizer.fit_transform(g_cleantech['trie'])
# g_cleantech_matrix = scaler.fit_transform(g_cleantech_matrix)

g_non_cleantech_matrix = Vectorizer.fit_transform(g_non_cleantech['trie'])
# g_non_cleantech_matrix = scaler.fit_transform(g_non_cleantech_matrix)

# Build Models

In [13]:
# Function to train and evaluate a given model
def train_evaluate_model(model, X_train, X_test, y_train, y_test, df_cleantech):
    # Train the model
    model.fit(X_train, y_train)

    # Evaluate the model
    predictions = model.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

    # For Logistic Regression, display the feature importance
    if isinstance(model, LogisticRegression):
        feature_importance = model.coef_[0]
        feature_names = Vectorizer.get_feature_names_out()
        keywords_importance = zip(feature_names, feature_importance)
        sorted_keywords = sorted(keywords_importance, key=lambda x: x[1], reverse=True)
        df_keywords_importance = pd.DataFrame(sorted_keywords, columns=['keyword_yake_lemma', 'logistic_regression_importance'])
        df_cleantech = pd.merge(df_cleantech, df_keywords_importance, on='keyword_yake_lemma', how='left')
    # For RandomForestClassifier, display the feature importance
    elif isinstance(model, RandomForestClassifier):
        feature_importance = model.feature_importances_
        feature_names = Vectorizer.get_feature_names_out()
        keywords_importance = zip(feature_names, feature_importance)
        sorted_keywords = sorted(keywords_importance, key=lambda x: x[1], reverse=True)
        df_keywords_importance = pd.DataFrame(sorted_keywords, columns=['keyword_yake_lemma', 'random_forest_importance'])
        df_cleantech = pd.merge(df_cleantech, df_keywords_importance, on='keyword_yake_lemma', how='left')
    # For GradientBoostingClassifier, display the feature importance
    elif isinstance(model, GradientBoostingClassifier):
        feature_importance = model.feature_importances_
        feature_names = Vectorizer.get_feature_names_out()
        keywords_importance = zip(feature_names, feature_importance)
        sorted_keywords = sorted(keywords_importance, key=lambda x: x[1], reverse=True)
        df_keywords_importance = pd.DataFrame(sorted_keywords, columns=['keyword_yake_lemma', 'gradient_boosting_importance'])
        df_cleantech = pd.merge(df_cleantech, df_keywords_importance, on='keyword_yake_lemma', how='left')
    
    return df_cleantech

In [14]:
# Concatenate data for train_test split
X = vstack([g_cleantech_matrix, g_non_cleantech_matrix])
y = np.concatenate((np.ones(g_cleantech_matrix.shape[0]), np.zeros(g_non_cleantech_matrix.shape[0])))

In [15]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Models

In [23]:
# Choose your model
model_choice = 'random_forest' # Can be 'logistic_regression', 'random_forest', or 'gradient_boosting'

if model_choice == 'logistic_regression':
    model = LogisticRegression(max_iter=5000)
elif model_choice == 'random_forest':
    model = RandomForestClassifier(n_estimators=100, n_jobs=12)
elif model_choice == 'gradient_boosting':
    model = GradientBoostingClassifier(n_estimators=500)

In [24]:
# Train and evaluate the model
df_cleantech = train_evaluate_model(model, X_train, X_test, y_train, y_test, df_cleantech)

KeyboardInterrupt: 

In [18]:
df_cleantech.head()

Unnamed: 0,keyword_yake_lemma,logistic_regression_importance
0,expanded supercritical fluid,0.349242
1,gaseous pyrolysis product,-0.187071
2,renewable energy technology,0.119731
3,powertrain system,0.0
4,plant growth,0.063515
