In [1]:
import os
import re
import json
import pandas as pd
import tomotopy as tp
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords, wordnet

In [2]:
# Load data
# NOTE: machine-specific location of the PatentsView Y02 extracts; point DATA_DIR at your own copy.
DATA_DIR = '/Users/juergenthiesen/Documents/Patentsview/Cleantech CSV Files'
df_patents = pd.read_csv(os.path.join(DATA_DIR, 'g_patent_Y02.csv'))
df_cpc = pd.read_csv(os.path.join(DATA_DIR, 'g_cpc_current_Y02.csv'))

# Keep only CPC rows whose class contains 'Y02'.
# na=False makes rows with a missing cpc_class count as "no match" instead of
# producing a NaN mask, which boolean indexing rejects.
df_cpc = df_cpc[df_cpc['cpc_class'].str.contains('Y02', na=False)]

# Merge on patent_id; a patent with several CPC rows appears once per row
df = pd.merge(df_patents, df_cpc, on='patent_id', how='left')

# Drop rows where title or abstract is missing
df = df.dropna(subset=['patent_title', 'patent_abstract'])

# Alternative (disabled): join title and abstract with an explicit [SEP] token
# df['patent_title_abstract'] = df['patent_title'] + ' [SEP] ' + df['patent_abstract']

# Join title and abstract into one text field
df['patent_title_abstract'] = df['patent_title'] + ' ' + df['patent_abstract']

# Keep one row per (patent_id, cpc_subclass) pair
df = df.drop_duplicates(subset=['patent_id', 'cpc_subclass'])

# Size of the smallest cpc_subclass; every subclass is down-sampled to this size
min_count = df['cpc_subclass'].value_counts().min()

# Balance the subclasses: draw min_count rows from each one.
# Sampling every group explicitly (same seed per group, groups in sorted key
# order) avoids relying on groupby.apply() handing the grouping column to the
# callback, and keeps a flat index instead of a (cpc_subclass, row) MultiIndex.
df = pd.concat(
    [
        subclass_rows.sample(min_count, random_state=42)
        for _subclass, subclass_rows in df.groupby('cpc_subclass')
    ]
)

# Collapse to one row per patent_id, collecting all of its sampled cpc_subclasses in a list
df = df.groupby('patent_id').agg({'patent_title_abstract': 'first', 'cpc_subclass': lambda x: list(x)})

# Alternative (disabled): single integer label per row, 0..7 for Y02A, Y02B, Y02C, Y02D, Y02E, Y02P, Y02T, Y02W
# df['label'] = df['cpc_subclass'].map({'Y02A': 0, 'Y02B': 1, 'Y02C': 2, 'Y02D': 3, 'Y02E': 4, 'Y02P': 5, 'Y02T': 6, 'Y02W': 7})

In [4]:
# 80/20 train/test split: draw the training rows at random (fixed seed),
# then keep every row whose index label was not drawn as the test set.
df_train = df.sample(frac=0.8, random_state=42)
in_train = df.index.isin(df_train.index)
df_test = df.loc[~in_train]

def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the WordNet POS constant used by the lemmatizer.

    Only the first character of ``tag`` is inspected: 'J' maps to adjective,
    'V' to verb and 'R' to adverb. Every other tag, including an empty one,
    falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()  # disabled alternative to lemmatization

# Build the stopword lookup once: calling stopwords.words() inside the token
# filter re-creates the list and scans it linearly for every single token.
english_stopwords = set(stopwords.words('english'))

# Initialize one PLDA model per term-weighting scheme (IDF, ONE, PMI);
# all three receive exactly the same documents and labels below.
PLDA_model_IDF = tp.PLDAModel(tw=tp.TermWeight.IDF, topics_per_label=10, seed=42, latent_topics=1)
PLDA_model_ONE = tp.PLDAModel(tw=tp.TermWeight.ONE, topics_per_label=10, seed=42, latent_topics=1)
PLDA_model_PMI = tp.PLDAModel(tw=tp.TermWeight.PMI, topics_per_label=10, seed=42, latent_topics=1)

# Add documents to the models
for index, row in tqdm(df_train.iterrows(), total=len(df_train)):
    # Lowercase the text. The data frame built in the loading cell only carries
    # 'patent_title_abstract' and 'cpc_subclass'; the previously referenced
    # 'claim_fulltext' column is never created there and raised a KeyError.
    clean_document = row['patent_title_abstract'].lower()

    ### Regex ###
    # Replace '-' and '/' with spaces
    clean_document = re.sub(r'[-/]', ' ', clean_document)
    # Remove punctuation
    clean_document = re.sub(r'[^\w\s]', '', clean_document)
    # Remove numbers
    clean_document = re.sub(r'\d+', '', clean_document)
    # Disabled: remove words of 1 to 3 characters
    # clean_document = re.sub(r'\b\w{1,3}\b', '', clean_document)
    # Collapse runs of whitespace
    clean_document = re.sub(r'\s+', ' ', clean_document)
    # Replace every remaining non-ASCII-letter character with a space
    clean_document = re.sub(r'[^a-zA-Z]', ' ', clean_document)
    # Tokenize on whitespace and remove stopwords
    clean_document = [token for token in clean_document.split() if token not in english_stopwords]
    # Lemmatize words using their part-of-speech tag
    pos_tagged_tokens = nltk.pos_tag(clean_document)
    clean_document = [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tagged_tokens]
    # Disabled: stem words
    # clean_document = [stemmer.stem(token) for token in clean_document]
    # Labels: the list of cpc_subclasses collected for this patent
    label = row['cpc_subclass']
    # Add document and labels to each model
    PLDA_model_IDF.add_doc(clean_document, labels=label)
    PLDA_model_ONE.add_doc(clean_document, labels=label)
    PLDA_model_PMI.add_doc(clean_document, labels=label)

40617it [05:14, 129.19it/s]


In [8]:
# Train and save the IDF-weighted PLDA model
# (API reference: https://bab2min.github.io/tomotopy/v/en/#tomotopy.PLDAModel)
# Term weight options: IDF, ONE (equally weighted), PMI (Pointwise Mutual Information)

TOTAL_ITERATIONS = 100
ITERATIONS_PER_ROUND = 10
MODEL_DIR = '/Users/juergenthiesen/Documents/Patentsview/Cleantech Concepts'

PLDA_model_IDF.burn_in = 5
print('Start training model:')
# Train in rounds of ITERATIONS_PER_ROUND so progress could be reported between rounds
for i in range(0, TOTAL_ITERATIONS, ITERATIONS_PER_ROUND):
    PLDA_model_IDF.train(iter=ITERATIONS_PER_ROUND, workers=0)
    # Optional progress report:
    # print('Iteration: {}\tLog-likelihood: {}'.format(i, PLDA_model_IDF.ll_per_word))

# Print the model summary (basic info, training info, parameters)
PLDA_model_IDF.summary()

# Persist the trained model to disk
PLDA_model_IDF.save(os.path.join(MODEL_DIR, 'PLDA_IDF_model_cleantech_concepts.bin'))

Start training model:
<Basic Info>
| PLDAModel (current version: 0.12.5)
| 40617 docs, 2967403 words
| Total Vocabs: 36754, Used Vocabs: 36754
| Entropy of words: 7.47317
| Entropy of term-weighted words: 8.31698
| Removed Vocabs: <NA>
| Label of docs and its distribution
|  Y02A: 5120
|  Y02B: 5157
|  Y02P: 5156
|  Y02E: 5147
|  Y02D: 5195
|  Y02W: 5073
|  Y02C: 5172
|  Y02T: 5159
|
<Training Info>
| Iterations: 100, Burn-in steps: 5
| Optimization Interval: 10
| Log-likelihood per word: -7.49045
|
<Initial Parameters>
| tw: TermWeight.PMI
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| latent_topics: 1 (the number of latent topics, which are shared to all documents, between 1 ~ 32767)
| topics_per_label: 10 (the number of topics per label between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetri

  PLDA_model_PMI.train(iter=10, workers=0)
