## Import packages

In [2]:
import functools
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from bs4 import BeautifulSoup
from keras_tuner import HyperModel
from keras_tuner.tuners import RandomSearch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm
2025-03-06 07:33:02.562392: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Create definitions to preprocess the text and model the relevant data

In [3]:
# Strip HTML markup, keeping only the text content
def remove_html_tags(text):
    """Return *text* with its HTML tags removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    text extracted from the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

@functools.lru_cache(maxsize=1)
def _text_resources():
    """Build the stop-word set, punctuation set and lemmatizer once.

    preprocess_text is applied to every row of the DataFrame; caching these
    objects avoids re-reading the NLTK stop-word list on each call.
    """
    return (
        frozenset(stopwords.words('english')),
        frozenset(string.punctuation),
        WordNetLemmatizer(),
    )


# Remove stopwords, punctuation, and convert to lowercase
def preprocess_text(text):
    """Clean a raw title/abstract string for tokenization.

    Steps, in order: strip HTML tags, replace hyphens with spaces, delete
    every character that is neither a word character nor whitespace,
    tokenize with NLTK, lowercase, drop English stop words and bare
    punctuation tokens, lemmatize, and re-join with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    stop_words, punctuation, lemmatizer = _text_resources()

    # Remove HTML tags
    text = remove_html_tags(text)

    # Replace hyphens between words with a space so "rust-proof" becomes two
    # tokens instead of being fused into "rustproof" by the regex below.
    text = text.replace('-', ' ')
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize, lowercase, and filter. '_' survives the regex above (it is a
    # word character), so the punctuation check is still needed.
    kept = []
    for word in word_tokenize(text):
        lowered = word.lower()
        if lowered not in stop_words and word not in punctuation:
            kept.append(lemmatizer.lemmatize(lowered))

    # Join tokens back into a single string
    return ' '.join(kept)
# Load the BERT tokenizer
# max_length: length every document is padded/truncated to when tokenized; it
#   is also the input shape of the model built in MyHyperModel.
# vocab_size: input_dim of the model's Embedding layer.
#   NOTE(review): 30522 is presumably the bert-base-uncased vocabulary size - confirm.
max_length = 256
vocab_size = 30522
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_and_batch(train, train_labels, tokenizer, batch_size, shuffle_buffer_size, max_length):
    """Tokenize texts and wrap them with their labels in a batched tf.data.Dataset.

    Args:
        train: Iterable of text strings to tokenize.
        train_labels: Labels aligned one-to-one with ``train``.
        tokenizer: Tokenizer exposing ``encode_plus`` (here a BertTokenizer).
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.
        max_length: Every text is padded or truncated to this many token ids.

    Returns:
        A shuffled, batched ``tf.data.Dataset`` yielding
        ``(input_ids, labels)`` pairs.
    """
    # Only the token ids feed the model; the attention mask is not needed
    # because the Embedding layer is built with mask_zero=True.
    input_ids = [
        tokenizer.encode_plus(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )['input_ids']
        for text in train
    ]

    # Convert the tokens into tensors, then shuffle and batch them
    dataset = tf.data.Dataset.from_tensor_slices((input_ids, train_labels))
    dataset = dataset.shuffle(shuffle_buffer_size)
    return dataset.batch(batch_size)

class MyHyperModel(HyperModel):
    """Keras Tuner search space for a small embedding + dense text classifier.

    Tuned hyperparameters: embedding width, number of dense layers (1-3),
    units and activation of each dense layer, whether each dense layer is
    followed by dropout, and the Adam learning rate.
    """

    def build(self, hp):
        """Build and compile one candidate model.

        Args:
            hp: Keras Tuner HyperParameters object used to sample the values.

        Returns:
            A compiled ``keras.Sequential`` model that takes ``max_length``
            int32 token ids and outputs a single logit.
        """
        model = keras.Sequential()
        model.add(layers.Input(shape=(max_length,), dtype='int32'))
        # mask_zero=True makes Keras treat id 0 as padding so padded positions
        # are excluded from the pooled average.
        # NOTE(review): assumes the tokenizer pads with id 0 - confirm.
        model.add(layers.Embedding(input_dim=vocab_size,
                                   output_dim=hp.Int('embed_dim', min_value=4, max_value=128),
                                   mask_zero=True))
        # Collapse the sequence axis to one vector per document
        model.add(layers.GlobalAveragePooling1D())

        # Add 1 to 3 dense layers
        for i in range(hp.Int('num_dense_layers', min_value=1, max_value=3)):
            model.add(layers.Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512),
                                   activation=hp.Choice('activation_' + str(i), values=['sigmoid', 'relu', 'tanh'])))

            # Optionally add a dropout layer after this dense layer
            if hp.Boolean('dropout_' + str(i)):
                model.add(layers.Dropout(rate=0.2))

        # Final dense layer with 1 node and no activation: the model emits a
        # raw logit, matched by from_logits=True and threshold=0.0 below.
        model.add(layers.Dense(units=1))
        learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

        # Compile the model. `metrics` is passed as a list, the documented
        # form of the argument; a bare metric object is not accepted by every
        # Keras version.
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                      metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])
        return model

## Train the 500 models and save the best 100
### Our sampling strategy involves taking all the relevant documents and pairing them with an equal number of randomly sampled non-relevant documents. We repeat this for 10 total datasets using 10 different random states to ensure a different mix in each training set. 20% of each dataset is then split off and used as the validation set during training.

### Set the current training round

In [10]:
# Current training round; it is interpolated into the input file name here and
# into the CSV, tuner project, model, and output column/file names in later cells.
# NOTE(review): `round` shadows Python's builtin round() for the rest of the
# notebook; it is left as-is because the later cells reference this name.
round = 2
df = pd.read_excel(f'R{round}_review_for_retraining.xlsx')

In [None]:
# Preprocess the text if needed: clean title and abstract separately (missing
# values become empty strings), then join them with a single space.
clean_titles = df['title'].fillna('').apply(preprocess_text)
clean_abstracts = df['Abstracts'].fillna('').apply(preprocess_text)
df['clean_TA'] = clean_titles + ' ' + clean_abstracts


In [8]:
# Build balanced training/validation sets from the labelled documents, then run
# a Keras Tuner random search for each random seed and save its 10 best models.

# Keep only rows that have a 'Relevant' label. The explicit .copy() makes
# data_TA an independent frame, so the relabelling below does not write into a
# slice of df (guards against pandas' SettingWithCopy warning).
data_TA = df.dropna(subset=['Relevant']).copy()

# Raw labels: 5 marks a relevant document, 1 a non-relevant one.
# Both classes are sampled down to the size of the smaller class.
relevant_count = int((data_TA['Relevant'] == 5).sum())
nonrelevant_count = int((data_TA['Relevant'] == 1).sum())
if relevant_count < nonrelevant_count:
    sample_size = relevant_count
    print('Less relevant docs', sample_size)
else:
    sample_size = nonrelevant_count
    print('Less nonrelevant docs', sample_size)

# Recode to binary targets. Order matters: 1 -> 0 must run before 5 -> 1,
# otherwise the freshly written 1s would be turned into 0s as well.
data_TA.loc[data_TA['Relevant'] == 1, 'Relevant'] = 0
data_TA.loc[data_TA['Relevant'] == 5, 'Relevant'] = 1

# Fixed seeds (originally drawn with random.sample(range(1, 101), 10)) so the
# ten datasets are reproducible; the prediction cell loads models by these seeds.
random_numbers = [15, 32, 69, 75, 83, 8, 68, 99, 44, 92]

# Make sure the output folders exist before anything is written to them
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)

# Create a training and validation set for each seed and tune on it
for num in random_numbers:
    # Select sample_size rows of each class and shuffle them together
    relevant_1_rows = data_TA[data_TA['Relevant'] == 1].sample(n=sample_size, random_state=num)
    relevant_0_rows = data_TA[data_TA['Relevant'] == 0].sample(n=sample_size, random_state=num)
    balanced_data = pd.concat([relevant_0_rows, relevant_1_rows]).sample(frac=1, random_state=num).reset_index(drop=True)

    # Split off 20% of each class for a (still balanced) validation set
    valnotrel = balanced_data.loc[balanced_data['Relevant'] == 0].sample(frac=0.2, random_state=num)
    valrel = balanced_data.loc[balanced_data['Relevant'] == 1].sample(frac=0.2, random_state=num)
    raw_val = pd.concat([valnotrel, valrel]).sample(frac=1, random_state=num)

    # Everything that was not drawn for validation is training data
    raw_train = balanced_data.drop(raw_val.index).reset_index(drop=True)

    # Save the training and validation sets to CSV so a run can be reloaded
    # later with pd.read_csv instead of being re-sampled.
    raw_train[['Relevant', 'clean_TA']].to_csv(f'data/train__R{round}_{num}.csv', index=False)
    raw_val[['Relevant', 'clean_TA']].to_csv(f'data/val_R{round}_{num}.csv', index=False)

    # Tokenize the data
    train_data = tokenize_and_batch(raw_train['clean_TA'],
                                    raw_train['Relevant'],
                                    tokenizer,
                                    batch_size=10,
                                    shuffle_buffer_size=10000,
                                    max_length=256)
    val_data = tokenize_and_batch(raw_val['clean_TA'],
                                  raw_val['Relevant'],
                                  tokenizer=tokenizer,
                                  batch_size=10,
                                  shuffle_buffer_size=10000,
                                  max_length=256)

    # Set up and train the models
    hypermodel = MyHyperModel()

    tuner = RandomSearch(
        hypermodel,
        objective='val_binary_accuracy',
        max_trials=50,
        directory='Tuner_delete_me',
        project_name=f'R{round}_{num}'
    )
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.75,
        patience=3,
        min_lr=1e-7
    )

    # No batch_size here: both datasets are already batched by
    # tokenize_and_batch, and Keras documents that batch_size must not be
    # given when the input is a dataset.
    tuner.search(train_data,
                 epochs=50,
                 validation_data=val_data,
                 callbacks=[early_stopping, reduce_lr])

    # Keep the 10 best models of this seed (10 seeds x 10 models = 100 models)
    best_model = tuner.get_best_models(num_models=10)
    for i, model in enumerate(best_model):
        model.save(f'models/R{round}_model_{num}_{i}.keras')

Trial 50 Complete [00h 00m 07s]
val_binary_accuracy: 0.890625

Best val_binary_accuracy So Far: 0.921875
Total elapsed time: 00h 14m 30s


## Predict all of the articles using 100 models and sort the articles by decreasing consensus
### this process will pull more relevant articles to the top of the list for review

In [12]:
# Predict df using all 100 models (10 seeds x 10 saved models per seed)
random_numbers = [15, 32, 69, 75, 83, 8, 68, 99, 44, 92]

# NOTE(review): targets is not used in this cell; kept in case a later cell reads it.
targets = df['Relevant'].values

# Tokenize every document exactly as in training; only the token ids are
# needed as model input.
input_ids = [
    tokenizer.encode_plus(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )['input_ids']
    for text in df['clean_TA']
]
features = np.array(input_ids)
print('All data tokenized using tokenizer', tokenizer)

predictions = []
for num in random_numbers:
    for i in range(10):
        model = keras.models.load_model(f'models/R{round}_model_{num}_{i}.keras')
        logits = model.predict(features, batch_size=10)
        # The models output raw logits, so logit >= 0 is a vote for "relevant".
        # ravel() flattens the (n_docs, 1) output so each model contributes a
        # plain 1-D vote vector.
        predictions.append(np.where(logits >= 0, 1, 0).ravel())

# votes has one row per model and one column per document
votes = np.stack(predictions)
sum_predictions = votes.sum(axis=0)

# Calculate uncertainty as 1 divided by |number of 1 votes minus number of
# 0 votes|; the 1e-6 term avoids division by zero on an exact tie.
count_0 = np.count_nonzero(votes == 0, axis=0)

df[f'R{round} Uncertanty'] = 1 / (np.abs(sum_predictions - count_0)+10**(-6))
df[f'R{round} Prediction score'] = sum_predictions
df = df.sort_values(by=f'R{round} Prediction score', ascending=False)
df.to_excel(f'R{round}_predictions_for_review.xlsx', index=False)
df.head()


Unnamed: 0,Master patent number,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link,Abstracts,Relevant,Round,clean_TA,Uncertanty,Prediction score
424,CN-111925718-A,Universal primer for nonferrous metal and prep...,_×¥_¬¶¬½ùÏâó_ê_â_ñ¾óØ_¦_¶_,"_öóÒâ__¬, Ï__ãóñ, __Ô¬__, _ù·_ó¦, ó...",8/4/20,8/4/20,11/13/20,,https://patents.google.com/patent/CN111925718A/en,,\nThe invention provides a universal primer fo...,,,universal primer nonferrous metal preparation ...,0.01,100
2264,JP-S58201860-A,Rust preventing coating material composition,"Mitsui Toatsu Chem Inc, __¬¾óñ_ØÏ_×Ñ___óÒ¯_...","Yoshio Kikuta, _à­_êê___à , Toashi Kishi, ...",5/18/82,5/18/82,11/24/83,,https://patents.google.com/patent/JPS58201860A/en,,\nPURPOSE:The titled novel composition consist...,5.0,1.0,rust preventing coating material composition p...,0.01,100
2333,JP-S6072954-A,Long-period rustproof coating composition,"Dainippon Toryo Co Ltd, __Ïó¥ïóØ¶_µ¥óÑ¦óÒ¯___...","Toshio Shinohara, Ò__ë_êêªà_íÆ, Toshimiki T...",9/30/83,9/30/83,4/25/85,,https://patents.google.com/patent/JPS6072954A/en,,\nPURPOSE:The titled composition which can for...,,,long period rustproof coating composition purp...,0.01,100
2335,JP-S6078672-A,Corrosion-proof painting method of metal surfa...,"Mitsui Eng & Shipbuild Co Ltd, __¬¾_êÒ_óÒ¯...","Satoru Nishimoto, óë ï·óØ¶",10/4/83,10/4/83,5/4/85,,https://patents.google.com/patent/JPS6078672A/en,,\nPURPOSE:To apply corrosion-proof coating to ...,,,corrosion proof painting method metal surface ...,0.01,100
394,CN-110157289-A,Watersoluble plumbago alkene anticorrosive paint,_ññ_Ø___óöë__·Ïâó_êóØ_¦_¶_,"__Òó__, __óöï__",3/28/18,3/28/18,8/23/19,,https://patents.google.com/patent/CN110157289A/en,,\nThe present invention relates to a kind of w...,,,watersoluble plumbago alkene anticorrosive pai...,0.01,100


## New code to remove the need for keras tuner
### Keras Tuner is very nice, but it requires saving the tuner, which can make deployment more difficult

In [None]:
import tensorflow as tf
import numpy as np
import random
import os

# Define the model
def create_model(learning_rate, num_units):
    """Build and compile a one-hidden-layer network.

    Args:
        learning_rate: Learning rate for the Adam optimizer.
        num_units: Width of the hidden ReLU layer.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single linear output.
    """
    # NOTE(review): this uses mean squared error, whereas the Keras Tuner
    # model in this notebook uses binary cross-entropy on logits - confirm
    # which loss is intended.
    network = tf.keras.Sequential()
    network.add(tf.keras.layers.Dense(num_units, activation='relu'))
    network.add(tf.keras.layers.Dense(1))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(optimizer=optimizer, loss='mean_squared_error')
    return network

# Define the hyperparameter ranges
# Candidate values that sample_hyperparameters() draws from, one value per list
# per call: the Adam learning rate and the hidden-layer width for create_model().
learning_rates = [0.01, 0.001, 0.0001, 0.00001]
num_units_list = [32, 64, 128, 256, 512]

# Function to randomly sample hyperparameters
def sample_hyperparameters():
    """Draw one random (learning_rate, num_units) pair.

    Each value is picked independently with ``random.choice`` from the
    module-level ``learning_rates`` and ``num_units_list``.
    """
    return random.choice(learning_rates), random.choice(num_units_list)

# Folder the winning models are written to
save_dir = 'best_models'
os.makedirs(save_dir, exist_ok=True)

# Random search: train num_combinations randomly configured models and keep a
# running top 10, ranked by the value model.evaluate returns on the validation
# data (the loss, since create_model compiles no metrics; lower is better).
# NOTE(review): X_train, y_train, X_val and y_val are not defined in the
# visible part of the notebook; they must exist before this cell runs.
num_combinations = 50
best_models = []

for i in range(num_combinations):
    learning_rate, num_units = sample_hyperparameters()
    model = create_model(learning_rate, num_units)
    model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), verbose=0)
    score = model.evaluate(X_val, y_val, verbose=0)

    # Record this candidate, then trim the list back to the 10 lowest scores
    best_models.append((score, model, {'learning_rate': learning_rate, 'num_units': num_units}))
    best_models.sort(key=lambda entry: entry[0])
    del best_models[10:]

# Write the surviving models to disk, best first, and report each one
for i, (score, model, params) in enumerate(best_models):
    model_path = os.path.join(save_dir, f"model_{i+1}.h5")
    model.save(model_path)
    print(f"Model {i+1}: Score = {score}, Hyperparameters = {params}, Saved at = {model_path}")