# 4.1.2 Loading the trained model and predicting relevancy of new entries

In [1]:
import numpy as np
import pandas as pd
import os
import re
import shutil
import string
import tensorflow as tf
import pickle

2024-02-12 10:50:13.554212: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Display the TensorFlow version this notebook is running with
tf.__version__

'2.11.0'

## Load the patent file and sample 100 unranked entries

In [3]:
# Load the scraped patent database (a CSV file)
Database_file = 'TACScrapedClean.csv'

# NOTE(review): the 'latin' encoding is kept as-is — confirm it matches how the file was written
raw_data = pd.read_csv(Database_file, encoding='latin')

# Combine title and abstract into a single text field.
# Both sides are NaN-filled: a missing title would otherwise turn the whole
# concatenation into NaN, which is not a valid string for the tensor conversion below.
raw_data['Title_Abstract'] = raw_data['title'].fillna('') + raw_data['Abstracts'].fillna('')

# Keep only the combined text and the relevancy label
data_TA = raw_data[['Title_Abstract', 'Relevant']]

# Isolate the unranked entries (rows with no 'Relevant' value)
data_unranked = data_TA[data_TA['Relevant'].isna()]

data_TA_only = data_unranked['Title_Abstract']

# Fixed random_state so the same 100 entries are drawn on every run
data_sample = data_TA_only.sample(n=100, random_state = 42)

# Convert the sampled texts to a tensor (no batching is applied here)
raw_sample_ds = tf.convert_to_tensor(data_sample)


2024-02-12 10:51:47.004201: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-12 10:51:47.028401: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [4]:
# Preview the first 20 unranked rows
data_unranked.head(20)

Unnamed: 0,Title_Abstract,Relevant
6,Method of stabilizing metal pigments against g...,
8,Aqueous epoxy resin curing composition \r\r bi...,
9,Coating compositions based on polyesters with ...,
10,Powder coating composition with coalescing and...,
11,Anti-contact paint composition,
17,"A multilayer structure, and a method for makin...",
18,A curing agent \rPURPOSE: A diamine hardener i...,
19,Curable compositions and their use as coatings...,
20,Coating composition of a glycidyl acrylic poly...,
21,Methods for electrocoating a metallic substrat...,


In [5]:
# Check the shape of the sample tensor
raw_sample_ds.shape

TensorShape([100])

In [6]:
# Inspect the first element of the sample tensor
raw_sample_ds[0]

<tf.Tensor: shape=(), dtype=string, numpy=b'Thermosetting coating material and coating method \r<P>PROBLEM TO BE SOLVED: To obtain a thermosetting coating material which has an irregularity-filling function and rustproof and beautifying functions and can form a thick coated film.  <P>SOLUTION: After a metallic material 1 of a material to be coated is provided with chemical treatment or mechanical pretreatment to render the surface state suitable for coating, a coating film having a thickness of 10-250 \xce\xbcm is formed with a thermosetting coating material comprising 30-45 wt.% acrylic resin varnish, 10-20 wt.% melamine resin varnish, 5-15 wt.% epoxy resin varnish, silicon dioxide, barium sulfate, calcium carbonate, bentonite, and a monocarboxylic acid amide organic substance and is heat-cured at 150 to 190\xc2\xb0C to form a thermoset coated film 2.  <P>COPYRIGHT: (C)2008,JPO&INPIT  \r'>

## Load vectorizer and model and apply to the data

In [6]:
def custom_standardization(input_data):
    """Lowercase the text, replace carriage returns with spaces, and remove punctuation.

    NOTE(review): presumably this has to match the standardization used when the
    vectorizer was fitted — confirm before changing any step.
    """
    # Character class matching every character in string.punctuation
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '\r', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [7]:
# Not used in this cell.
# NOTE(review): presumably these mirror the values used when the vectorizer was built — confirm.
max_features = 10000
sequence_length = 500

# Restore the saved TextVectorization layer from its pickled config and weights.
# NOTE(security): pickle.load can execute arbitrary code — only load files from a trusted source.
# The `with` block closes the file handle once the pickle has been read.
with open('3.1.2ClassifierVectorizeLayer.pkl', "rb") as vectorizer_pickle:
    file = pickle.load(vectorizer_pickle)

vectorize_layer = tf.keras.layers.TextVectorization.from_config(file['config'])
vectorize_layer.set_weights(file['weights'])
# Load the vocabulary from the saved vocabulary text file
vectorize_layer.set_vocabulary('3.1.2.vocab.txt')

In [8]:
# Sanity check: run one title+abstract through the standardizer and the vectorizer
first_patent = raw_sample_ds[0]
print("First patent TA", first_patent)

standardized_patent = custom_standardization(first_patent)
print("First patent Standardized TA", standardized_patent)

vectorized_patent = vectorize_layer(first_patent)
print('First patent vectorized text', vectorized_patent)

First patent TA tf.Tensor(b'Method for the production of colour-and/or effect-endowing multicoat paint on vehicle chassis or parts thereof \rA process for producing multicoat color and/or effect finishes on motor vehicle bodies or parts thereof (substrates), in which (I) a pigmented powder coating material is applied to the substrates, (II) the resulting powder coating film is partly or fully cured, (III) the powder coating film or powder coating is overcoated with a solid-color topcoat, with a basecoat and a clearcoat, or with a clearcoat, after which (IV) the resulting films are cured in each case individually or together with other films (wet-on-wet technique), wherein said powder coating material (A) comprises at least one carboxyl-containing polyester and (B) comprises at least one glycidyl ester of an aromatic or of a saturated or unsaturated cycloaliphatic dicarboxylic acid and/or at least one N,N,N\xc2\x89\xc3\x9b_,N\xc2\x89\xc3\x9b_-tetrakis(beta-hydroxyalkyl)alkanedicarboxami

In [9]:
def vectorize_text(text):
    """Pass ``text`` through the loaded ``vectorize_layer`` and return the result."""
    vectorized = vectorize_layer(text)
    return vectorized

In [10]:
# Vectorize the whole sample tensor with the loaded layer
sample_ds = vectorize_layer(raw_sample_ds)

In [11]:
# Check the shape of the vectorized sample
sample_ds.shape

TensorShape([100, 500])

In [13]:
# Inspect the first two vectorized entries
sample_ds[:2]

<tf.Tensor: shape=(2, 500), dtype=int64, numpy=
array([[ 302,    8,   69,    6,    8,   29,    1,    9,   53,  912,    9,
         259,    4,  302,    8,   69,   47,   48,   10,    1, 1735,    6,
           1,    6,    1,    1,    6,   56,   97,    4,  527,   52,   36,
           1,  240,    4,  227,   69,   58,    7,    4,   69,    9,   53,
          52,   12,  120,   21,  238,  581,   16,  333,  774,    9,    1,
           5,   67, 2403,  220,   18,    8,    4,    8,   36,   26,    4,
         403,    7,    1,  873,   12,   99,   21,    4,  302,    8,   69,
          23,    1,  111,   90,   13,  314,    1,  111,  493,   13,  314,
        1926,  111,   14,   13,  314,  913, 1279,  628,  652,  834,  718,
        1862,    6,    4,  976,   57,    1,   76, 1123,    6,   12,    1,
          17, 1352,    9,    1,    9,   97,    4,  582,   52,   36,   78,
           1,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

## Predict relevancy and save result to .csv

In [16]:
# Load the trained Keras model from disk
model = tf.keras.models.load_model('3.1.2.Model2.keras')

# Run the model on the vectorized sample
predicted_relevancy_2 = model.predict(sample_ds)



In [17]:
# Put the sampled texts in a DataFrame and attach the model output as a new column
data_sample_prediction = data_sample.to_frame()
data_sample_prediction['Predicted_relevancy'] = predicted_relevancy_2
data_sample_prediction.head()

Unnamed: 0,Title_Abstract,Predicted_relevancy,Predicted_relevancy_6
1968,Method for the production of colour-and/or eff...,0.514414,0.406311
3928,High-strength polyurethane heavy-duty corrosio...,0.872142,0.852499
1071,Waterborne coating composition and process of ...,0.477848,0.541831
3392,Forming method for paint film and paint film a...,0.970759,0.985636
240,Paint composition \rPURPOSE:To provide a paint...,0.82323,0.801666


In [18]:
# Write the predictions to CSV. index=False leaves the DataFrame index out of the file.
# NOTE(review): that index holds the row labels carried over from the source table —
# confirm they are not needed to match predictions back to the original entries.
data_sample_prediction.to_csv('4.1.2.Prediction100.csv', index=False)