**Import Libraries**

In [95]:
import numpy as np, pandas as pd, time, tensorflow_addons as tfa, tensorflow as tf, tensorflow.keras as keras, os
import scipy
from keras.layers import Flatten, Dense
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import pearsonr
import random

import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, TFAutoModel
import torch
import wandb
import nltk
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pickle
from sentence_transformers import SentenceTransformer 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

**Download packages**

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
!pip install -U sentence_transformers


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jcecilya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jcecilya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jcecilya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

**Reading input variables**

In [3]:
df_train= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/train.csv')
df_test= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/test.csv')
titles= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/titles.csv')
train_df=df_train.merge(titles, left_on='context', right_on='code', how='left')
test_df=df_test.merge(titles, left_on='context', right_on='code', how='left')

In [5]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,


***DATA PREPARATION***

**Preprocessing input text**

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()

def lemma_stopwords(sent):
    new_sent=[]
    filtered_words = " ".join([word.lower() for word in sent.split(" ") if word not in stopwords.words('english')])
    x1="".join([word for word in filtered_words if word not in string.punctuation])
    for x in x1:
        new_sent.append(lemmatizer.lemmatize(x, pos ="v"))
    return "".join(new_sent) 


In [7]:
train_df["text_a"] = (train_df['anchor']+' '+train_df['title']+' '+train_df['target']).apply(lambda x: lemma_stopwords(x))
sentence_pairs = train_df["text_a"].values.astype(str)
y = train_df['score'].values.astype(np.float64)

***Model 1: Using PatentSBERTa sentence transformer and data modelling using XGB algorithm***

**Huggingface's pretrained PatentSBERTa sentence transformer**

It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
https://huggingface.co/AI-Growth-Lab/PatentSBERTa

In [8]:
strans = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')
X = strans.encode(sentence_pairs)

**Download and load transformers for resuability**

In [9]:
import pickle
with open('sberta_trans.pkl', "wb") as fOut:
    pickle.dump({'sentences': sentence_pairs, 'embeddings': X}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

import pickle
#Load sentences & embeddings from disc
with open('sberta_trans.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    X = stored_data['embeddings']

In [10]:

import pickle
#Load sentences & embeddings from disc
with open('sberta_trans.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    X = stored_data['embeddings']

**Split input for testing. Data Modelling with hyper parameter tuning**

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)
model = xgb.XGBRegressor(seed=20)
params= {
    "learning_rate":[0.15], 
    "max_depth":[15,25]
}

model = GridSearchCV(model,params, cv=5, n_jobs= -1, verbose = 1)


In [21]:

os.environ["TOKENIZERS_PARALLELISM"] = "false"
z=model.fit(X_train, y_train)


Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [22]:
pickle.dump(z, open('xgb.pkl', 'wb'))
with open('xgb.pkl' , 'rb') as f:
    lr = pickle.load(f)

In [23]:
preds = lr.predict(X_test)
mean_squared_error(y_test, preds)

0.05023928121731964

In [24]:
test_df["text_a"] = (test_df['anchor']+' '+test_df['title']+' '+test_df['target']).apply(lambda x: lemma_stopwords(x))
sentence_pairs_test = test_df["text_a"].values.astype(str)
test_encoded = strans.encode(sentence_pairs_test)

In [25]:
preds = lr.predict(test_encoded)

In [26]:
submission= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/sample_submission.csv')


In [27]:
submission['score'] = preds
submission.to_csv("sberta_submission.csv", index=False)
submission

Unnamed: 0,id,score
0,4112d61851461f60,0.584368
1,09e418c93a776564,0.501242
2,36baf228038e314b,0.497741
3,1f37ead645e7f0c8,0.254475
4,71a5b6ad068d531f,0.002114
5,474c874d0c07bd21,0.467312
6,442c114ed5c4e3c9,0.498677
7,b8ae62ea5e1d8bdb,0.002952
8,faaddaf8fcba8a3f,0.252589
9,ae0262c02566d2ce,0.645575


***MODEL 2 - Using allmini sentence transformer and XGB Model***

**Using pretrained Allmini Transformer model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2**

In [None]:
allmini_strans = SentenceTransformer('all-MiniLM-L6-v2')
X_allmini = allmini_strans.encode(sentence_pairs)

In [None]:
import pickle
with open('all_mini_encode.pkl', "wb") as fOut:
    pickle.dump({'sentences': sentence_pairs, 'embeddings': X_allmini}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
import pickle
#Load sentences & embeddings from disc
with open('all_mini_encode.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    X_allmini = stored_data['embeddings']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_allmini, y, stratify=y, random_state=42, test_size=0.2)

In [None]:

from sklearn.model_selection import GridSearchCV

import xgboost as xgb

model = xgb.XGBRegressor(seed=20)
params= {
    "learning_rate":[0.1], 
    "max_depth":[15]
}


model = GridSearchCV(model,params, cv=5, n_jobs= -1, verbose = 1)

allmini_model=model.fit(X_train, y_train)

pickle.dump(allmini_model, open('allmini_model2.pkl', 'wb'))


In [34]:
import pickle
with open('allmini_model2.pkl' , 'rb') as f:
    lr = pickle.load(f)

In [36]:
preds = lr.predict(X_test)
mean_squared_error(y_test, preds)

0.04913873149108243

In [37]:
test_encoded = allmini_strans.encode(sentence_pairs_test)
preds = lr.predict(test_encoded)

In [38]:
submission_allmini= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/sample_submission.csv')
submission_allmini['score'] = preds
submission_allmini.to_csv("submission_allmini.csv", index=False)
submission_allmini

Unnamed: 0,id,score
0,4112d61851461f60,0.412687
1,09e418c93a776564,0.448923
2,36baf228038e314b,0.471552
3,1f37ead645e7f0c8,0.383177
4,71a5b6ad068d531f,0.330968
5,474c874d0c07bd21,0.226029
6,442c114ed5c4e3c9,0.418601
7,b8ae62ea5e1d8bdb,0.22565
8,faaddaf8fcba8a3f,0.300469
9,ae0262c02566d2ce,0.452526


***Model 3 - Using glove worlds embeddings and Data Modelling with LSTM***

**GloVe is an unsupervised learning algorithm to learn vector representation i.e word embedding for various words. GloVe stands for Global Vectors for Word Representations. In this code I used 100 dimensional GloVe vectors**

In [92]:
df_train= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/train.csv')
df_test= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/test.csv')
titles= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/titles.csv')
train_df=df_train.merge(titles, left_on='context', right_on='code', how='left')
test_df=df_test.merge(titles, left_on='context', right_on='code', how='left')

In [93]:
train_df["text_a"] = (train_df['anchor']+' '+train_df['title']+' '+train_df['target']).apply(lambda x: lemma_stopwords(x))
sentence_pairs = train_df["text_a"]
y = train_df['score'].values.astype(np.float64)
texts=sentence_pairs
labels=y


In [99]:
X_train, X_test,Y_train, Y_test = train_test_split(texts, labels, test_size=0.2, random_state = 45)


In [100]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

words_to_index = tokenizer.word_index

In [107]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)



  return word_to_vec_map

In [102]:
word_to_vec_map =read_glove_vector (os.path.join(
    os.path.expanduser("~"), "./Documents/Projects/Sciencw/USPatent/glove.6B.100d.txt"))
maxLen = 250

In [113]:
maxLen

250

In [114]:
vocab_len = len(words_to_index)
embed_vector_len = 250
emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)


**LSTM stands for Long Term Short Memory and its a type of RNN architecture and is used in NLP problems as it handles long sequence dependencies well**

In [115]:

def uspatent(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = LSTM(128, return_sequences=True)(embeddings)

  X = Dropout(0.6)(X)

  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.6)(X)

  X = LSTM(128)(X)

  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

In [116]:

X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

In [85]:
adam = keras.optimizers.Adam(learning_rate = 0.1)
model.compile(loss="mse", optimizer="Adam", metrics=["mae"])
history=model.fit(X_train_indices, Y_train, batch_size=30, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [123]:
# pickle.dump(history, open('LSTM.pkl', 'wb'))
with open('LSTM.pkl' , 'rb') as f:
    model = pickle.load(f)

In [118]:

test_df["text_a"] = (test_df['anchor']+' '+test_df['title']+' '+test_df['target']).apply(lambda x: lemma_stopwords(x))
test=test_df["text_a"]

In [125]:

X_test_indices = tokenizer.texts_to_sequences(test)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
preds = model.predict(X_test_indices)


In [89]:
submission_lstmglove= pd.read_csv('/Users/jcecilya/Documents/Projects/Sciencw/USPatent/sample_submission.csv')
submission_lstmglove['score'] = preds
submission_lstmglove.to_csv("submission_lstmglove.csv", index=False)
submission_lstmglove

Unnamed: 0,id,score
0,4112d61851461f60,0.570836
1,09e418c93a776564,0.601534
2,36baf228038e314b,0.167181
3,1f37ead645e7f0c8,0.268841
4,71a5b6ad068d531f,0.091575
5,474c874d0c07bd21,0.617274
6,442c114ed5c4e3c9,0.5768
7,b8ae62ea5e1d8bdb,0.026787
8,faaddaf8fcba8a3f,0.253366
9,ae0262c02566d2ce,0.644704
