In [None]:
import numpy as np
import pickle 
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional, SpatialDropout1D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [None]:
from google.colab import drive

# Mount Google Drive so the project data files are reachable from the Colab VM.
drive.mount('/content/gdrive')
DATA_PATH = "/content/gdrive/My Drive"

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load these files if they come from a trusted source.
# Context managers make sure the file handles are closed after loading
# (the handles were previously left open for the life of the kernel).
with open(DATA_PATH + '/technologies.pkl', 'rb') as tech_infile:
    tech_pkl = pickle.load(tech_infile)  # used below as a mapping: patent id -> technology index

with open(DATA_PATH + '/abstracts.pkl', 'rb') as infile:
    abs_pkl = pickle.load(infile)  # used below as a mapping: patent id -> abstract text

Mounted at /content/gdrive


In [None]:
# Distinct technology indexes present in the technology mapping.
indexlist = list(set(tech_pkl.values()))

# Patent-level variables (includes the 'patent' id and the 'qje_adj_xi' score
# used further down). Read through DATA_PATH, like the pickle files, so the
# load does not depend on the notebook's current working directory.
df = pd.read_stata(DATA_PATH + '/public_pat_var20200908.dta')

In [None]:
# Flatten the {patent id: abstract} mapping into (patent, abstract) pairs and
# load them as a two-column frame, one row per patent.
patent_abstract_pairs = list(abs_pkl.items())
df_a = pd.DataFrame(patent_abstract_pairs)
df_a.columns = ['patent', 'abstract']

# The 'patent' column of `df` is left exactly as loaded; a pd.to_numeric
# conversion of it was tried here previously and is disabled.

In [None]:
# Technology index to model. 16 is Pharma according to the original author's
# note (looked up from a project spreadsheet). The disabled
# `for idx in indexlist:` loop suggests this was meant to be repeated per index.
TECH_INDEX = 16

# Patent ids belonging to the selected technology, as floats (to match the
# 'patent' column of `df`) and as strings (to match the 'patent' column of `df_a`).
pat_ids = [float(i) for i in tech_pkl if tech_pkl[i] == TECH_INDEX]
pid_str = [str(s) for s in pat_ids]

# Scores for the selected patents. Rows without a score are dropped because a
# missing value cannot serve as a regression target.
industry_scores = (
    df.loc[df['patent'].isin(pat_ids), ['patent', 'qje_adj_xi']]
    .dropna(subset=['qje_adj_xi'])
)
# Re-key the scores with the same str(float(...)) form used for pid_str so the
# ids compare equal to the ones stored in df_a['patent'].
industry_scores = industry_scores.assign(
    patent=industry_scores['patent'].map(lambda p: str(float(p)))
)

# Abstracts for the selected patents.
industry_abstracts = df_a[df_a['patent'].isin(pid_str)]

# Join on the patent id so every abstract is paired with its own score.
# Filtering the two frames independently (as before) gives no guarantee that
# they contain the same patents in the same order, yet they are later paired
# row-by-row as features and targets.
paired = industry_abstracts.merge(industry_scores, on='patent', how='inner')

df_abs = paired[['patent', 'abstract']]  # Industry specific abstracts dataframe
df_n = paired[['qje_adj_xi']]            # Industry specific Kogan scores, row-aligned with df_abs

In [None]:
MAX_NB_WORDS = 10000    # vocabulary size kept by the tokenizer (most frequent words)
MAX_SEQ_LENGTH = 400    # every abstract is padded/truncated to this many tokens
EMBEDDING_DIM = 100     # width of the learned word embeddings

# The backslash is written as '\\' so the literal contains no invalid escape
# sequence ('\]'); the resulting filter string is unchanged.
# NOTE(review): tab and newline are not part of this filter set, so they are
# not stripped before tokens are split — confirm the abstracts contain none.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_abs['abstract'].values)
word_index = tokenizer.word_index

# Abstracts -> integer sequences -> fixed-length matrix of shape (n_abstracts, MAX_SEQ_LENGTH).
X = tokenizer.texts_to_sequences(df_abs['abstract'].values)
X = pad_sequences(X, maxlen=MAX_SEQ_LENGTH)

# Regression targets, one row per row of df_n.
Y = df_n.to_numpy()

# X and Y are paired purely by row position, so they must describe the same
# rows; fail early with a clear message if they do not.
if len(X) != len(Y):
    raise ValueError(
        'Abstracts and scores are not aligned: {} abstracts vs {} score rows'.format(len(X), len(Y))
    )

# Hold out 10% for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)
Y_train = Y_train.astype(float)
# A log transform of the training targets (np.log(Y_train)) was tried here and is disabled.
Y_test = Y_test.astype(float)

In [None]:
# Bidirectional-LSTM regressor: token ids -> embeddings -> BiLSTM -> one predicted score.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
# One linear output unit, because the target Y has a single column (df_n holds
# only 'qje_adj_xi'). The previous 13-unit ReLU head produced 13 outputs that
# were all broadcast against the same scalar target.
model.add(Dense(1))
# NOTE(review): 'accuracy' is not meaningful for a continuous target. It is
# kept so the order of values returned by evaluate() is unchanged; consider
# replacing it with a regression metric such as 'mean_squared_error'.
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 100)          1000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 400, 100)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               160800    
_________________________________________________________________
dense (Dense)                (None, 13)                2613      
Total params: 1,163,413
Trainable params: 1,163,413
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
epochs = 500       # upper bound; early stopping normally ends training much sooner
batch_size = 64

# Stop when the validation loss has not improved by at least min_delta for 3
# epochs, and roll back to the best weights seen. Without
# restore_best_weights the model would keep the weights of the last
# (non-improving) epoch, and those would be the ones evaluated below.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.0001,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,  # 10% of the training rows are held out for validation
    callbacks=[early_stopping],
)

# Evaluate on the held-out test split. return_dict gives each value together
# with its real metric name, so the report cannot mislabel a metric.
test_metrics = model.evaluate(X_test, Y_test, return_dict=True)
accr = list(test_metrics.values())  # same [loss, metric...] list as before

print('Test set')
for metric_name, metric_value in test_metrics.items():
    print('  {}: {:0.3f}'.format(metric_name, metric_value))