In [None]:
conda install -c conda-forge numpy=1.19.5

In [None]:
import numpy as np

In [None]:
# Display the version string of the NumPy build the kernel imported.
np.__version__

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from tensorflow.keras.layers import Embedding

In [None]:
import numpy as np

In [None]:
# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [None]:
# Colab only: expose Google Drive under /content/drive so the data files
# referenced by the following cells can be read and written.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read only the first two columns of the CSV from Drive; malformed rows are
# skipped instead of raising.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 in favour of
# `on_bad_lines='skip'` - switch once the pinned environment allows it.
df = pd.read_csv(
    '/content/drive/My Drive/IDAbstractKoganScore.csv',
    sep=',',
    usecols=[0, 1],
    error_bad_lines=False,
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna(axis=0, how='any')

In [None]:
# Preview the first rows again, now that rows with missing values are gone.
df.head()

Unnamed: 0,Kogan_Score,Text_Abstract
0,1.616379,A method for fabricating a semiconductor diod...
1,0.501945,The invention is directed to a method of and ...
2,0.128938,A method and apparatus for end jointing timbe...
3,0.89154,A junction field effect transistor having a V ...
4,1.175692,A thermally and electrically conductive metal...


In [None]:
# Vocabulary cap: used as the tokenizer's num_words and as the number of rows
# of the embedding matrix / Embedding layer further down.
vocabulary_size = 285_000

In [None]:
# Fit the Keras tokenizer on the abstract texts, capped at vocabulary_size words.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(texts=df['Text_Abstract'])

In [None]:
# Turn every abstract into a list of token ids, then pad/truncate each list
# to a fixed length of 250 ids.
sequences = tokenizer.texts_to_sequences(texts=df['Text_Abstract'])
data = pad_sequences(sequences=sequences, maxlen=250)

In [None]:
import pickle

# Persist the padded id matrix on Drive (highest pickle protocol available).
with open('/content/drive/My Drive/dataPaddedSequences.pickle', mode='wb') as pickle_out:
    pickle.dump(data, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# Restore the padded id matrix from Drive.
# NOTE: pickle.load executes arbitrary code from the file - only load files
# you wrote yourself.
with open('/content/drive/My Drive/dataPaddedSequences.pickle', mode='rb') as pickle_in:
    data = pickle.load(pickle_in)

In [None]:
# Sanity check: print the shape of the padded sequence array.
print(data.shape)

In [None]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each non-empty line is "<word> <v1> <v2> ...".
embeddings_index = dict()
# `with` guarantees the file is closed even if a line fails to parse.
with open(r'/content/drive/My Drive/glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Create a weight matrix for words in the training docs: row i holds the GloVe
# vector of the word whose tokenizer index is i; words without a GloVe vector
# (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index >= vocabulary_size:
        # Outside the matrix. Skip this word instead of stopping the loop, so
        # the result does not depend on word_index being iterated in
        # ascending index order.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Regression target: the Kogan_Score column of the cleaned frame.
labels = df.loc[:, 'Kogan_Score']

In [None]:
# CNN + LSTM regressor on top of frozen GloVe embeddings.
model_glove = Sequential()
# Embedding rows are initialised from embedding_matrix and kept fixed.
model_glove.add(Embedding(vocabulary_size, 100, input_length=250, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.5))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(256))
model_glove.add(Dense(256, activation='relu'))
# The target is one score per abstract, so the network must end in a single
# linear unit. Previously the 256-unit ReLU layer was the output, giving 256
# predictions per sample that were broadcast against the scalar label.
model_glove.add(Dense(1))
# 'accuracy' is a classification metric and is meaningless for a continuous
# target, so only regression metrics are tracked.
model_glove.compile(loss='mae', optimizer='adam', metrics=['mse', 'mae'])

In [None]:
# checkpoint
# Import from tensorflow.keras so the callback comes from the same Keras
# implementation as the models built in this notebook.
from tensorflow.keras.callbacks import ModelCheckpoint

path_begin = "/content/drive/My Drive/"
# File name embeds the epoch number and validation loss; both are only
# available in the logs at the end of an epoch.
filepath = path_begin + "weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',    # the models are regressors: track validation loss
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='min',
    save_freq='epoch',     # an integer save_freq counts batches, where val_* logs do not exist
)
# Pass this list as fit(..., callbacks=callbacks_list) to activate checkpointing.
callbacks_list = [checkpoint]

In [None]:
# Short 2-epoch run with 40% of the rows held out for validation, then store
# the resulting weights on Drive.
model_glove.fit(
    x=data,
    y=np.array(labels),
    validation_split=0.4,
    epochs=2,
)
model_glove.save_weights('/content/drive/My Drive/model_glove_2epochs_pretransfer.h5')

Epoch 1/60
Epoch 2/60

In [None]:
# Second CNN + LSTM regressor on frozen GloVe embeddings; it receives the
# weights trained on model_glove in a following cell.
model_glove_transfer = Sequential()
model_glove_transfer.add(Embedding(vocabulary_size, 100, input_length=250, weights=[embedding_matrix], trainable=False))
model_glove_transfer.add(Dropout(0.5))
model_glove_transfer.add(Conv1D(64, 5, activation='relu'))
model_glove_transfer.add(MaxPooling1D(pool_size=4))
model_glove_transfer.add(LSTM(256))
model_glove_transfer.add(Dense(256, activation='relu'))
# The target is one score per abstract, so finish with a single linear unit
# rather than using the 256-unit ReLU layer as the output.
model_glove_transfer.add(Dense(1))

In [None]:
# Copy the trained weights layer by layer from model_glove into
# model_glove_transfer.
# Model.get_weights() returns one flat list of arrays for the whole model, so
# it cannot be indexed by layer position; each layer's own get_weights() list
# is handed to the matching layer's set_weights() instead.
# zip() stops at the shorter layer list, so any extra trailing layer on either
# model is simply left untouched.
for source_layer, target_layer in zip(model_glove.layers, model_glove_transfer.layers):
    target_layer.set_weights(source_layer.get_weights())

In [None]:
# Compile and train the transfer model. The original cell compiled
# model_glove_transfer but then called fit on model_glove, so the transfer
# model was never trained.
# 'accuracy' is dropped because it is meaningless for a continuous target.
model_glove_transfer.compile(loss='mae', optimizer='adam', metrics=['mse', 'mae'])
model_glove_transfer.fit(data, np.array(labels), validation_split=0.15, epochs=25)