In [1]:
import pandas as pd
import numpy as np
import os

import fasttext
import fasttext.util

from keybert import KeyBERT

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Root directory of the graph data set. The cluster path remains the default,
# but it can be overridden without editing the notebook by exporting
# MSC_GRAPH_DATA_HOME before starting the kernel.
data_home = os.environ.get("MSC_GRAPH_DATA_HOME", "/lyceum/jhk1c21/msc_project/data/graph")

In [2]:
# Load the node table.
# This cell used to assign `df` three times in a row (filtered pickle,
# filtered CSV, full CSV). Each assignment overwrote the previous one, so only
# the last load was ever used. Keep that effective load and leave the other
# sources as commented-out switches so unused files are no longer read (or
# required to exist).
# df = pd.read_pickle(os.path.join(data_home, 'filtered', 'filtered_nodes.pkl'))
# df = pd.read_csv(os.path.join(data_home, 'filtered', 'filtered_nodes.csv'))
# NOTE(review): read_csv does not rebuild Python lists; if 'keywords' / 'fos'
# hold lists in the pickle, they arrive from the CSV as plain strings. Confirm
# the CSV is the intended source before computing embeddings.
df = pd.read_csv(os.path.join(data_home, 'full', 'nodes_full.csv'))

In [12]:
# Pull the columns used below out of the frame as plain Python lists.
title_list, keywords_list, abstract_list, fos_list = (
    list(df[column]) for column in ('title', 'keywords', 'abstract', 'fos')
)

In [4]:
# Load the fastText binary model used for every embedding below. The path is
# relative, so it is resolved against the kernel's current working directory.
# NOTE(review): 'cc.en.300.bin' looks like the pre-trained English
# 300-dimensional vectors — confirm the file is present before running.
fast_model = fasttext.load_model('cc.en.300.bin')



In [61]:
def word_embedding(model, keywords_list):
    """Look up a word vector for every keyword of every entry.

    Args:
        model: object exposing ``get_word_vector(word)``.
        keywords_list: iterable whose items are iterables of keywords.

    Returns:
        A list with one ``np.ndarray`` per entry, built from the vectors of
        that entry's keywords in order.
    """
    per_entry_vectors = []
    for keywords in keywords_list:
        vectors = [model.get_word_vector(word) for word in keywords]
        per_entry_vectors.append(np.array(vectors))
    return per_entry_vectors
    
def word_list_embedding(model, keywords_list):
    """Average the word vectors of each entry into a single vector.

    Args:
        model: object exposing ``get_word_vector(word)``.
        keywords_list: iterable whose items are iterables of keywords.

    Returns:
        A plain Python list (not a stacked array) holding, for each entry,
        the mean over axis 0 of that entry's word vectors.
    """
    pooled = []
    for word_vectors in word_embedding(model, keywords_list):
        # Mean across the keyword axis collapses the entry to one vector.
        pooled.append(np.mean(word_vectors, axis=0))
    return pooled
    
def sentence_embedding(model, sentence_list):
    """Embed each sentence with the model's sentence-vector method.

    Every sentence is split on whitespace and re-joined with single spaces
    before being passed to ``model.get_sentence_vector``.

    Args:
        model: object exposing ``get_sentence_vector(text)``.
        sentence_list: iterable of strings.

    Returns:
        ``np.ndarray`` built from the per-sentence vectors, in input order.
    """
    vectors = []
    for sentence in sentence_list:
        normalized = ' '.join(sentence.split())
        vectors.append(model.get_sentence_vector(normalized))
    return np.array(vectors)

In [17]:
# One averaged fastText vector per row, computed from that row's keywords.
keywords_embedding_list = word_list_embedding(model=fast_model, keywords_list=keywords_list)

In [18]:
# `word_list_embedding` returns a plain Python list, which has no `.shape`
# attribute. Report (number of entries, *shape of the first pooled vector)
# instead. Assumes the list is non-empty.
(len(keywords_embedding_list), *np.shape(keywords_embedding_list[0]))

(148061, 300)

In [8]:
# Sentence-level fastText vector for every title.
title_embedding_list = sentence_embedding(model=fast_model, sentence_list=title_list)

In [11]:
# Peek at the leading ten components of the first two title vectors.
for row in (0, 1):
    print(title_embedding_list[row][:10])

[ 0.00433523 -0.02681895  0.03109596  0.03126718  0.01572027  0.03395665
  0.07647985 -0.03281466  0.03433651  0.00113465]
[ 0.01835213 -0.01114322  0.03089921 -0.00523332  0.032729    0.03502502
  0.03554188 -0.02476176  0.01622009  0.02455734]


In [53]:
# Accumulator: one list of extracted keywords per abstract, filled by the
# extraction loop.
keywords_from_abstract = []
# A keyword is kept only when its KeyBERT score is strictly above this value.
threshold = 0.3
# Fallback size: when no keyword passes `threshold`, the first
# `keywords_limit` results are kept instead.
keywords_limit = 2
# KeyBERT instance created with its default arguments.
bert_model = KeyBERT()

In [54]:
# Extract keywords from every abstract with KeyBERT.
# Reset the accumulator so re-running this cell does not append duplicates.
keywords_from_abstract = []

for idx, abstract in enumerate(abstract_list):
    # Each result item is a (keyword, score) pair.
    keywords = bert_model.extract_keywords(abstract, top_n=20, use_mmr=True)

    # Keep the keywords that score above the threshold ...
    filtered_keywords = [word for word, score in keywords if score > threshold]
    if not filtered_keywords:
        # ... or fall back to the first few results when none qualifies.
        filtered_keywords = [word for word, _ in keywords[:keywords_limit]]

    keywords_from_abstract.append(filtered_keywords)
    if idx % 1000 == 0:
        # `filtered_keywords` is a list, so report its length; lists have no
        # `.shape` attribute and the previous message raised AttributeError.
        print(f"{idx} is done => keywords: {len(filtered_keywords)}")

0 is done
1000 is done
2000 is done
3000 is done
4000 is done
5000 is done
6000 is done
7000 is done
8000 is done
9000 is done
10000 is done
11000 is done
12000 is done
13000 is done
14000 is done
15000 is done
16000 is done
17000 is done
18000 is done
19000 is done
20000 is done
21000 is done
22000 is done
23000 is done
24000 is done
25000 is done
26000 is done
27000 is done
28000 is done
29000 is done
30000 is done
31000 is done
32000 is done
33000 is done
34000 is done
35000 is done
36000 is done
37000 is done
38000 is done
39000 is done
40000 is done
41000 is done
42000 is done
43000 is done
44000 is done
45000 is done
46000 is done
47000 is done
48000 is done
49000 is done
50000 is done
51000 is done
52000 is done
53000 is done
54000 is done
55000 is done
56000 is done
57000 is done
58000 is done
59000 is done
60000 is done
61000 is done
62000 is done
63000 is done
64000 is done
65000 is done
66000 is done
67000 is done
68000 is done
69000 is done
70000 is done
71000 is done
72000

In [62]:
# One averaged fastText vector per abstract, computed from its extracted keywords.
abstract_embedding_list = word_list_embedding(model=fast_model, keywords_list=keywords_from_abstract)

In [80]:
# Flag every abstract whose pooled embedding does not have the same shape as
# the first one, then print the offending abstracts for inspection.
original_shape = abstract_embedding_list[0].shape
to_eleminate_list = [
    idx
    for idx, embedding in enumerate(abstract_embedding_list)
    if embedding.shape != original_shape
]
for idx in to_eleminate_list:
    print(idx, len(abstract_list[idx]), abstract_list[idx])

1607 469 += , . = = ( ) < • ∫ = • ∕ = ∕ = < × = = ( ) = + ∫ = • ∫ = . .. .. .. .. .. .. .. .. .. .. .. .. .. .... .. .. .. .. .. .. .. .. . .. . .. . .. .. .. . ... .. .. .. .. .. .. .. .... .... .. .. .. . . .. .. .. .. .. .. .... .... ... ... ... . ...... .................... .... .... .... .... .. .. .. . . .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .
8212 3 N/A
9424 2   
20851 1 :
24129 2   
24705 20 'Where's' in a name?
35417 3 N/A
42362 2   
55084 3 N/A
67338 2   
83948 2   
91579 2   
93543 2   
103099 3 N/A
104488 2   
104530 2   
109805 674 .@~.~• @.~@. ~ ~@#@@@#. @@@@@@@@. .•°.... .... @° .'. .... @. .° .... @. .. .° .@ .... .. .@ .... .. °@ .... °° °@ .... ~ .... ..@ ................ .. @ .... ..@ ................ .. @ .... • .@ ................ .. @ .... • .@ ................ .. @ .... ..~ ................ ~... @... ~... ~... • .@ ........ • ~ ; ..,. ...

In [115]:
def _without_positions(sequence, positions):
    """Return the items of `sequence` as a list, skipping the given positions.

    `positions` is walked in order; the slices between consecutive positions
    are concatenated, followed by everything after the last position.
    """
    kept = []
    start = 0
    for end in positions:
        kept.extend(sequence[start:end])
        start = end + 1
    kept.extend(sequence[start:])
    return kept


# Drop the flagged rows from every embedding collection and from the matching
# `fos` / `_id` columns so all five stay aligned.
keywords_embedding = _without_positions(keywords_embedding_list, to_eleminate_list)
title_embedding = _without_positions(title_embedding_list, to_eleminate_list)
abstract_embedding = _without_positions(abstract_embedding_list, to_eleminate_list)

filtered_fos = _without_positions(list(df['fos']), to_eleminate_list)
filtered_id = _without_positions(list(df['_id']), to_eleminate_list)

In [146]:
# Convert the filtered Python lists to NumPy arrays ahead of saving.
keywords_embedding = np.asarray(keywords_embedding)
title_embedding = np.asarray(title_embedding)
abstract_embedding = np.asarray(abstract_embedding)

# `fos` is stored with object dtype; ids use whatever dtype NumPy infers.
filtered_fos = np.asarray(filtered_fos, dtype='object')
filtered_id = np.asarray(filtered_id)

In [147]:
# Persist every array next to the notebook's data directory, one file each.
for target_path, array in (
    ('../data/title.npy', title_embedding),
    ('../data/keywords.npy', keywords_embedding),
    ('../data/abstract.npy', abstract_embedding),
    ('../data/id.npy', filtered_id),
    ('../data/fos.npy', filtered_fos),
):
    np.save(target_path, array)

In [157]:
# how to load the npy file, particularly fos.npy
with open('../data/fos.npy', 'rb') as f:
    a = np.load(f, allow_pickle=True)

array([list(['Objective programming', 'Fire protection', 'Computer science', 'Fuzzy logic', 'Operations research', 'Fire risk', 'Genetic algorithm', 'Decision maker']),
       list(['Integer', 'Weight function', 'Edge cover', 'Vertex (graph theory)', 'Graph theory', 'Approximation algorithm', 'Discrete mathematics', 'Dynamic programming', 'Combinatorics', 'Exponential function', 'Vertex (geometry)', 'Algorithm', 'Neighbourhood (graph theory)', 'Minimum weight', 'Vertex cover', 'Mathematics']),
       list(['CDNA Arrays', 'Normalization (statistics)', 'Pattern recognition', 'Biology', 'Population Heterogeneity', 'High density', 'Comparative genomic hybridization', 'Artificial intelligence', 'Mega-', 'Genetics', 'DNA microarray', 'Gene expression microarray']),
       ...,
       list(['XML Encryption', 'XML framework', 'Computer science', 'XML validation', 'Document Structure Description', 'XML namespace', 'Theoretical computer science', 'RELAX NG', 'XML schema', 'XML Signature']),
    

In [1]:
# NOTE(review): `similarity` is not defined in any cell shown in this notebook;
# evaluating it raises NameError (see the recorded output). Define it in an
# earlier cell or remove this cell.
similarity

NameError: name 'similarity' is not defined