In [1]:
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity

2021-12-21 18:28:58.893900: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-21 18:28:58.893935: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Methods to try for orphan adoption
1. Using BERT embeddings to find the most similar L1 and L2 topics
2. Using occurrences to determine parents
3. Using a model to predict the parent

In [3]:
# Load the tokenizer and the TensorFlow BERT encoder from the same
# pretrained checkpoint so their vocabularies match.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [4]:
# Load the pickled topic vocabulary used by the model.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load `topics_vocab.pkl` from a trusted source.
with open("topics_vocab.pkl", "rb") as f:
    target_vocab = pickle.load(f)
    
# Number of entries in the vocabulary.
len(target_vocab)

82178

In [7]:
# One-column frame holding the vocabulary keys: the keys become the index
# (orient='index'), are moved into an 'index' column by reset_index(), and
# that column is then renamed to 'normalized_name' for the merges below.
model_tags_df = (
    pd.DataFrame.from_dict(target_vocab, orient='index')
    .reset_index()[['index']]
    .rename(columns={'index': 'normalized_name'})
)

In [8]:
# Preview the first vocabulary entries.
model_tags_df.head()

Unnamed: 0,normalized_name
0,medicine
1,chemistry
2,biology
3,computer science
4,materials science


In [29]:
def fix_empty_list(child):
    """Normalise one cell of the merged ``child`` column to a Python list.

    After a left merge the cell is either an array-like of child names
    (anything exposing ``.tolist()``) or a missing value (NaN/None) for
    rows that had no match.

    Args:
        child: Cell value taken from the ``child`` column.

    Returns:
        list: The children as a list. Existing lists/tuples are preserved
        (as a new list); values without ``.tolist()`` yield ``[]``.
    """
    # A value that is already a list/tuple has no .tolist(); keep its items
    # instead of silently discarding them.
    if isinstance(child, (list, tuple)):
        return list(child)
    try:
        return child.tolist()
    except AttributeError:
        # Missing values (float NaN, None) have no .tolist(): no children.
        return []

In [89]:
def create_new_name(orig_name, children, max_children=25):
    """Build the text to embed for a topic, optionally enriched with children.

    When the topic has at least one child and fewer than ``max_children``,
    the children are appended as ``"<name> and also <c1>, <c2>, ..."``.
    Otherwise the original name is returned unchanged (as a string).

    Args:
        orig_name: The topic's name.
        children: Sequence of child topic names (strings); may be empty.
        max_children: Exclusive upper bound on the number of children for
            which the enriched name is produced. Defaults to 25.

    Returns:
        str: The enriched name, or ``str(orig_name)`` when there are no
        children or too many of them.
    """
    if 0 < len(children) < max_children:
        children_string = ', '.join(children)
        return f"{orig_name} and also {children_string}"
    return f"{orig_name}"

In [90]:
# Reference table of fields of study; rows with any missing value are
# dropped and `level` is cast to an integer.
tags_df = (
    pd.read_parquet('fields_of_study_ids.parquet')
    .dropna()
    .astype({'level': 'int'})
)

# Orphan topics, restricted (inner join) to names present in the model
# vocabulary.
orphan_df = pd.read_parquet('mag_imitator_orphans.parquet').merge(
    model_tags_df, on='normalized_name', how='inner'
)

# Attach each orphan's children, if any. The left join keeps orphans with no
# row in `children`; for those, `parent` and `child` are missing.
children = pd.read_parquet('children.parquet')
orphan_df = orphan_df.merge(
    children, how='left', left_on='normalized_name', right_on='parent'
).copy()

# Turn every `child` cell into a plain list, then build the text to embed.
orphan_df['child'] = orphan_df['child'].apply(fix_empty_list)
orphan_df['new_name'] = orphan_df.apply(
    lambda row: create_new_name(row['normalized_name'], row['child']),
    axis=1,
)

In [91]:
# Preview the parent -> children table.
children.head()

Unnamed: 0,parent,child
0,1 2 dichlorobenzene,[2 dichlorobenzene]
1,1 2 dimethylhydrazine,"[methylhydrazines, mucin depleted foci]"
2,1 3 butadiene,[1 2 epoxy 3 butene]
3,1 3 dichloropropene,[1 2 dichloropropane]
4,1 3 dipolar cycloaddition,"[nitrilimine, tf biphamphos, azomethine ylide,..."


In [92]:
# Preview the merged orphans, including the derived `new_name` column.
orphan_df.head(10)

Unnamed: 0,field_of_study_id,normalized_name,level,parent,child,new_name
0,2908647359,population,2,population,"[chaerophyllum aureum, neotropical parrot, tri...",population
1,3018028166,in patient,3,,[],in patient
2,3020570323,preparation method,2,preparation method,"[iron black, white beeswax, buddleja officinal...",preparation method
3,3017944768,poison control,2,poison control,"[public drunkenness, glycol poisoning, chemica...",poison control
4,3018420607,control methods,2,control methods,[rate control method],control methods and also rate control method
5,2910001868,european union,2,european union,"[clp regulation, nomenclature of territorial u...",european union
6,3013748606,human immunodeficiency virus,2,human immunodeficiency virus,"[deceptive imprinting, anti retro viral, hiv i...",human immunodeficiency virus
7,3018023364,significant difference,2,significant difference,"[glass carbomer, vaginal spontaneous, vita ena...",significant difference
8,2984729377,high pressure,2,high pressure,"[intermediate pressure, high pressure neurolog...",high pressure
9,2982719155,energy source,3,,[],energy source


In [93]:
# Non-null counts per column for each level. `parent` comes from the left
# merge with `children`, so its count is the number of orphans that matched
# a row there.
orphan_df.groupby('level').count()

Unnamed: 0_level_0,field_of_study_id,normalized_name,parent,child,new_name
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,4566,4566,684,4566,4566
3,518,518,7,518,518


In [94]:
# Number of orphans at each level.
orphan_df['level'].value_counts()

2    4566
3     518
Name: level, dtype: int64

In [95]:
# Preview the fields-of-study reference table.
tags_df.head()

Unnamed: 0,normalized_name,field_of_study_id,level
0,u s standard atmosphere,16287357,3
1,space time block code,83487572,4
2,organizational architecture,156812175,2
3,retained mode,2775964467,3
4,collaborative leadership,2776545201,2


In [108]:
# Sample size and seed for the exploratory comparison. The seed makes this
# cell idempotent: re-running it returns the same orphans in the same order,
# so embeddings already computed from `level_2_orphans` stay aligned with
# the names printed later.
N_ORPHAN_SAMPLE = 50
SAMPLE_SEED = 42

# Level-2 orphans to find parents for (using the child-enriched names).
level_2_orphans = (
    orphan_df[orphan_df['level'] == 2]
    .sample(N_ORPHAN_SAMPLE, random_state=SAMPLE_SEED)['new_name']
    .to_list()
)
print(len(level_2_orphans))

# Candidate parents: every level-1 and level-0 field of study.
level_1_list = tags_df[tags_df['level'] == 1]['normalized_name'].to_list()
print(len(level_1_list))
level_0_list = tags_df[tags_df['level'] == 0]['normalized_name'].to_list()
print(len(level_0_list))

50
292
19


## Orphans

In [103]:

# Alternative tried earlier (kept for reference): mean-pool the last hidden
# state instead of the summed last four layers:
#     emb = model(toks).last_hidden_state.numpy().mean(axis=-2)

# Current approach: sum BERT's last four hidden-state layers, then average
# over the token axis to get one vector per orphan name.
orphan_vectors = []
for orphan_name in level_2_orphans:
    encoded = tokenizer(orphan_name, return_tensors="tf")
    outputs = model(encoded, output_hidden_states=True)
    summed_layers = tf.add_n(outputs.hidden_states[-4:])
    orphan_vectors.append(summed_layers.numpy().mean(axis=-2))
# Stack the per-name results and drop the singleton batch axis.
orphan_embs = np.array(orphan_vectors)[:, 0, :]

# TODO: embedding layer (no code for this variant yet)

## Level 1

In [104]:
# Alternative tried earlier (kept for reference): mean-pool the last hidden
# state instead of the summed last four layers:
#     emb = model(toks).last_hidden_state.numpy().mean(axis=-2)

# Current approach: sum BERT's last four hidden-state layers, then average
# over the token axis to get one vector per level-1 topic name.
level_1_vectors = []
for topic_name in level_1_list:
    encoded = tokenizer(topic_name, return_tensors="tf")
    outputs = model(encoded, output_hidden_states=True)
    summed_layers = tf.add_n(outputs.hidden_states[-4:])
    level_1_vectors.append(summed_layers.numpy().mean(axis=-2))
# Stack the per-name results and drop the singleton batch axis.
level_1_embs = np.array(level_1_vectors)[:, 0, :]

# TODO: embedding layer (no code for this variant yet)

## Level 0

In [111]:
# Sum BERT's last four hidden-state layers, then average over the token
# axis to get one vector per level-0 topic name.
level_0_vectors = []
for topic_name in level_0_list:
    encoded = tokenizer(topic_name, return_tensors="tf")
    outputs = model(encoded, output_hidden_states=True)
    summed_layers = tf.add_n(outputs.hidden_states[-4:])
    level_0_vectors.append(summed_layers.numpy().mean(axis=-2))
# Stack the per-name results and drop the singleton batch axis.
level_0_embs = np.array(level_0_vectors)[:, 0, :]

# TODO: embedding layer (no code for this variant yet)

In [105]:
# Sanity check: one row per sampled orphan.
orphan_embs.shape

(50, 768)

In [106]:
# Sanity check: one row per level-1 topic.
level_1_embs.shape

(292, 768)

In [112]:
# Sanity check: one row per level-0 topic.
level_0_embs.shape

(19, 768)

In [113]:
# For every sampled orphan, print the most similar level-0 and level-1
# topics by cosine similarity of the embeddings.
#
# NOTE: row i of `orphan_embs` is assumed to be the embedding of
# `level_2_orphans[i]`. If the orphans are re-sampled, recompute
# `orphan_embs` before running this cell, otherwise names and scores will
# not correspond.
for i in range(orphan_embs.shape[0]):
    print(f"{level_2_orphans[i]}")
    query = orphan_embs[i].reshape(1, -1)

    # (label, candidate embeddings, candidate names, matches to print)
    for label, cand_embs, cand_names, top_k in (
        ("L0", level_0_embs, level_0_list, 3),
        ("L1", level_1_embs, level_1_list, 5),
    ):
        print(label)
        # Compute the similarities once and reuse them for ranking and scores.
        sims = cosine_similarity(query, cand_embs)[0]
        # Ascending argsort -> keep the last `top_k` -> reverse for best-first.
        # Slicing also copes with fewer than `top_k` candidates.
        best = np.argsort(sims)[-top_k:][::-1]
        for j in best:
            print(f"_______ {cand_names[j]}: {sims[j]}")
    print("\n")

two temperature
L0
_______ philosophy: 0.7631981372833252
_______ psychology: 0.7529655694961548
_______ economics: 0.752226710319519
L1
_______ risk analysis: 0.8285120725631714
_______ financial system: 0.8279814720153809
_______ engineering ethics: 0.8186806440353394
_______ positive economics: 0.816841185092926
_______ demographic economics: 0.8166027069091797


business activities
L0
_______ business: 0.7770204544067383
_______ medicine: 0.7626142501831055
_______ art: 0.7533687353134155
L1
_______ economic system: 0.8523975610733032
_______ financial system: 0.8357780575752258
_______ industrial organization: 0.8275732398033142
_______ multimedia: 0.8180881142616272
_______ communication: 0.809441089630127


bioactive molecules
L0
_______ environmental science: 0.45146480202674866
_______ materials science: 0.4260980188846588
_______ medicine: 0.39979973435401917
L1
_______ gastroenterology: 0.7339313626289368
_______ endocrinology: 0.706972599029541
_______ chromatography: 0.648

### Word2Vec

In [114]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
# Train a small Word2Vec model on `common_texts` (imported from
# gensim.test.utils) and save it to disk.
# NOTE(review): this rebinds the global `model`, which earlier holds the
# TFBertModel used by the embedding cells. Re-running those cells after this
# one will fail; consider a distinct name (e.g. `w2v_model`) once all later
# uses of `model` are confirmed.
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")