In [1]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, TFBertModel

### Methods to try for orphan adoption
1. Using BERT embeddings to find the most similar L1 and L2 topics
2. Using occurrences to determine parents
3. Using a model to predict the parent

In [3]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = TFBertModel.from_pretrained(bert_checkpoint)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load this file if it comes from a trusted source.
with open("topics_vocab.pkl", "rb") as f:
    target_vocab = pickle.load(f)
    
# Displayed as the cell output: number of entries in the loaded vocabulary.
len(target_vocab)

65026

In [8]:
# Keep only the vocabulary keys: they end up as the single `normalized_name`
# column, while the values stored in `target_vocab` are dropped.
model_tags_df = (
    pd.DataFrame.from_dict(target_vocab, orient='index')
    .reset_index()[['index']]
    .rename(columns={'index': 'normalized_name'})
)

In [9]:
model_tags_df.shape

(65026, 1)

In [10]:
model_tags_df.head(5)

Unnamed: 0,normalized_name
0,art
1,earth science
2,social science
3,clustering coefficient
4,gratification


In [8]:
def fix_empty_list(child):
    """Return ``child`` as a plain Python list, or ``[]`` when it is missing.

    Parameters
    ----------
    child : object
        An object exposing ``.tolist()`` (for example a NumPy array), an
        existing list/tuple, or a placeholder without ``.tolist()`` such as
        ``None`` or a float ``NaN``.

    Returns
    -------
    list
        A copy of ``child`` when it is already a list/tuple, the result of
        ``child.tolist()`` when that method exists, otherwise ``[]``.
    """
    # Lists and tuples have no .tolist(); hand them back instead of silently
    # discarding their contents.
    if isinstance(child, (list, tuple)):
        return list(child)
    try:
        return child.tolist()
    except AttributeError:
        # No .tolist() available (None, float NaN, ...): treat as "no children".
        # Only this error is caught so unrelated failures are not hidden.
        return []

In [9]:
def create_new_name(orig_name, children, max_children=25):
    """Build a display name that optionally lists a topic's children.

    Parameters
    ----------
    orig_name : object
        The original topic name; it is formatted into the result string.
    children : sequence of str
        Child topic names to append.
    max_children : int, optional
        Exclusive upper bound on the number of children that may be appended.
        Defaults to 25, the value previously hard-coded.

    Returns
    -------
    str
        ``"<orig_name> and also <child1>, <child2>, ..."`` when ``children``
        has at least one and fewer than ``max_children`` entries, otherwise
        just ``orig_name`` formatted as a string.
    """
    # Chained comparison instead of bitwise `&` on two booleans.
    if 0 < len(children) < max_children:
        return f"{orig_name} and also {', '.join(children)}"
    return f"{orig_name}"

In [14]:
# Fields of study: rows containing any null are dropped, then `level` is cast
# to an integer dtype.
tags_df = (
    pd.read_parquet('fields_of_study_ids.parquet')
    .dropna()
    .assign(level=lambda frame: frame['level'].astype('int'))
)

# Orphans, restricted to names that also appear in `model_tags_df`.
orphan_df = pd.read_parquet('mag_imitator_orphans.parquet').merge(
    model_tags_df, on='normalized_name', how='inner'
)

# Disabled experiment, kept for reference: attach each orphan's children and
# build a longer name from them.
# children = pd.read_parquet('children.parquet')
# orphan_df = orphan_df.merge(children, how='left', left_on = 'normalized_name', right_on='parent').copy()
# orphan_df['child'] = orphan_df['child'].apply(fix_empty_list)
# orphan_df['new_name'] = orphan_df.apply(lambda x: create_new_name(x.normalized_name, 
#                                                                   x.child), axis=1)

In [16]:
orphan_df.shape

(1664, 3)

In [17]:
orphan_df[orphan_df['normalized_name']=='energy source']

Unnamed: 0,field_of_study_id,normalized_name,level
6,2982719155,energy source,3


In [18]:
orphan_df.groupby('level').count()

Unnamed: 0_level_0,field_of_study_id,normalized_name
level,Unnamed: 1_level_1,Unnamed: 2_level_1
2,1489,1489
3,175,175


In [19]:
orphan_df['level'].value_counts()

2    1489
3     175
Name: level, dtype: int64

In [20]:
# Only the concept names are used, so read just that column from the CSV.
openalex_concepts = pd.read_csv("concepts_with_wiki.csv", usecols=['normalized_name'])
openalex_concepts.head(5)

Unnamed: 0,normalized_name
0,political science
1,philosophy
2,economics
3,business
4,psychology


In [21]:
openalex_concepts.shape

(65073, 1)

In [22]:
orphan_df.merge(openalex_concepts, how='inner', on='normalized_name').shape

(1664, 3)

In [23]:
orphan_df.merge(openalex_concepts, how='inner', on='normalized_name')['level'].value_counts()

2    1489
3     175
Name: level, dtype: int64

In [24]:
# Inner join on `normalized_name`: keeps only the orphans whose name also
# appears in `openalex_concepts`.
orphan_and_wiki = orphan_df.merge(openalex_concepts, how='inner', on='normalized_name')

## Orphans

In [103]:

# Orphan embeddings: for every phrase, sum BERT's last four hidden-state
# layers and average over axis -2 (presumably the token axis).
# An earlier variant, now disabled, averaged the final layer only:
#     model(toks).last_hidden_state.numpy().mean(axis=-2)
# NOTE(review): `level_2_orphans` is not defined in the visible part of this
# notebook - confirm where it comes from before a Restart & Run All.
orphan_embs = np.array([
    tf.add_n(
        model(tokenizer(phrase, return_tensors="tf"), output_hidden_states=True).hidden_states[-4:]
    ).numpy().mean(axis=-2)
    for phrase in level_2_orphans
])[:, 0, :]  # take index 0 on axis 1, leaving one row per phrase

# embedding layer

## Level 1

In [104]:
# Level-1 topic embeddings: for every topic name, sum BERT's last four
# hidden-state layers and average over axis -2 (presumably the token axis).
# An earlier variant, now disabled, averaged the final layer only:
#     model(toks).last_hidden_state.numpy().mean(axis=-2)
# NOTE(review): `level_1_list` is not defined in the visible part of this
# notebook - confirm where it comes from before a Restart & Run All.
level_1_embs = np.array([
    tf.add_n(
        model(tokenizer(topic, return_tensors="tf"), output_hidden_states=True).hidden_states[-4:]
    ).numpy().mean(axis=-2)
    for topic in level_1_list
])[:, 0, :]  # take index 0 on axis 1, leaving one row per topic

# embedding layer

## Level 0

In [111]:
# Level-0 topic embeddings: for every topic name, sum BERT's last four
# hidden-state layers and average over axis -2 (presumably the token axis).
# NOTE(review): `level_0_list` is not defined in the visible part of this
# notebook - confirm where it comes from before a Restart & Run All.
level_0_embs = np.array([
    tf.add_n(
        model(tokenizer(topic, return_tensors="tf"), output_hidden_states=True).hidden_states[-4:]
    ).numpy().mean(axis=-2)
    for topic in level_0_list
])[:, 0, :]  # take index 0 on axis 1, leaving one row per topic

# embedding layer

In [105]:
orphan_embs.shape

(50, 768)

In [106]:
level_1_embs.shape

(292, 768)

In [112]:
level_0_embs.shape

(19, 768)

In [113]:
def _print_top_matches(level_label, query_emb, candidate_embs, candidate_names, top_k):
    """Print the ``top_k`` candidates most cosine-similar to ``query_emb``.

    Parameters
    ----------
    level_label : str
        Heading printed before the matches (e.g. ``"L0"``).
    query_emb : numpy.ndarray
        One embedding vector; it is reshaped to a single row before scoring.
    candidate_embs : numpy.ndarray
        Embeddings to compare against, one row per candidate.
    candidate_names : sequence
        Names aligned with the rows of ``candidate_embs``.
    top_k : int
        Maximum number of matches to print; fewer are printed when there are
        fewer candidates.
    """
    # One similarity computation serves both the ranking and the printed
    # scores (previously it was recomputed for argsort and for sort).
    similarities = cosine_similarity(query_emb.reshape(1, -1), candidate_embs)[0]
    # argsort is ascending, so reverse it and keep the best `top_k` indices.
    best_first = np.argsort(similarities)[::-1][:top_k]
    print(level_label)
    for idx in best_first:
        print(f"_______ {candidate_names[idx]}: {similarities[idx]}")


# For every orphan, report its three closest level-0 topics and its five
# closest level-1 topics.
for i in range(orphan_embs.shape[0]):
    print(f"{level_2_orphans[i]}")
    _print_top_matches("L0", orphan_embs[i], level_0_embs, level_0_list, top_k=3)
    _print_top_matches("L1", orphan_embs[i], level_1_embs, level_1_list, top_k=5)
    print("\n")

two temperature
L0
_______ philosophy: 0.7631981372833252
_______ psychology: 0.7529655694961548
_______ economics: 0.752226710319519
L1
_______ risk analysis: 0.8285120725631714
_______ financial system: 0.8279814720153809
_______ engineering ethics: 0.8186806440353394
_______ positive economics: 0.816841185092926
_______ demographic economics: 0.8166027069091797


business activities
L0
_______ business: 0.7770204544067383
_______ medicine: 0.7626142501831055
_______ art: 0.7533687353134155
L1
_______ economic system: 0.8523975610733032
_______ financial system: 0.8357780575752258
_______ industrial organization: 0.8275732398033142
_______ multimedia: 0.8180881142616272
_______ communication: 0.809441089630127


bioactive molecules
L0
_______ environmental science: 0.45146480202674866
_______ materials science: 0.4260980188846588
_______ medicine: 0.39979973435401917
L1
_______ gastroenterology: 0.7339313626289368
_______ endocrinology: 0.706972599029541
_______ chromatography: 0.648

### Word2Vec

In [114]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
# NOTE(review): this rebinds `model`, the name earlier cells use for the BERT
# encoder; re-running those cells after this one would hit the Word2Vec object.
# Builds a Word2Vec model from `common_texts` (imported from gensim.test.utils)
# and saves it to "word2vec.model".
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

### Looking at Most Common Parents

#### Loading orphans

In [27]:
orphan_and_wiki.head()

Unnamed: 0,field_of_study_id,normalized_name,level
0,2908647359,population,2
1,3017944768,poison control,2
2,2910001868,european union,2
3,3013748606,human immunodeficiency virus,2
4,3018023364,significant difference,2


In [33]:
orphan_and_wiki.groupby('normalized_name').count().sort_values('level', ascending=False).head()

Unnamed: 0_level_0,field_of_study_id,level
normalized_name,Unnamed: 1_level_1,Unnamed: 2_level_1
3d printed,1,1
3d simulation,1,1
physical modelling,1,1
physical development,1,1
physical activity,1,1


In [2]:
# Candidate-parent tables read directly from S3. The displayed samples show a
# topic-name column (`level_two` / `level_three`) and a `topic_list` column.
# NOTE(review): the bucket paths are hard-coded; reading them presumably needs
# S3 access configured in the environment - confirm.
level_two_parents = pd.read_parquet("s3://mag-model-data/raw_mag_data/level_2_parents/part-00000-tid-4621729047142736932-7d217523-d826-48ff-b1c8-52d0808d0ce8-4655-1-c000.snappy.parquet")
level_three_parents = pd.read_parquet("s3://mag-model-data/raw_mag_data/level_3_parents/part-00000-tid-5145909488161829896-2d50d1d1-2c81-4d07-9de0-65d636230448-4849-1-c000.snappy.parquet")

In [30]:
level_two_parents.shape

(158063, 2)

In [4]:
level_three_parents.sample(10)

Unnamed: 0,level_three,topic_list
12068,entire ulnar nerve,"[functional anatomy, functional organization, ..."
123680,phenol mouthwash,"[test agent, mucosal lesions, random assignmen..."
178101,stromatium,"[anobiidae, longhorn beetle, dna barcoding, in..."
193357,shrimp dietary,"[annelid, dietary supplement, gonad, shrimp, f..."
10223,vagina neoplasm,"[uterus endometriosis, vaginal scar, radioacti..."
217368,aspergillus raperi,"[sarocladium strictum, urokinase, penicillium,..."
267141,gabarapl2 gene,"[developmental biology, gene]"
123404,oocassida pudibunda,"[tortoise, natural enemies, woody plant]"
275164,vehicle information and communication system,"[turbo dispatch, internavi, map database manag..."
62183,fumaropimaric acid,"[curtius rearrangement, oxalyl chloride, host ..."


#### Creating spreadsheet

In [39]:
# Level-2 orphans joined to their candidate-parent lists, one row per orphan.
for_spread = (
    level_two_parents
    .merge(
        orphan_and_wiki.loc[orphan_and_wiki['level'] == 2, ['normalized_name']]
        .rename(columns={'normalized_name': 'level_two'}),
        on='level_two',
        how='inner',
    )
    .drop_duplicates(subset=['level_two'])
    .reset_index(drop=True)
)

for_spread.head(5)

Unnamed: 0,level_two,topic_list
0,big game,"[fishery, ethnology, archaeology, forestry, ad..."
1,bone structure,"[biomedical engineering, orthodontics, dentist..."
2,cannabis sativa,"[traditional medicine, biotechnology, botany, ..."
3,career path,"[management, aeronautics, medical education, p..."
4,coast guard,"[aeronautics, forensic engineering, oceanograp..."


In [51]:
# Headers for the candidate-parent slots; their count fixes how many topics
# are copied per orphan.
topic_columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# One spreadsheet row per level-2 orphan: name, a 'PASS' marker, then up to
# len(topic_columns) entries from its topic list.
final_list = []
for orphan_name, topic_array in zip(for_spread.iloc[:, 0], for_spread.iloc[:, 1]):
    topics = topic_array.tolist()[:len(topic_columns)]
    # Pad short topic lists explicitly with empty strings (replaces the bare
    # `except:` that was used to detect running off the end of the list).
    padding = [""] * (len(topic_columns) - len(topics))
    final_list.append([orphan_name, 'PASS', *topics, *padding])
for_spread_2 = pd.DataFrame(final_list, columns=['level_two', 'unknown', *topic_columns])

In [55]:
for_spread_2.to_csv("level_2_orphans_sheet.csv")

In [60]:
for_spread_2.shape

(1489, 12)

In [56]:
# Level-3 orphans joined to their candidate-parent lists, one row per orphan.
for_spread = (
    level_three_parents
    .merge(
        orphan_and_wiki.loc[orphan_and_wiki['level'] == 3, ['normalized_name']]
        .rename(columns={'normalized_name': 'level_three'}),
        on='level_three',
        how='inner',
    )
    .drop_duplicates(subset=['level_three'])
    .reset_index(drop=True)
)

for_spread.head(5)

Unnamed: 0,level_three,topic_list
0,doctoral dissertation,"[seseli montanum, florestina, de protocol, ams..."
1,octopus,"[seven arm octopus, octopus oliveri, alloposid..."
2,popular science,"[maclaurin s inequality, thlaspi alpestre, mag..."
3,social impact,"[incipient senile cataract, medical tattoo, co..."
4,community center,"[maxillary micrognathia, party wall, corbiere,..."


In [57]:
# Headers for the candidate-parent slots; their count fixes how many topics
# are copied per orphan.
topic_columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# One spreadsheet row per level-3 orphan: name, a 'PASS' marker, then up to
# len(topic_columns) entries from its topic list.
final_list = []
for orphan_name, topic_array in zip(for_spread.iloc[:, 0], for_spread.iloc[:, 1]):
    topics = topic_array.tolist()[:len(topic_columns)]
    # Pad short topic lists explicitly with empty strings (replaces the bare
    # `except:` that was used to detect running off the end of the list).
    padding = [""] * (len(topic_columns) - len(topics))
    final_list.append([orphan_name, 'PASS', *topics, *padding])
for_spread_3 = pd.DataFrame(final_list, columns=['level_three', 'unknown', *topic_columns])

In [58]:
for_spread_3.to_csv("level_3_orphans_sheet.csv")

In [59]:
for_spread_3.shape

(175, 12)

In [65]:
for_spread_3.sample(5)

Unnamed: 0,level_three,unknown,a,b,c,d,e,f,g,h,i,j
1,octopus,PASS,seven arm octopus,octopus oliveri,alloposidae,blanket octopus,amphitretus,octopus sp,tremoctopodidae,sepia robsoni,greater blue ringed octopus,abdopus
134,environmental effect,PASS,oncaea mediterranea,multiple margins,site engineer,recumbent body position,process classification,betula maximowicziana,age friendly city,water surface temperature,ainsliaea acerifolia,welsh mountain sheep
55,border crossing,PASS,aimless movement,meat sandwich,customs declaration,port of entry,freight terminal,travel documents,cross border trade,customs officer,input modeling,irregular migration
97,post office,PASS,aerophilately,post world war ii economic expansion,stockbridge indians,gilman hall,postmasters,herbaceous border,montana montana,recurring deposit,postal history,post office box
170,ascus,PASS,wynnea americana,botryosphaeria quercuum,sarcosoma globosum,aiteng,eleutherascus peruvianus,melzer s reagent,ascodesmis sphaerospora,lophodermium nitens,trichobolus,leptosphaerulina australis
