In [1]:
import pickle
import pandas as pd
import numpy as np
# import tensorflow as tf
# from transformers import BertTokenizer, TFBertModel
# from sklearn.metrics.pairwise import cosine_similarity

### Methods to try for orphan adoption
1. Using BERT embedding to find most similar L1 and L2 topics
2. Using occurrences to determine parents
3. Using a model to predict the parent

In [3]:
# Load the tokenizer and the TF encoder from the same 'bert-base-uncased' checkpoint.
# NOTE(review): the `transformers` import in the first cell is commented out, so
# BertTokenizer / TFBertModel must be imported before this cell can run.
tokenizer, model = (
    loader.from_pretrained('bert-base-uncased')
    for loader in (BertTokenizer, TFBertModel)
)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [3]:
# Deserialize the topic vocabulary and display how many entries it holds.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("topics_vocab.pkl", "rb") as vocab_file:
    target_vocab = pickle.load(vocab_file)

len(target_vocab)

65026

In [4]:
# One-column frame holding the vocabulary keys: the keys become the index,
# reset_index() moves them into an 'index' column, which is then renamed.
model_tags_df = (
    pd.DataFrame.from_dict(target_vocab, orient='index')
    .reset_index()[['index']]
    .rename(columns={'index': 'normalized_name'})
)

In [5]:
model_tags_df.shape

(65026, 1)

In [6]:
model_tags_df.head(5)

Unnamed: 0,normalized_name
0,art
1,earth science
2,social science
3,clustering coefficient
4,gratification


In [7]:
def fix_empty_list(child):
    """Return `child` as a plain list, or `[]` when it cannot be converted.

    Parameters
    ----------
    child : object
        Typically an array-like exposing ``.tolist()`` or a missing-value
        marker (e.g. NaN / None) that has no such method.

    Returns
    -------
    The result of ``child.tolist()`` when available, a shallow list copy when
    `child` is already a list/tuple, and ``[]`` otherwise.
    """
    # An existing list/tuple has no .tolist(); keep its contents instead of
    # silently replacing them with an empty list.
    if isinstance(child, (list, tuple)):
        return list(child)
    try:
        return child.tolist()
    except AttributeError:
        # No .tolist() (NaN, None, ...) -> treat as "no children".
        return []

In [8]:
def create_new_name(orig_name, children, max_children=25):
    """Build a display name that optionally lists a topic's children.

    Parameters
    ----------
    orig_name : object
        Base name; it is formatted into the result with an f-string.
    children : sequence of str
        Child names to append, joined with ", ".
    max_children : int, default 25
        Exclusive upper bound on the number of children; when `children` is
        empty or has `max_children` or more entries, only the base name is used.

    Returns
    -------
    str
        ``"<orig_name> and also <child1>, <child2>, ..."`` or just
        ``"<orig_name>"``.
    """
    # Chained comparison with short-circuit semantics instead of bitwise `&`.
    if 0 < len(children) < max_children:
        return f"{orig_name} and also {', '.join(children)}"
    return f"{orig_name}"

In [9]:
# Field-of-study table: drop rows with missing values, then store `level` as int.
tags_df = (
    pd.read_parquet('fields_of_study_ids.parquet')
    .dropna()
    .astype({'level': 'int'})
)
# Keep only the orphans whose normalized_name also appears in model_tags_df.
orphan_df = pd.read_parquet('mag_imitator_orphans.parquet').merge(
    model_tags_df,
    how='inner',
    on='normalized_name',
)
# children = pd.read_parquet('children.parquet')
# orphan_df = orphan_df.merge(children, how='left', left_on = 'normalized_name', right_on='parent').copy()
# orphan_df['child'] = orphan_df['child'].apply(fix_empty_list)
# orphan_df['new_name'] = orphan_df.apply(lambda x: create_new_name(x.normalized_name, 
#                                                                   x.child), axis=1)

In [10]:
orphan_df.shape

(1664, 3)

In [11]:
orphan_df[orphan_df['normalized_name']=='energy source']

Unnamed: 0,field_of_study_id,normalized_name,level
6,2982719155,energy source,3


In [12]:
orphan_df.groupby('level').count()

Unnamed: 0_level_0,field_of_study_id,normalized_name
level,Unnamed: 1_level_1,Unnamed: 2_level_1
2,1489,1489
3,175,175


In [13]:
orphan_df['level'].value_counts()

2    1489
3     175
Name: level, dtype: int64

In [268]:
openalex_concepts = pd.read_csv("concepts_with_wiki.csv")[['normalized_name']]
openalex_concepts.head(5)

Unnamed: 0,normalized_name
0,political science
1,philosophy
2,economics
3,business
4,psychology


In [269]:
openalex_concepts.shape

(65073, 1)

In [16]:
orphan_df.merge(openalex_concepts, how='inner', on='normalized_name').shape

(1664, 3)

In [17]:
orphan_df.merge(openalex_concepts, how='inner', on='normalized_name')['level'].value_counts()

2    1489
3     175
Name: level, dtype: int64

In [18]:
orphan_and_wiki = orphan_df.merge(openalex_concepts, how='inner', on='normalized_name')

## Orphans

In [103]:

# last hidden state
# orphan_embs = []
# for i in level_2_orphans:
#     toks = tokenizer(i, return_tensors="tf")
#     emb = model(toks).last_hidden_state.numpy().mean(axis=-2)
#     orphan_embs.append(emb)
# orphan_embs = np.array(orphan_embs)[:,0,:]

# Sum of the last four hidden states, averaged over axis -2
# (presumably the token axis -- TODO confirm); the trailing [:, 0, :] drops the
# singleton axis left by encoding one string at a time.
orphan_embs = np.array([
    tf.add_n(
        model(
            tokenizer(name, return_tensors="tf"),
            output_hidden_states=True,
        ).hidden_states[-4:]
    ).numpy().mean(axis=-2)
    for name in level_2_orphans
])[:, 0, :]

# embedding layer

## Level 1

In [104]:
# last hidden state
# level_1_embs = []
# for i in level_1_list:
#     toks = tokenizer(i, return_tensors="tf")
#     emb = model(toks).last_hidden_state.numpy().mean(axis=-2)
#     level_1_embs.append(emb)
# level_1_embs = np.array(level_1_embs)[:,0,:]

# Sum of the last four hidden states, averaged over axis -2
# (presumably the token axis -- TODO confirm); the trailing [:, 0, :] drops the
# singleton axis left by encoding one string at a time.
level_1_embs = np.array([
    tf.add_n(
        model(
            tokenizer(name, return_tensors="tf"),
            output_hidden_states=True,
        ).hidden_states[-4:]
    ).numpy().mean(axis=-2)
    for name in level_1_list
])[:, 0, :]

# embedding layer

## Level 0

In [111]:
# Sum of the last four hidden states, averaged over axis -2
# (presumably the token axis -- TODO confirm); the trailing [:, 0, :] drops the
# singleton axis left by encoding one string at a time.
level_0_embs = np.array([
    tf.add_n(
        model(
            tokenizer(name, return_tensors="tf"),
            output_hidden_states=True,
        ).hidden_states[-4:]
    ).numpy().mean(axis=-2)
    for name in level_0_list
])[:, 0, :]

# embedding layer

In [105]:
orphan_embs.shape

(50, 768)

In [106]:
level_1_embs.shape

(292, 768)

In [112]:
level_0_embs.shape

(19, 768)

In [113]:
def _print_top_matches(query_emb, candidate_embs, candidate_names, top_k):
    """Print the `top_k` candidates most cosine-similar to `query_emb`, best first.

    The similarity row is computed once and the same ordering is used for both
    the names and the scores. If there are fewer than `top_k` candidates, all
    of them are printed.
    """
    scores = cosine_similarity(query_emb.reshape(1, -1), candidate_embs)[0]
    # argsort is ascending: take the tail and reverse it to get best-first.
    best_first = np.argsort(scores)[-top_k:][::-1]
    for idx in best_first:
        print(f"_______ {candidate_names[idx]}: {scores[idx]}")


# For every orphan, list its closest level-0 (top 3) and level-1 (top 5) topics.
for i in range(orphan_embs.shape[0]):
    print(f"{level_2_orphans[i]}")
    print("L0")
    _print_top_matches(orphan_embs[i], level_0_embs, level_0_list, top_k=3)

    print("L1")
    _print_top_matches(orphan_embs[i], level_1_embs, level_1_list, top_k=5)
    print("\n")

two temperature
L0
_______ philosophy: 0.7631981372833252
_______ psychology: 0.7529655694961548
_______ economics: 0.752226710319519
L1
_______ risk analysis: 0.8285120725631714
_______ financial system: 0.8279814720153809
_______ engineering ethics: 0.8186806440353394
_______ positive economics: 0.816841185092926
_______ demographic economics: 0.8166027069091797


business activities
L0
_______ business: 0.7770204544067383
_______ medicine: 0.7626142501831055
_______ art: 0.7533687353134155
L1
_______ economic system: 0.8523975610733032
_______ financial system: 0.8357780575752258
_______ industrial organization: 0.8275732398033142
_______ multimedia: 0.8180881142616272
_______ communication: 0.809441089630127


bioactive molecules
L0
_______ environmental science: 0.45146480202674866
_______ materials science: 0.4260980188846588
_______ medicine: 0.39979973435401917
L1
_______ gastroenterology: 0.7339313626289368
_______ endocrinology: 0.706972599029541
_______ chromatography: 0.648

### Word2Vec

In [114]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
# Fit Word2Vec on gensim's `common_texts` test corpus and save it to disk.
# NOTE(review): this rebinds `model`, which an earlier cell sets to the
# TFBertModel; re-running the BERT embedding cells after this one would pick
# up the Word2Vec object instead.
model = Word2Vec(
    sentences=common_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("word2vec.model")

### Looking at Most Common Parents

#### Loading orphans

In [27]:
orphan_and_wiki.head()

Unnamed: 0,field_of_study_id,normalized_name,level
0,2908647359,population,2
1,3017944768,poison control,2
2,2910001868,european union,2
3,3013748606,human immunodeficiency virus,2
4,3018023364,significant difference,2


In [33]:
orphan_and_wiki.groupby('normalized_name').count().sort_values('level', ascending=False).head()

Unnamed: 0_level_0,field_of_study_id,level
normalized_name,Unnamed: 1_level_1,Unnamed: 2_level_1
3d printed,1,1
3d simulation,1,1
physical modelling,1,1
physical development,1,1
physical activity,1,1


In [48]:
level_two_parents = pd.read_parquet("s3://mag-model-data/raw_mag_data/level_2_parents/part-00000-tid-4621729047142736932-7d217523-d826-48ff-b1c8-52d0808d0ce8-4655-1-c000.snappy.parquet")
level_three_parents = pd.read_parquet("s3://mag-model-data/raw_mag_data/level_3_parents/part-00000-tid-5973691913633613382-bc0cde1e-b2ec-4cc4-822d-2c96ed469532-33-1-c000.snappy.parquet")

In [105]:
level_two_parents[level_two_parents['level_two']=='octopus']

Unnamed: 0,level_two,topic_list
5028,octopus,"[fishery, zoology, optometry, anatomy, ophthal..."


In [39]:
# level_three_parents = pd.read_parquet("s3://mag-model-data/raw_mag_data/level_3_new_parents/part-00000-tid-7708135600281052475-bbc98008-7c3a-4e19-9d5b-71d02904d5dd-32-1-c000.snappy.parquet")

In [30]:
level_two_parents.shape

(158063, 2)

In [49]:
level_three_parents.sample(10)

Unnamed: 0,level_three,topic_list
295896,cybaeus vignai,"[genus, spider, taxonomy]"
298814,pichia siamensis,"[yeast, budding, fungi imperfecti, taxonomy]"
180190,lard factor,"[vitamin, cod liver oil, identification, mercury]"
26508,plasmacytoid dendritic cell activation,"[immune system, interferon, cytokine, receptor..."
198382,needle deflection,"[needle insertion, deflection, bevel, percutan..."
239493,temnohaswellia,"[crayfish, checklist, identification, genus, b..."
50625,metatheatre,"[drama, performance art, politics, comedy, tra..."
195711,chlorheridine diacetate,"[sperm, in vitro, antibacterial effect]"
188306,pyrofil light bond,"[composite number, irradiation, dentin, tube, ..."
19008,gonodonta nutrix,"[entomology, orange, insect, larva]"


In [50]:
level_three_parents.shape

(313261, 2)

In [62]:
# Distinct topic names found in any level-three `topic_list` (set order is arbitrary).
level_2s = list({
    topic
    for topic_list in level_three_parents['topic_list'].tolist()
    for topic in topic_list
})

['human health']

In [73]:
all_cons = openalex_concepts['normalized_name'].tolist()

In [75]:
all_cons[:5]

['political science', 'philosophy', 'economics', 'business', 'psychology']

In [192]:
# Prefix search for one term in both name lists; str() guards against
# non-string entries in all_cons.
search_term = 'time preference'
print(list(filter(lambda name: name.startswith(search_term), level_2s)))
print(list(filter(lambda name: str(name).startswith(search_term), all_cons)))

['time preference']
['time preference']


In [193]:
level_two_parents[level_two_parents['level_two'].str.startswith(search_term)]

Unnamed: 0,level_two,topic_list
38966,time preference,"[microeconomics, mathematical economics, keyne..."


#### Creating spreadsheet

In [39]:
# Candidate-parent lists for the level-2 orphans: inner-join the level-two
# parent table against the orphan names, keeping one row per orphan.
for_spread = (
    level_two_parents
    .merge(
        orphan_and_wiki.loc[orphan_and_wiki['level'] == 2, ['normalized_name']]
        .rename(columns={'normalized_name': 'level_two'}),
        how='inner',
        on='level_two',
    )
    .drop_duplicates(subset=['level_two'])
    .reset_index(drop=True)
)

for_spread.head(5)

Unnamed: 0,level_two,topic_list
0,big game,"[fishery, ethnology, archaeology, forestry, ad..."
1,bone structure,"[biomedical engineering, orthodontics, dentist..."
2,cannabis sativa,"[traditional medicine, biotechnology, botany, ..."
3,career path,"[management, aeronautics, medical education, p..."
4,coast guard,"[aeronautics, forensic engineering, oceanograp..."


In [51]:
# Build one spreadsheet row per level-2 orphan: its name, a 'PASS' placeholder,
# then the first ten candidate parents, padded with "" when fewer are available.
parent_cols = list('abcdefghij')
final_list = []
for j in range(for_spread.shape[0]):
    topics = for_spread.iloc[j, 1].tolist()[:len(parent_cols)]
    # Pad explicitly rather than catching every exception from out-of-range indexing.
    padding = [""] * (len(parent_cols) - len(topics))
    final_list.append([for_spread.iloc[j, 0], 'PASS', *topics, *padding])
for_spread_2 = pd.DataFrame(final_list, columns=['level_two', 'unknown', *parent_cols])

In [55]:
# Write without the DataFrame index: the sheet is later read back with a plain
# pd.read_csv and sliced by column position (level_two, unknown, a..j), so an
# extra leading index column would shift every candidate-parent column.
for_spread_2.to_csv("level_2_orphans_sheet.csv", index=False)

In [198]:
for_spread_2.shape

NameError: name 'for_spread_2' is not defined

In [51]:
# Candidate-parent lists for the level-3 orphans. This is a left join, so an
# orphan with no entry in level_three_parents keeps a null topic_list.
for_spread = (
    orphan_and_wiki.loc[orphan_and_wiki['level'] == 3, ['normalized_name']]
    .rename(columns={'normalized_name': 'level_three'})
    .merge(level_three_parents, how='left', on='level_three')
    .drop_duplicates(subset=['level_three'])
    .reset_index(drop=True)
)

for_spread.head(5)

Unnamed: 0,level_three,topic_list
0,energy source,"[fossil fuel, renewable energy, coal, petroleu..."
1,economic shortage,"[china, work, government, population, quality,..."
2,human health,
3,algebra,
4,constructive,"[process, context, work, politics, order, qual..."


In [52]:
null_L3_topics = for_spread[for_spread['topic_list'].isnull()].copy()

In [53]:
for_spread = for_spread[~for_spread['topic_list'].isnull()].copy()

In [54]:
# Build one spreadsheet row per level-3 orphan: its name, a 'PASS' placeholder,
# then the first ten candidate parents, padded with "" when fewer are available.
parent_cols = list('abcdefghij')
final_list = []
for j in range(for_spread.shape[0]):
    raw_topics = for_spread.iloc[j, 1]
    try:
        topics = list(raw_topics.tolist())[:len(parent_cols)]
    except (AttributeError, TypeError):
        # No usable topic_list (e.g. a NaN cell) -> no candidate parents.
        topics = []
    # Pad explicitly rather than catching every exception from out-of-range indexing.
    padding = [""] * (len(parent_cols) - len(topics))
    final_list.append([for_spread.iloc[j, 0], 'PASS', *topics, *padding])
for_spread_3 = pd.DataFrame(final_list, columns=['level_three', 'unknown', *parent_cols])

In [55]:
for_spread_3.to_csv("level_3_orphans_sheet.csv")

In [56]:
for_spread_3.shape

(144, 12)

In [59]:
for_spread_3[for_spread_3['level_three']=='numerical models']

Unnamed: 0,level_three,unknown,a,b,c,d,e,f,g,h,i,j
17,numerical models,PASS,finite element method,computer simulation,numerical analysis,numerical modeling,mathematical model,flow,nonlinear system,process,field,work


In [245]:
for_spread_3

Unnamed: 0.1,level_three,Unnamed: 0,unknown,a,b,c,d,e,f,g,h,i,j
0,energy source,0,PASS,fossil fuel,renewable energy,coal,petroleum,natural gas,biomass,energy consumption,solar energy,combustion,electricity
1,economic shortage,1,PASS,china,work,government,population,quality,health care,order,workforce,production,agriculture
2,constructive,2,PASS,process,context,work,politics,order,quality,set,value,china,perspective
3,qualitative analysis,3,PASS,analyse qualitative,quantitative analysis,qualitative research,context,process,perception,work,population,quality,data collection
4,human being,4,PASS,context,meaning,value,order,subject,perspective,process,politics,humanity,natural
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,modiolus,139,PASS,cochlea,cochlear implant,inner ear,electrode,temporal bone,implant,hearing loss,guinea pig,cadaver,electrophysiology
140,chalazion,140,PASS,eyelid,lesion,curettage,biopsy,blepharitis,triamcinolone acetonide,conjunctiva,differential diagnosis,eye disease,carcinoma
141,phallus,141,PASS,genus,hemiptera,morphology,heteroptera,key,male genitalia,china,appendage,dorsum,lepidoptera genitalia
142,b tree,142,PASS,tree,search engine indexing,data structure,index,node,structure,hash function,set,flash memory,key


### Pulling the data back from the spreadsheet to get the parents

In [252]:
# If the sheet was saved with the DataFrame index, read_csv surfaces it as an
# "Unnamed: 0" column. Drop it when present so the positional column slicing
# used later sees level_two, unknown, a..j; a file without it is left unchanged.
for_spread_2 = pd.read_csv("level_2_orphans_sheet.csv").drop(
    columns="Unnamed: 0", errors="ignore"
)
for_spread_2.shape

(1500, 12)

In [253]:
level_2_comp = pd.read_csv("level_2_orphans_sheet_complete.csv")
level_2_comp.shape

(1489, 13)

In [254]:
for_spread_3 = pd.read_csv("level_3_orphans_sheet.csv").drop("Unnamed: 0", axis=1)
for_spread_3.shape

(144, 12)

In [255]:
level_3_comp = pd.read_csv("level_3_orphans_L2_sheet_complete.csv")
level_3_comp.shape

(175, 14)

In [256]:
level_2_comp = level_2_comp.merge(for_spread_2[['level_two']], how='inner', on=['level_two']).copy()

In [257]:
for_spread_2 = level_2_comp[['level_two']].merge(for_spread_2, how='inner', on=['level_two']).copy()

In [258]:
level_3_comp = level_3_comp.merge(for_spread_3[['level_three']], how='inner', on=['level_three']).copy()

In [259]:
for_spread_3 = level_3_comp[['level_three']].merge(for_spread_3, how='inner', on=['level_three']).copy()

In [260]:
# Row i of level_2_comp (the completed sheet) is paired with row i of
# for_spread_2 (the candidate parents) purely by position, so verify that both
# frames list the same orphans in the same order before trusting the masks.
assert level_2_comp.iloc[:, 0].reset_index(drop=True).equals(
    for_spread_2.iloc[:, 0].reset_index(drop=True)
), "level_2_comp and for_spread_2 must list the same orphans in the same order"

all_L2_parents = []
L2_orphan = []
for i in range(level_2_comp.shape[0]):
    orphan_name = str(level_2_comp.iloc[i, 0])

    # Cells equal to 'x' in the completed sheet select the matching candidate columns.
    true_mask = np.array(level_2_comp.iloc[i, 2:-1]) == 'x'
    parent_ops = np.array(for_spread_2.iloc[i, 2:])
    final_parents = parent_ops[true_mask].tolist()

    # The last column, when filled in, contributes one additional parent.
    if pd.notnull(level_2_comp.iloc[i, -1]):
        final_parents.append(str(level_2_comp.iloc[i, -1]))

    # Surface orphans that ended up with no parent at all.
    if len(final_parents) == 0:
        print(orphan_name)
    all_L2_parents.append(final_parents)
    L2_orphan.append(orphan_name)

In [263]:
# Row i of level_3_comp (the completed sheet) is paired with row i of
# for_spread_3 (the candidate parents) purely by position, so verify that both
# frames list the same orphans in the same order before trusting the masks.
assert level_3_comp.iloc[:, 0].reset_index(drop=True).equals(
    for_spread_3.iloc[:, 0].reset_index(drop=True)
), "level_3_comp and for_spread_3 must list the same orphans in the same order"

L3_orphan = []
all_L3_parents = []
switch_to_L2 = []
for i in range(level_3_comp.shape[0]):
    orphan_name = str(level_3_comp.iloc[i, 0])

    # Rows whose second-to-last column reads 'L1' are set aside in switch_to_L2
    # and get no parents assigned here.
    if str(level_3_comp.iloc[i, -2]) == 'L1':
        print(f"Switch - {orphan_name}")
        switch_to_L2.append(orphan_name)
        continue

    # Cells equal to 'x' in the completed sheet select the matching candidate columns.
    true_mask = np.array(level_3_comp.iloc[i, 2:-2]) == 'x'
    parent_ops = np.array(for_spread_3.iloc[i, 2:])
    final_parents = parent_ops[true_mask].tolist()

    # The last column, when filled in, contributes one additional parent.
    if pd.notnull(level_3_comp.iloc[i, -1]):
        final_parents.append(str(level_3_comp.iloc[i, -1]))

    # Surface orphans that ended up with no parent at all.
    if len(final_parents) == 0:
        print(orphan_name)
    all_L3_parents.append(final_parents)
    L3_orphan.append(orphan_name)

In [264]:
final_L2_parents = pd.DataFrame(zip(L2_orphan, all_L2_parents), columns=['normalized_name','parents'])

In [265]:
final_L3_parents = pd.DataFrame(zip(L3_orphan, all_L3_parents), columns=['normalized_name','parents'])

In [266]:
final_L2_parents

Unnamed: 0,normalized_name,parents
0,big game,"[fishery, environmental ethics, environmental ..."
1,bone structure,"[biomedical engineering, anatomy, physiology]"
2,cannabis sativa,"[botany, horticulture]"
3,career path,"[management, business administration, engineer..."
4,coast guard,"[marine engineering, environmental protection]"
...,...,...
1484,summer camp,"[ethnology, developmental psychology]"
1485,tobacco product,[environmental health]
1486,visual media,"[visual arts, multimedia]"
1487,federal state,"[public administration, economic policy]"


In [267]:
final_L3_parents

Unnamed: 0,normalized_name,parents
0,energy source,"[fossil fuel, renewable energy, coal, petroleu..."
1,economic shortage,"[work, government]"
2,constructive,[process]
3,qualitative analysis,[qualitative research]
4,human being,[humanity]
...,...,...
139,modiolus,[cochlea]
140,chalazion,[eyelid]
141,phallus,[male genitalia]
142,b tree,"[tree, search engine indexing, data structure]"


### Check against vocab to make sure parents exist

In [270]:
concepts_check = openalex_concepts['normalized_name'].tolist()

In [273]:
# Report every level-2 orphan that has no parent present in the concept
# vocabulary. The membership test against `concepts_check` is what makes this
# a vocabulary check; without it only empty parent lists would be reported.
concepts_lookup = set(concepts_check)  # set membership instead of scanning the list
for i in range(final_L2_parents.shape[0]):
    new_list = [x for x in final_L2_parents.iloc[i, 1] if x in concepts_lookup]
    if len(new_list) == 0:
        print(final_L2_parents.iloc[i, 0])
        print(final_L2_parents.iloc[i, 1])
        print("\n")

In [274]:
# Report every level-3 orphan that has no parent present in the concept
# vocabulary. The membership test against `concepts_check` is what makes this
# a vocabulary check; without it only empty parent lists would be reported.
concepts_lookup = set(concepts_check)  # set membership instead of scanning the list
for i in range(final_L3_parents.shape[0]):
    new_list = [x for x in final_L3_parents.iloc[i, 1] if x in concepts_lookup]
    if len(new_list) == 0:
        print(final_L3_parents.iloc[i, 0])
        print(final_L3_parents.iloc[i, 1])
        print("\n")

### Get into proper format for Heather

In [284]:
final_L2_parents = final_L2_parents.explode('parents').copy()

In [285]:
final_L3_parents = final_L3_parents.explode('parents').copy()

In [279]:
field_of_study_ids = pd.read_parquet("fields_of_study_ids.parquet")
field_of_study_ids.head(1)

Unnamed: 0,normalized_name,field_of_study_id,level
0,u s standard atmosphere,16287357,3.0


In [281]:
names = field_of_study_ids['normalized_name'].tolist()
ids = field_of_study_ids['field_of_study_id'].tolist()

In [282]:
# normalized_name -> field_of_study_id lookup; for a repeated name the last id wins.
id_mapping = dict(zip(names, ids))

In [286]:
# Replace names with their field-of-study ids in both columns; a name missing
# from id_mapping raises KeyError.
for col in ('normalized_name', 'parents'):
    final_L2_parents[col] = [id_mapping[name] for name in final_L2_parents[col]]

In [287]:
# Replace names with their field-of-study ids in both columns; a name missing
# from id_mapping raises KeyError.
for col in ('normalized_name', 'parents'):
    final_L3_parents[col] = [id_mapping[name] for name in final_L3_parents[col]]

In [288]:
final_L2_parents.columns = ['orphan_id','parent_id']
final_L3_parents.columns = ['orphan_id','parent_id']

In [289]:
all_orphans = pd.concat([final_L2_parents, final_L3_parents], axis=0)

In [290]:
all_orphans.shape

(3124, 2)

In [292]:
all_orphans.to_csv("orphan_parents.csv", index=False)