In [1]:
from scipy import sparse
import numpy as np
import literature

# Vertex matrix of the literature hypergraph (sparse, loaded from disk).
R = sparse.load_npz("data/thrm_vertex_matrix.npz")

# Material names, one per line. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("data/thrm_mats.txt", "r") as mats_file:
    mats = np.array(mats_file.read().splitlines())

# Property keyword(s) of interest.
props = ["thermoelectric"]



In [2]:
# Keep only the rows of R whose entry in yrs falls within 1996-2000 (inclusive).
# NOTE(review): R is overwritten here, so this cell presumably cannot be re-run
# without first re-running the cell that loads R -- confirm.
yrs = np.loadtxt('data/thrm_years.txt')
in_window = (yrs >= 1996) & (yrs <= 2000)
R = R[in_window, :]

In [3]:
# Build the hypergraph object from the (year-filtered) vertex matrix R,
# the material names and the property names.
h = literature.hypergraph(R, mats, props)

In [4]:
# Parameters passed to h.random_walk in the following cells.
length = 20                 # length of each random walk
size = 1                    # number of walks per call (presumably; TODO confirm against literature.hypergraph.random_walk)
prop_ind = R.shape[1]-1     # last column index of R: the property node, used as the starting node

In [5]:
h.random_walk(length, size, start_inds=prop_ind, rand_seed=0)    # uniform sampling

# Example output recorded from a run of this call
# (the first array is the sequence of selected nodes; the second array is the selected papers along the walk):
# ---------------------
# (['thermoelectric a_1244326 a_1084770 a_1085357 CoCrFeMnNi a_281555 a_1076970 CSi a_10764 Al2O3
#    K2O a_1672448 CaF2 a_460834 BaF2 a_638548 a_1287239 a_955446 a_955445 a_955447'],
#  ['962469 1191497 746280 1191497 1421491 734403 1115449 132804 46832 1194889 1400463 1400463
#    232314 232314 894012 1035899 1035899 615755 1075096'])
# NOTE(review): this recorded example differs from the output currently displayed
# below this cell -- presumably it was produced on a different subset of the data; confirm.

(['thermoelectric CeGeNi a_886058 a_885358 Al10Ce10NiPd9 a_886058 a_885610 thermoelectric a_425231 a_524526 thermoelectric a_1710042 thermoelectric a_815535 KO5PTi K2O Na2O GeO2 a_99281 GeO2'],
 ['50739 50739 50737 50729 50729 52319 50739 9151 9151 9151 83553 83553 18658 18633 49949 19121 36848 646 646'])

In [6]:
# Generate 100 random walks from the property node, one per random seed
# (0..99), and write them to rw_seqs.txt, one walk per line.
with open("rw_seqs.txt", "w") as file:
    for i in range(100):
        # [0][0] selects the node-sequence string of the single walk returned per call.
        # NOTE(review): this comment previously said alpha=1 while the code passes alpha=2;
        # the comment now matches the code -- confirm alpha=2 is the intended value.
        rw_seqs = h.random_walk(length, size, start_inds=prop_ind, alpha=2, rand_seed=i)[0][0]    # non-uniform sampling (alpha=2)
        file.write(rw_seqs+'\n')

In [7]:
import utils

# Read the random-walk sequences, one sequence per line.
with open("rw_seqs.txt") as seqs_file:
    seqs = seqs_file.read().splitlines()

# Remove the author nodes from the sequences.
seqs_noauthors = utils.remove_authors_from_RW(seqs)

# Save the pruned sequences, one per line. Using a context manager guarantees
# the data is flushed and the handle closed before the file is read again
# by the embedding step.
with open("rw_seqs_noauthors.txt", "w") as noauthors_file:
    noauthors_file.write("\n".join(seqs_noauthors)+"\n")

6759

In [8]:
# Path of the author-free walk sequences used as the training corpus.
seqs_noauthor_path = "rw_seqs_noauthors.txt"

import embedding
# Instantiate the deepwalk model on the sequence file, overriding the `workers` parameter
# (presumably the number of parallel worker threads -- TODO confirm in embedding.dww2v).
embed = embedding.dww2v(seqs_noauthor_path, workers=20)
# The model is built first and then trained; keep this order.
embed.build_model()
embed.train()

2023-11-07 15:15:20,950 : INFO : Parsing lines (sentences) in: rw_seqs_noauthors.txt: 
2023-11-07 15:15:20,951 : INFO : Parameters for parsing phrases are as follows:
2023-11-07 15:15:20,951 : INFO : 	depth: 2
2023-11-07 15:15:20,952 : INFO : 	phrase_min_count: 10
2023-11-07 15:15:20,952 : INFO : 	phrase_threshold: 15
2023-11-07 15:15:20,953 : INFO : collecting all words and their counts
2023-11-07 15:15:20,953 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2023-11-07 15:15:20,955 : INFO : collected 1015 word types from a corpus of 842 words (unigram + bigrams) and 92 sentences
2023-11-07 15:15:20,956 : INFO : using 1015 counts as vocab in Phrases<0 vocab, min_count=10, threshold=15, max_vocab_size=40000000>
2023-11-07 15:15:20,956 : INFO : source_vocab length 1015
2023-11-07 15:15:20,962 : INFO : Phraser built with 0 phrasegrams
0it [00:00, ?it/s]
2023-11-07 15:15:20,967 : INFO : collecting all words and their counts
2023-11-07 15:15:20,968 : INFO : PROGRESS: at

In [9]:
# Similarities between the 'thermoelectric' property and all materials in mats.
# NOTE(review): sims and reordered_mats are reassigned by the candidate-only query
# further down before any visible use, so this cell may be redundant -- confirm.
sims,_,reordered_mats = embed.similarities(['thermoelectric'], mats, return_nan=False)

In [10]:
# Reload the unfiltered vertex matrix and keep every row with year <= 2000.
full_R = sparse.load_npz("data/thrm_vertex_matrix.npz")
subgraph_R = full_R[yrs<=2000]

# Columns h.nA .. second-to-last are the ones indexed by mats; the last column
# is the property column.
material_cols = subgraph_R[:, h.nA:-1]
property_col = subgraph_R[:, -1]

# The element-wise product is non-zero only in rows that contain both the
# material and the property; a positive column sum marks a studied material.
cooccurrence = material_cols.multiply(property_col)
studied_mask = np.asarray(np.sum(cooccurrence, axis=0) > 0)[0, :]
studied_mats = mats[studied_mask]

# Candidates are the materials that are not among the studied ones.
candidate_mats = mats[np.isin(mats, studied_mats, invert=True)]

In [11]:
# Display the array of material names loaded from data/thrm_mats.txt.
mats

array(['F6S', 'H2O', 'O2Si', ..., 'Si12Zr13', 'Si29Zr21', 'AgGe4SbTe6'],
      dtype='<U48')

In [12]:
# Display the candidate materials (entries of mats that are not in studied_mats).
candidate_mats

array(['F6S', 'O4SiTi', 'GeO2', ..., 'Si12Zr13', 'Si29Zr21', 'AgGe4SbTe6'],
      dtype='<U48')

In [13]:
# Preview the raw return value of the similarity query for the candidate
# materials (a 3-tuple, as the displayed output shows).
embed.similarities(['thermoelectric'], candidate_mats, return_nan=False)

(array([[-0.33034265, -0.01158052,  0.02781176]]),
 array(['thermoelectric'], dtype='<U14'),
 array(['CsOV', 'GaN', 'O4RuSr2'], dtype='<U7'))

In [14]:
# Similarity of the candidate materials to the 'thermoelectric' property.
sims,_,reordered_mats = embed.similarities(['thermoelectric'], candidate_mats, return_nan=False)

# Report the (up to) 50 materials with the highest likelihood of being
# thermoelectric: rank by descending similarity and keep the leading entries.
ranking = np.argsort(-sims[0, :])
preds = reordered_mats[ranking[:50]]

In [15]:
import json

# Ground-truth discoveries, looked up below by year as a string key.
# The context manager closes the file handle once the JSON is parsed.
with open("data/thrm_groundtruth_discs.json", "r") as gt_file:
    gt_discs = json.load(gt_file)

# For each year 2001-2018: fraction of the predictions listed among that
# year's ground-truth discoveries.
yearwise_precs = [np.isin(preds,gt_discs[str(x)]).sum()/len(preds) for x in range(2001,2019)]

# Cumulative precision over the years (displayed as the cell output).
np.cumsum(yearwise_precs)

array([0.        , 0.        , 0.33333333, 0.33333333, 0.33333333,
       0.33333333, 0.66666667, 0.66666667, 0.66666667, 0.66666667,
       0.66666667, 0.66666667, 0.66666667, 0.66666667, 0.66666667,
       0.66666667, 0.66666667, 0.66666667])