In [13]:
import data
import extension
import re

In [14]:
# Read the CSV-backed table, then report its shape and the number of rows per keyword.
tabulation = data.tabulation(path='resource/csv/data.csv')
tabulation.read()
# A single print with a newline separator emits the same two blocks as two separate prints.
print(
    tabulation.data.shape,
    tabulation.data['keyword'].value_counts(),
    sep='\n',
)

(498, 6)
melanoma        108
chest           107
diabetes         98
hypertension     97
covid-19         88
Name: keyword, dtype: int64


In [15]:
# Build the vocabulary from the loaded table: abstracts are passed as the sentences, titles as the titles.
vocabulary = data.vocabulary()
corpus_table = tabulation.data
vocabulary.build(
    sentence=corpus_table['abstract'],
    title=corpus_table['title'],
)

100%|██████████| 498/498 [00:01<00:00, 362.80it/s]


In [16]:
# Fit the embedding model on the vocabulary, collect entries for the five topic
# keywords, and reduce the result to two dimensions.
# NOTE(review): by='SG' presumably selects skip-gram and top=30 presumably keeps
# 30 related words per keyword — confirm in extension.machine.
# NOTE(review): the saved log shows the reported loss growing every epoch, which
# suggests a running total rather than a per-epoch value — confirm.
# NOTE(review): no random seed is passed here, so results may vary between runs.
topic_keywords = ["covid-19", "diabetes", "melanoma", "hypertension", 'chest']

machine = extension.machine(vocabulary.word, vocabulary=vocabulary)
machine.build(
    what='model',
    by='SG',
    window=10,
    dimension=150,
    epoch=20,
)
machine.collect(title=topic_keywords, top=30)
machine.reduce(method="MDS", dimension=2)

loss after epoch 0: 1439338.625
loss after epoch 1: 2525426.75
loss after epoch 2: 3534746.25
loss after epoch 3: 4455019.5
loss after epoch 4: 5252533.0
loss after epoch 5: 6033547.5
loss after epoch 6: 6801783.5
loss after epoch 7: 7553366.5
loss after epoch 8: 8290129.0
loss after epoch 9: 8921958.0
loss after epoch 10: 9531420.0
loss after epoch 11: 10133417.0
loss after epoch 12: 10730749.0
loss after epoch 13: 11321132.0
loss after epoch 14: 11902631.0
loss after epoch 15: 12477099.0
loss after epoch 16: 13047792.0
loss after epoch 17: 13618377.0
loss after epoch 18: 14187149.0
loss after epoch 19: 14758481.0


In [17]:
# Build the figure from `machine`, export it to SG-plot.html (relative to the
# working directory), and show it.
# NOTE(review): `picture` looks like a Plotly figure (write_html/show) — confirm;
# the meaning of skip='' is defined in extension.machine.plot.
picture = machine.plot(skip='')
picture.write_html("SG-plot.html")
picture.show()

---

In [60]:
# Two vocabulary terms (in the tokenizer's stemmed form) that the weighting
# sections below look up in the term-document matrix. Exactly one assignment
# should be active; re-run every downstream cell after switching it.
# NOTE(review): the saved tf-idf output comes from In [48]/[49], i.e. before this
# cell last ran (In [60]), and highlights 'ocular' — it appears to reflect the
# ("ocular", "uveal") pair rather than the active one.
# pair = ("chest", "tube")
# pair = ("2019", "coronaviru")
pair = ("obs", "ambulatori")
# pair = ("ocular", "uveal")


##  使用 tf

In [61]:
##  Term document matrix.
##  Re-weight the vocabulary's matrix with the 'tf' scheme (presumably raw term
##  frequency — confirm in extension.weight). `weight` and `matrix` are
##  overwritten by every weighting section, so the lookup cell that follows
##  uses whichever scheme ran last.
weight = extension.weight(matrix=vocabulary.matrix)
matrix = weight.transform(what='tf')

In [62]:
# Find the document whose summed weight for the two `pair` terms is largest,
# then print its title, its raw abstract, and its tokenized abstract with the
# matching tokens wrapped in ** **.
is_pair_term = matrix.index.isin(list(pair))
if not is_pair_term.any():
    # With no matching row the column sums are all zero and idxmax() would
    # return the first document, which is not a real match.
    raise ValueError('neither term of {} is in the matrix index'.format(pair))
comparison = matrix.loc[is_pair_term].sum(axis=0)

# The matrix columns are compared against the 'title' column, so the winning
# column label is used to select the document row (computed once, reused twice).
best_row = tabulation.data.loc[tabulation.data['title'] == comparison.idxmax()]
title = best_row['title'].item()  # .item() raises unless exactly one row has this title
text = best_row['abstract'].item()

print('title : {}'.format(title))
print('-'*10, 'abstract', '-'*10, '\n', text, '\n')

# Mark tokens by plain concatenation: using the token itself as a regex pattern
# breaks on tokens containing metacharacters such as '(' or '+'.
# The substring test is intentional-looking: longer tokens that contain a pair
# term are marked too.
marked_tokens = [
    "**" + token + "**" if (pair[0] in token) or (pair[1] in token) else token
    for token in vocabulary.tokenize(text)
]
print('-'*10, 'abstract (tokenize + stemming + remove stopword)', '-'*10, '\n', marked_tokens)

title : Clinical features of 8295 patients with resistant hypertension classified on the basis of ambulatory blood pressure monitoring
---------- abstract ---------- 
 We aimed to estimate the prevalence of resistant hypertension through both office and ambulatory blood pressure monitoring in a large cohort of treated hypertensive patients from the Spanish Ambulatory Blood Pressure Monitoring Registry. In addition, we also compared clinical features of patients with true or white-coat-resistant hypertension. In December 2009, we identified 68 045 treated patients with complete information for this analysis. Among them, 8295 (12.2% of the database) had resistant hypertension (office blood pressure ≥140 and/or 90 mm Hg while being treated with ≥3 antihypertensive drugs, 1 of them being a diuretic). After ambulatory blood pressure monitoring, 62.5% of patients were classified as true resistant hypertensives, the remaining 37.5% having white-coat resistance. The former group was younger, m

##  使用 tf-idf

In [48]:
##  Term document matrix.
##  Re-weight the vocabulary's matrix with the 'tf-idf' scheme (details live in
##  extension.weight). `weight` and `matrix` are overwritten by every weighting
##  section, so the lookup cell that follows uses whichever scheme ran last.
weight = extension.weight(matrix=vocabulary.matrix)
matrix = weight.transform(what='tf-idf')

In [49]:
# Find the document whose summed weight for the two `pair` terms is largest,
# then print its title, its raw abstract, and its tokenized abstract with the
# matching tokens wrapped in ** **.
is_pair_term = matrix.index.isin(list(pair))
if not is_pair_term.any():
    # With no matching row the column sums are all zero and idxmax() would
    # return the first document, which is not a real match.
    raise ValueError('neither term of {} is in the matrix index'.format(pair))
comparison = matrix.loc[is_pair_term].sum(axis=0)

# The matrix columns are compared against the 'title' column, so the winning
# column label is used to select the document row (computed once, reused twice).
best_row = tabulation.data.loc[tabulation.data['title'] == comparison.idxmax()]
title = best_row['title'].item()  # .item() raises unless exactly one row has this title
text = best_row['abstract'].item()

print('title : {}'.format(title))
print('-'*10, 'abstract', '-'*10, '\n', text, '\n')

# Mark tokens by plain concatenation: using the token itself as a regex pattern
# breaks on tokens containing metacharacters such as '(' or '+'.
# The substring test means longer tokens that contain a pair term are marked too.
marked_tokens = [
    "**" + token + "**" if (pair[0] in token) or (pair[1] in token) else token
    for token in vocabulary.tokenize(text)
]
print('-'*10, 'abstract (tokenize + stemming + remove stopword)', '-'*10, '\n', marked_tokens)

title : [Choroidal and ciliary body melanoma: case report]
---------- abstract ---------- 
 Ocular melanomas correspond to 5% of all melanomas and 85% of them have its origin in the uveal tract. Uveal melanoma is the most common primary intraocular malignant tumor in the adult. In this article, a case of uveal melanoma in a 31 year-old female patient, with photopsia, hyperemia and low visual acuity in the left eye with evolution of 4 months is presented. In the ophthalmologic examination, visual acuity was lower than 20/400, a large tumoral mass was noted at the nasal region behind the iris with anterior lens displacement, anterior chamber narrowing and serous retinal detachment. The ocular echography suggested a large tumoral mass as a choroidal melanoma extending to the ciliary body. The confirmation diagnosis was possible through the histopathologic examination. 

---------- abstract (tokenize + stemming + remove stopword) ---------- 
 ['**ocular**', 'melanoma', 'correspond', '5', '

##  使用 norm-tf-idf

In [63]:
##  Term document matrix.
##  Re-weight the vocabulary's matrix with the 'norm-tf-idf' scheme (presumably
##  a normalized tf-idf — confirm in extension.weight). `weight` and `matrix` are
##  overwritten by every weighting section, so the lookup cell that follows
##  uses whichever scheme ran last.
weight = extension.weight(matrix=vocabulary.matrix)
matrix = weight.transform(what='norm-tf-idf')

In [64]:
# Find the document whose summed weight for the two `pair` terms is largest,
# then print its title, its raw abstract, and its tokenized abstract with the
# matching tokens wrapped in ** **.
is_pair_term = matrix.index.isin(list(pair))
if not is_pair_term.any():
    # With no matching row the column sums are all zero and idxmax() would
    # return the first document, which is not a real match.
    raise ValueError('neither term of {} is in the matrix index'.format(pair))
comparison = matrix.loc[is_pair_term].sum(axis=0)

# The matrix columns are compared against the 'title' column, so the winning
# column label is used to select the document row (computed once, reused twice).
best_row = tabulation.data.loc[tabulation.data['title'] == comparison.idxmax()]
title = best_row['title'].item()  # .item() raises unless exactly one row has this title
text = best_row['abstract'].item()

print('title : {}'.format(title))
print('-'*10, 'abstract', '-'*10, '\n', text, '\n')

# Mark tokens by plain concatenation: using the token itself as a regex pattern
# breaks on tokens containing metacharacters such as '(' or '+'.
# The substring test means longer tokens that contain a pair term are marked too.
marked_tokens = [
    "**" + token + "**" if (pair[0] in token) or (pair[1] in token) else token
    for token in vocabulary.tokenize(text)
]
print('-'*10, 'abstract (tokenize + stemming + remove stopword)', '-'*10, '\n', marked_tokens)

title : [Definition of arterial hypertension]
---------- abstract ---------- 
 Recent publication of several consensus by well-known international experts in the field of high blood pressure gives us the opportunity to update the definition of hypertension. For the first time, systolic blood pressure is taken into account to define hypertensives. Ambulatory blood pressure monitoring has been recently developed for the evaluation and treatment of hypertensive patients. Due to the absence of world-wide recognized normative data and the lack of prospective studies assessing the superiority of ambulatory blood pressure monitoring over casual blood pressure measurement for the prediction of cardiovascular events, the use of this new technique is to be restricted to a limited number of hypertensive patients. 

---------- abstract (tokenize + stemming + remove stopword) ---------- 
 ['recent', 'public', 'sever', 'consensu', 'well-known', 'intern', 'expert', 'field', 'high', 'blood', 'pressur'

In [None]:
# Disabled scratch lookup for the fixed terms 'borderlin' / 'obes': same steps as
# the `pair` lookup cells, but without token highlighting. If re-enabled it reads
# whichever weighting `matrix` currently holds.
# comparison = matrix.loc[(matrix.index=='borderlin')|(matrix.index=='obes')].sum(axis=0)
# text = tabulation.data.loc[tabulation.data['title']==comparison.idxmax()]['abstract'].item()
# print('-'*10, 'abstract', '-'*10, '\n', text, '\n')
# print('-'*10, 'abstract (tokenize + stemming + remove stopword)', '-'*10, '\n', vocabulary.tokenize(text))

In [None]:
# Disabled scratch lookup for the fixed terms 'attach' / 'three-bottl': same steps
# as the `pair` lookup cells, but without token highlighting. If re-enabled it
# reads whichever weighting `matrix` currently holds.
# comparison = matrix.loc[(matrix.index=='attach')|(matrix.index=='three-bottl')].sum(axis=0)
# text = tabulation.data.loc[tabulation.data['title']==comparison.idxmax()]['abstract'].item()
# print('-'*10, 'abstract', '-'*10, '\n', text, '\n')
# print('-'*10, 'abstract (tokenize + stemming + remove stopword)', '-'*10, '\n', vocabulary.tokenize(text))

In [None]:
# Disabled scratch lookup for the fixed terms 'ocular' / 'uveal': same steps as
# the `pair` lookup cells, but without token highlighting. If re-enabled it reads
# whichever weighting `matrix` currently holds.
# comparison = matrix.loc[(matrix.index=='ocular')|(matrix.index=='uveal')].sum(axis=0)
# text = tabulation.data.loc[tabulation.data['title']==comparison.idxmax()]['abstract'].item()
# print('-'*10, 'abstract', '-'*10, '\n', text, '\n')
# print('-'*10, 'abstract (tokenize + stemming + remove stopword)', '-'*10, '\n', vocabulary.tokenize(text))