### Load data (doc-word)

In [103]:
import numpy as np

# Document-by-word feature matrix: one row per document, one column per vocabulary term.
doc_word = np.load("science2k-doc-word.npy")
# Show the matrix dimensions, then the feature row of the first document.
print(doc_word.shape, doc_word[0], sep="\n")

(1373, 5476)
[-0.2521619 -0.2521619  9.36371   ... -0.2521619 -0.2521619 -0.2521619]


In [5]:
# Exploratory only: written under the assumption that the features still had to
# be computed. The clustering cells shown below fit on doc_word directly and do
# not read `f` or `log_f` from this cell.
#
# Per-document smoothed word frequencies, computed for every row at once:
#   f[d, w] = (doc_word[d, w] + 1) / (sum of row d + number of non-zero entries in row d)
term_counts = np.count_nonzero(doc_word, axis=1)   # non-zero entries per document
word_counts = doc_word.sum(axis=1)                 # row totals per document
# Broadcast the per-document denominator across that document's columns.
f = (doc_word + 1) / (word_counts + term_counts)[:, np.newaxis]
print(f.shape)
print(f[0])

# log frequencies
# NOTE(review): the printed rows of doc_word contain entries below -1, so
# (entry + 1) can be negative and np.log then yields NaN (with a RuntimeWarning)
# for those positions -- this smoothing formula presumably expects raw counts.
log_f = np.log(f)
print(log_f[0])

(1373, 5476)
[0.00013657 0.00013657 0.00189257 ... 0.00013657 0.00013657 0.00013657]


### K-means on doc-word

In [8]:
from sklearn.cluster import KMeans

# Number of document clusters.
k = 20
# random_state fixes the centroid seeding. n_init is pinned explicitly because
# scikit-learn changed its default (10 -> 'auto') in version 1.4; leaving it
# implicit makes the resulting clusters depend on the installed version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(doc_word)

In [100]:
# Group the rows of doc_word by k-means cluster.
#   f[i]     -> 2-D array with the feature rows of every document assigned to cluster i
#   f_idx[i] -> row indices (into doc_word) of those documents, in ascending order

cluster_labels = kmeans.labels_
# `kmeans` is refit on a different matrix further down the notebook; fail loudly
# if this cell is re-run against labels that do not belong to doc_word.
assert cluster_labels.shape[0] == doc_word.shape[0], \
    "kmeans.labels_ does not match doc_word; refit KMeans on doc_word first"

f = []
f_idx = []
for i in range(k):
    member_rows = np.flatnonzero(cluster_labels == i)
    # Plain Python ints keep the printed / pretty-printed index lists readable
    # and still work as list and array indices.
    f_idx.append(member_rows.tolist())
    # Index-array selection copies all member rows in one step; the cast keeps
    # the per-cluster matrices in float64 regardless of how doc_word is stored.
    f.append(doc_word[member_rows].astype(np.float64, copy=False))

print(len(f))
print(f[1])
print(f_idx[1])

20
[[ 8.820592  -0.7952802  8.820592  ... -0.7952802 -0.7952802 -0.7952802]
 [-1.262218   7.255175  11.24396   ... -1.262218  -1.262218  -1.262218 ]
 [10.28912   -1.406134   9.056998  ... -1.406134  -1.406134  -1.406134 ]
 ...
 [10.89424   -0.6186975  8.997175  ... -0.6186975 -0.6186975 -0.6186975]
 [10.04846   -0.7713402  9.132197  ... -0.7713402 -0.7713402 -0.7713402]
 [11.00702   -0.8423803 10.38288   ... -0.8423803 -0.8423803 -0.8423803]]
[41, 42, 99, 108, 191, 213, 313, 386, 423, 450, 479, 490, 506, 594, 607, 621, 648, 773, 792, 793, 871, 1090, 1122, 1133, 1134, 1136, 1150, 1187, 1260, 1291, 1292, 1370, 1371]


In [101]:
# indices of ten words/docs per document cluster

from sklearn.metrics.pairwise import euclidean_distances
import pprint

# Mean feature vector over all documents.
x_bar = doc_word.mean(axis = 0)
print(x_bar.shape)
print(x_bar)
top_ten_words = []   # top_ten_words[i]: column (word) indices, ascending by m - x_bar
top_ten_docs = []    # top_ten_docs[i]: row (document) indices into doc_word

for cluster_rows, cluster_doc_idx in zip(f, f_idx):
    # Cluster mean and its signed offset from the global mean.
    m = cluster_rows.mean(axis = 0)
    pos_dist = m - x_bar
    # argsort is ascending, so the last ten positions are the largest positive
    # offsets. .tolist() stores plain ints so pprint output stays readable.
    top_ten_words.append(np.argsort(pos_dist)[-10:].tolist())

    # Distance of every document in the cluster from the global mean x_bar;
    # keep the ten largest (fewer if the cluster has fewer than ten documents).
    # NOTE(review): this selects the documents FARTHEST from the global mean.
    # If the goal is the documents most representative of the cluster, the
    # usual choice is the ten SMALLEST distances to the cluster mean `m` --
    # confirm the intent before changing, the reported titles depend on it.
    doc_dist = euclidean_distances(x_bar.reshape(1, -1), cluster_rows)[0]
    farthest_in_cluster = np.argsort(doc_dist)[-10:]
    # Translate positions within the cluster back to row indices of doc_word.
    top_ten_docs.append([int(cluster_doc_idx[j]) for j in farthest_in_cluster])

pprint.pprint(top_ten_words)
pprint.pprint(top_ten_docs)

# A large positive component of (m - x_bar) means that word's feature value,
# averaged over this cluster's documents, is well above its average over all
# documents -- i.e. it is an "identifier" word for the cluster.

(5476,)
[ 6.12158107  3.39409491  9.01546653 ... -0.5040002  -0.58283004
 -0.5696959 ]
[[371, 2636, 2340, 954, 4465, 5074, 4305, 4478, 2551, 1936],
 [1187, 826, 1275, 237, 852, 219, 761, 163, 674, 1021],
 [1046, 687, 876, 269, 76, 167, 436, 185, 228, 49],
 [760, 0, 33, 5, 183, 189, 236, 23, 1, 21],
 [3322, 887, 3011, 646, 2312, 4411, 3120, 266, 4986, 461],
 [4629, 2210, 3643, 79, 1012, 645, 240, 1522, 444, 775],
 [1, 162, 23, 5, 22, 14, 29, 44, 7, 17],
 [34, 205, 76, 18, 1094, 17, 29, 49, 185, 171],
 [34, 195, 16, 478, 315, 278, 177, 619, 164, 96],
 [246, 979, 630, 228, 34, 167, 753, 19, 350, 444],
 [2069, 372, 1954, 3965, 1449, 1279, 4139, 4171, 2157, 3552],
 [582, 2358, 1981, 599, 1351, 4200, 3235, 1945, 1840, 3387],
 [331, 39, 58, 1285, 250, 440, 270, 283, 281, 262],
 [829, 5074, 421, 1725, 963, 1288, 1936, 1448, 2551, 2761],
 [253, 89, 277, 128, 405, 39, 154, 0, 95, 54],
 [750, 1134, 2270, 233, 3405, 1717, 206, 3395, 5070, 2609],
 [962, 82, 515, 38, 451, 208, 463, 51, 396, 230],
 [

In [92]:
# find actual titles for the selected document indices

# One title per line. The file handle gets its own name so that it does not
# rebind `f`, the list of per-cluster feature matrices built earlier; with the
# handle named `f`, re-running the top-ten cell after this one would fail.
with open('science2k-titles.txt', 'r') as titles_file:
    # Drop the trailing newline of each line.
    titles = [line.split("\n")[0] for line in titles_file.readlines()]

print(len(titles))
print(titles[0])

# cluster index -> titles of the documents selected for that cluster
top_doc_names = {
    cluster: [titles[doc] for doc in doc_indices]
    for cluster, doc_indices in enumerate(top_ten_docs)
}
pprint.pprint(top_doc_names)

# Optional export (requires `import json`):
# with open('top_doc_names.json', 'w') as outfile:
#     json.dump(top_doc_names, outfile)
    

1373
"Archaeology in the Holy Land"
{0: ['"Splitting the Chromosome: Cutting the Ties That Bind Sister '
     'Chromatids"'],
 1: ['"Mammalian Neural Stem Cells"',
     '"Primates: A Natural Heritage of Conflict Resolution"',
     '"Diversity and Dynamics of Dendritic Signaling"',
     '"Whither the Future of Controlling Quantum Phenomena?"',
     '"Deconstructing the Science Wars by Reconstructing an Old Mold"',
     '"Designing a New Material World"',
     '"AIDS as a Zoonosis: Scientific and Public Health Implications"',
     '"Untangling Dendrites with Quantitative Models"',
     '"How Animals Move: An Integrative View"',
     '"Breaking down Scientific Barriers to the Study of Brain and Mind"'],
 2: ['"A New Breed of High-Tech Detectives"',
     '"Silent No Longer: \'Model Minority\' Mobilizes"',
     '"Help Needed to Rebuild Science in Yugoslavia"',
     '"The Quickening of Science Communication"',
     '"Clones: A Hard Act to Follow"',
     '"Presidential Forum: Gore and Bush Of

In [96]:
# One vocabulary term per line. The file handle gets its own name so that it
# does not rebind `f`, the list of per-cluster feature matrices built earlier.
with open('science2k-vocab.txt', 'r') as vocab_file:
    # Drop the trailing newline of each line.
    words = [line.split("\n")[0] for line in vocab_file.readlines()]

print(len(words))
print(words[0])

# cluster index -> vocabulary terms selected for that cluster
top_words = {
    cluster: [words[word_idx] for word_idx in word_indices]
    for cluster, word_indices in enumerate(top_ten_words)
}
pprint.pprint(top_words)

5476
fig
{0: ['chromosome',
     'spindle',
     'mitotic',
     'chromosomes',
     'metaphase',
     'chromatid',
     'anaphase',
     'chromatids',
     'cohesion',
     'sister'],
 1: ['stimulus',
     'processing',
     'task',
     'responses',
     'stimuli',
     'significant',
     'neural',
     'brain',
     'cortex',
     'visual'],
 2: ['say',
     'working',
     'get',
     'just',
     'year',
     'national',
     'people',
     'researchers',
     'scientists',
     'says'],
 3: ['staining',
     'fig',
     'control',
     'cell',
     'mouse',
     'days',
     'day',
     'expression',
     'cells',
     'mice'],
 4: ['superconducting',
     'scattering',
     'ordering',
     'charge',
     'fermi',
     'superconductivity',
     'spins',
     'quantum',
     'quasiparticle',
     'spin'],
 5: ['perspectives',
     'investigators',
     'cgi',
     'see',
     'authors',
     'issue',
     'figure',
     'author',
     'mail',
     'page'],
 6: ['cells',
     'se

### Load data and K-means on word-doc

In [104]:
import numpy as np
# Word-by-document feature matrix: one row per vocabulary term, one column per document.
word_doc = np.load("science2k-word-doc.npy")
print(word_doc.shape)

# NOTE: `k` and `kmeans` are rebound here. From this point on they describe the
# word clustering, so the document-cluster cells above must not be re-run
# without first refitting KMeans on doc_word.
k = 20
# n_init is pinned explicitly because scikit-learn changed its default
# (10 -> 'auto') in version 1.4; random_state fixes the centroid seeding.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(word_doc)

(5476, 1373)


In [105]:
# Group the rows of word_doc by k-means cluster.
#   f2[i]     -> 2-D array with the feature rows of every term assigned to cluster i
#   f2_idx[i] -> row indices (into word_doc) of those terms, in ascending order

term_labels = kmeans.labels_
# Fail loudly if `kmeans` currently holds a fit on a different matrix.
assert term_labels.shape[0] == word_doc.shape[0], \
    "kmeans.labels_ does not match word_doc; refit KMeans on word_doc first"

f2 = []
f2_idx = []
for i in range(k):
    member_rows = np.flatnonzero(term_labels == i)
    # Plain Python ints keep the printed / pretty-printed index lists readable
    # and still work as list and array indices.
    f2_idx.append(member_rows.tolist())
    # Index-array selection copies all member rows in one step; the cast keeps
    # the per-cluster matrices in float64 regardless of how word_doc is stored.
    f2.append(word_doc[member_rows].astype(np.float64, copy=False))

print(len(f2))
print(f2[1])
print(f2_idx[1])

# Terms that share a cluster have similar rows, i.e. they tend to appear across
# a similar set of documents.
# There are more terms than documents, so on average f2[i].shape[0] and
# len(f2_idx[i]) are larger than their document-cluster counterparts.

20
[[-2.090192  -2.090192  -2.090192  ... -2.090192  -2.090192  -2.090192 ]
 [ 6.187139  -2.330254  -2.330254  ... -2.330254  -2.330254  -2.330254 ]
 [-1.830366  -1.830366  -1.830366  ... -1.830366  -1.830366  -1.830366 ]
 ...
 [-0.1584664 -0.1584664 -0.1584664 ... -0.1584664 -0.1584664 -0.1584664]
 [-0.1928384 -0.1928384 -0.1928384 ... -0.1928384 -0.1928384 -0.1928384]
 [-0.1124251 -0.1124251 -0.1124251 ... -0.1124251 -0.1124251 -0.1124251]]
[72, 147, 151, 164, 177, 188, 195, 207, 247, 268, 278, 281, 282, 296, 297, 314, 315, 337, 364, 393, 412, 433, 478, 486, 501, 502, 507, 521, 524, 561, 562, 567, 582, 601, 611, 615, 619, 644, 652, 655, 661, 662, 664, 666, 689, 694, 713, 716, 723, 740, 742, 743, 748, 752, 758, 770, 774, 777, 780, 781, 805, 807, 825, 836, 845, 850, 863, 878, 881, 886, 889, 892, 893, 896, 909, 921, 923, 939, 947, 966, 977, 1029, 1036, 1068, 1078, 1088, 1093, 1143, 1164, 1190, 1206, 1225, 1231, 1241, 1259, 1260, 1267, 1283, 1284, 1285, 1293, 1296, 1317, 1322, 1332, 1333

In [106]:
# indices of ten docs/words per word cluster

# Mean feature vector over all terms (one component per document).
x2_bar = word_doc.mean(axis = 0)
print(x2_bar.shape)
print(x2_bar)
top_ten_words2 = []   # top_ten_words2[i]: row (term) indices into word_doc
top_ten_docs2 = []    # top_ten_docs2[i]: column (document) indices, ascending by m - x2_bar

for cluster_rows, cluster_term_idx in zip(f2, f2_idx):
    # Cluster mean and its signed offset from the global mean.
    m = cluster_rows.mean(axis = 0)
    pos_dist = m - x2_bar
    # argsort is ascending, so the last ten positions are the largest positive
    # offsets. .tolist() stores plain ints so pprint output stays readable.
    top_ten_docs2.append(np.argsort(pos_dist)[-10:].tolist())

    # Distance of every term in the cluster from the global mean x2_bar;
    # keep the ten largest (fewer if the cluster has fewer than ten terms).
    # NOTE(review): this selects the terms FARTHEST from the global mean. If
    # the goal is the terms most representative of the cluster, the usual
    # choice is the ten SMALLEST distances to the cluster mean `m` -- confirm
    # the intent before changing, the reported words depend on it.
    word_dist = euclidean_distances(x2_bar.reshape(1, -1), cluster_rows)[0]
    farthest_in_cluster = np.argsort(word_dist)[-10:]
    # Translate positions within the cluster back to row indices of word_doc.
    top_ten_words2.append([int(cluster_term_idx[j]) for j in farthest_in_cluster])

pprint.pprint(top_ten_words2)
pprint.pprint(top_ten_docs2)

# A large positive component of (m - x2_bar) means the terms of this cluster
# score much higher in that document than the average term does -- i.e. it is
# one of the cluster's "most salient" documents.

(1373,)
[-0.38194837 -0.34658092 -0.27070615 ...  0.13722999  0.20827001
  0.11868435]
[[714, 676, 682, 440, 319, 953, 258, 956, 588, 809],
 [412, 268, 151, 207, 177, 297, 314, 72, 164, 147],
 [1191, 1434, 1356, 928, 906, 638, 787, 1145, 1012, 400],
 [469, 434, 326, 137, 205, 67, 29, 14, 28, 17],
 [83, 22, 63, 59, 56, 42, 33, 7, 5, 1],
 [71, 127, 102, 50, 15, 48, 36, 47, 45, 16],
 [144],
 [484, 403, 358, 64, 24, 20, 0],
 [55],
 [97, 27, 135, 130, 88, 168, 75, 79, 18, 19],
 [76, 34],
 [741, 113, 678, 203, 252, 530, 162, 301, 321, 70],
 [101, 182, 126, 109, 90, 60, 92, 84, 57, 8],
 [1026, 683, 1099, 338, 586, 791, 621, 705, 279, 227],
 [2676, 2794, 1680, 1412, 979, 444, 246],
 [186, 217, 201, 105, 26, 91, 78, 98, 65, 68],
 [497, 454, 406, 236, 353, 133, 200, 134, 103, 35],
 [213],
 [74, 32, 38, 40, 69, 52, 6, 41, 37, 31],
 [198]]
[[216, 16, 1302, 636, 17, 435, 15, 436, 437, 18],
 [741, 1076, 135, 931, 1303, 986, 1307, 672, 913, 631],
 [376, 759, 236, 46, 830, 289, 519, 709, 710, 499],
 [

In [108]:
# Map each word cluster to the titles of its selected documents.

top_doc_names2 = {
    cluster: [titles[doc_idx] for doc_idx in doc_indices]
    for cluster, doc_indices in enumerate(top_ten_docs2)
}
pprint.pprint(top_doc_names2)

{0: ['"The Atom-Cavity Microscope: Single Atoms Bound in Orbit by Single '
     'Photons"',
     '"The Dark Halo of the Milky Way"',
     '"Piecing Together the Biggest Puzzle of All"',
     '"Negative Poisson\'s Ratios for Extreme States of Matter"',
     '"The Baryon Halo of the Milky Way: A Fossil Record of Its Formation"',
     '"Orbital Physics in Transition-Metal Oxides"',
     '"The Formation and Early Evolution of the Milky Way Galaxy"',
     '"Advances in the Physics of High-Temperature Superconductivity"',
     '"Quantum Criticality: Competing Ground States in Low Dimensions"',
     '"The Galactic Center: An Interacting System of Unusual Sources"'],
 1: ['"Causes of Climate Change over the past 1000 Years"',
     '"The Global Carbon Cycle: A Test of Our Knowledge of Earth as a System"',
     '"Cool Glacial Temperatures and Changes in Moisture Source Recorded in '
     'Oman Groundwaters"',
     '"Climate Impact of Late Quaternary Equatorial Pacific Sea Surface '
     'Tempera

In [109]:
# Map each word cluster to the vocabulary terms selected for it.
top_words2 = {
    cluster: [words[word_idx] for word_idx in word_indices]
    for cluster, word_indices in enumerate(top_ten_words2)
}
pprint.pprint(top_words2)

{0: ['background',
     'origin',
     'units',
     'dark',
     'images',
     'separate',
     'flow',
     'clearly',
     'white',
     'rapidly'],
 1: ['late',
     'events',
     'age',
     'rates',
     'global',
     'areas',
     'source',
     'water',
     'period',
     'area'],
 2: ['london',
     'online',
     'germany',
     'division',
     'august',
     'web',
     'weight',
     'tion',
     'authors',
     'report'],
 3: ['mutants',
     'mutation',
     'mutations',
     'mutant',
     'genetic',
     'wild',
     'genes',
     'dna',
     'type',
     'gene'],
 4: ['experiments',
     'proteins',
     'molecular',
     'function',
     'specific',
     'analysis',
     'control',
     'protein',
     'cell',
     'cells'],
 5: ['change',
     'possible',
     'lower',
     'state',
     'high',
     'small',
     'large',
     'low',
     'system',
     'surface'],
 6: ['indicate'],
 7: ['email',
     'correspondence',
     'addressed',
     'reports',
     'sh

In [None]:
# Clustering documents could be useful for keyword extraction for a new paper.
# Clustering terms could be useful for finding related papers in the pool of documents already seen.

# Word clusters are easier for a reader without domain knowledge to interpret and validate.
# I can see how the words within one cluster are related, but I cannot do the same for every group of titles.
# For example, the words in word cluster #3 (mutants, mutation, genes, dna, ...) are clearly closely related,
# so it makes sense that they were assigned to the same cluster.