In [2]:
# import pyspark
# from pyspark.sql import SparkSession
# from graphframes import GraphFrame

from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn import preprocessing

import gensim
from gensim import corpora
import sys
import re
import json
import numpy as np

sys.path.append("../")  # Add "../" to utils folder path
from utils import globals

In [3]:
# Input files, both resolved relative to globals.DATA_PATH.
# FILENAME_GL is parsed further down into GL (one list of integers per line);
# FILENAME_CORPUS is parsed further down into `corpus` (one entry per line).
FILENAME_GL = globals.DATA_PATH + 'output_2_2.txt'
FILENAME_CORPUS = globals.DATA_PATH + 'output_1_3.txt'

In [4]:
# Location of the saved gensim dictionary, read with corpora.Dictionary.load().
DICTIONARY_PATH = globals.DATA_PATH + "dictionary.pkl"

In [5]:
# Number of clusters requested from the clustering step (passed as n_clusters).
NUM_OF_SPECIES = 2

## Read input file

In [6]:
# Test the line-cleaning rule on the first line of the groups file.
with open(FILENAME_GL) as f:
    first_line = f.readline()  # one line is enough; no need to load the whole file

# Raw string, so `\[` and `\]` are regex escapes rather than invalid Python
# string escapes (which trigger a SyntaxWarning on recent Python versions).
# NOTE: `[...]` is a character class: it removes the individual characters
# n, u, l, tab, newline, square brackets and double quotes -- not the literal
# word "null". That is sufficient here because the remaining tokens are
# parsed as integers.
re.sub(r'[nul\t\n\[\]"]', '', first_line).replace(' ', '').split(',')

['438']

In [7]:
# Parse the groups file: every line becomes one list of integers in GL.
# The pattern is a raw string (so `\[` / `\]` are regex escapes, not invalid
# Python string escapes) and is compiled once instead of once per line.
# It is a character class: it strips the characters n, u, l, tabs, newlines,
# square brackets and double quotes.
_LINE_NOISE = re.compile(r'[nul\t\n\[\]"]')

with open(FILENAME_GL) as f:
    content_vertices = f.readlines()

GL = [
    # After cleaning, a line is a comma-separated list of integers.
    [int(token) for token in _LINE_NOISE.sub('', line).replace(' ', '').split(',')]
    for line in content_vertices
]

print(GL[:5])

[[438], [1251], [119], [937], [881]]


Load dictionary file

In [15]:
# Load the gensim dictionary saved at DICTIONARY_PATH.
# NOTE(review): the ".pkl" suffix suggests a pickle-backed file; unpickling can
# execute arbitrary code, so only load dictionaries from a trusted source.
dictionary = corpora.Dictionary.load(DICTIONARY_PATH)

Load corpus

In [16]:
# Parse the corpus file: one entry per line.
# Each line is expected to look like `null\t<JSON array>`; element [1] of that
# array is what gets stored in `corpus`.
with open(FILENAME_CORPUS) as f:
    content_corpus = f.readlines()

corpus = []
for line in content_corpus:
    # Drop the `null\t` key prefix and parse the JSON payload directly.
    # The previous approach wrapped the line into `{"a": ...}` by turning the
    # trailing newline into "}", which broke on a final line that has no
    # trailing newline.
    payload = line.rstrip("\n").replace('null\t', '', 1)
    corpus.append(json.loads(payload)[1])

corpus[:1]

[[[0, 21],
  [1, 4],
  [2, 5],
  [3, 18],
  [4, 4],
  [5, 4],
  [6, 1],
  [7, 10],
  [8, 2],
  [9, 2],
  [10, 3],
  [11, 1],
  [12, 18],
  [13, 2],
  [14, 4],
  [15, 14],
  [16, 3],
  [18, 1],
  [19, 6],
  [20, 5],
  [21, 3],
  [23, 5],
  [24, 2],
  [27, 1],
  [28, 3],
  [29, 3],
  [30, 3],
  [31, 1],
  [32, 1],
  [34, 2],
  [35, 3],
  [36, 1],
  [37, 1],
  [38, 5],
  [41, 1],
  [42, 4],
  [43, 2],
  [44, 3],
  [45, 19],
  [46, 4],
  [47, 4],
  [48, 5],
  [49, 3],
  [50, 4],
  [52, 3],
  [54, 2],
  [55, 6],
  [56, 5],
  [57, 3],
  [58, 8],
  [59, 2],
  [61, 4],
  [62, 3],
  [63, 1],
  [64, 1],
  [65, 3],
  [67, 5],
  [68, 6],
  [69, 1],
  [70, 5],
  [71, 5],
  [72, 2],
  [73, 3],
  [74, 5],
  [76, 1],
  [79, 2],
  [80, 4],
  [81, 1],
  [82, 1],
  [91, 7],
  [92, 3],
  [94, 3],
  [95, 4],
  [97, 4],
  [98, 3],
  [99, 14],
  [100, 2],
  [101, 1],
  [103, 2],
  [105, 2],
  [106, 1],
  [107, 1],
  [110, 1],
  [114, 2],
  [116, 1],
  [117, 1],
  [120, 1],
  [121, 2],
  [124, 2],
  [125, 3],

In [17]:
# Number of entries in the corpus, shown as a shape tuple.
# `np.shape(corpus)` first converts the nested list to an array; the entries
# have different lengths, and NumPy >= 1.24 raises ValueError for such ragged
# input, so the tuple is built from len() instead.
(len(corpus),)

(1281,)

## Compute distribution

In [18]:
def compute_dist(dist, groups, seeds, only_seed=True):
    """Aggregate the rows of ``dist`` selected by each index list into one row.

    Parameters
    ----------
    dist : numpy.ndarray
        2-D array; rows are selected with ``dist[members, :]``.
    groups : sequence of sequences of int
        Row indices per group. Used when ``only_seed`` is False.
    seeds : sequence of sequences of int
        Row indices per group. Used when ``only_seed`` is True.
    only_seed : bool, default True
        Selects ``seeds`` (True) or ``groups`` (False) as the index lists.

    Returns
    -------
    numpy.ndarray
        One aggregated row per index list, in input order. The aggregation is
        the column-wise mean or median, chosen by ``globals.GROUP_AGGREGATION``.

    Raises
    ------
    ValueError
        If ``globals.GROUP_AGGREGATION`` is neither ``"MEAN"`` nor ``"MEDIAN"``.
        (Previously this case silently produced an empty array.)
    """
    aggregators = {"MEAN": np.mean, "MEDIAN": np.median}
    aggregate = aggregators.get(globals.GROUP_AGGREGATION)
    if aggregate is None:
        raise ValueError(
            "Unsupported GROUP_AGGREGATION %r; expected one of %s"
            % (globals.GROUP_AGGREGATION, sorted(aggregators))
        )

    # Both modes perform the same aggregation; only the index lists differ.
    members_per_group = seeds if only_seed else groups
    return np.array(
        [aggregate(dist[members, :], axis=0) for members in members_per_group]
    )

In [19]:
def cluster_groups(group_dist, random_state=None):
    """Cluster the rows of ``group_dist`` into ``NUM_OF_SPECIES`` clusters.

    Parameters
    ----------
    group_dist : array-like
        2-D matrix with one row per item to cluster.
    random_state : int or None, default None
        Forwarded to the clustering estimator. Pass an integer to make the
        result reproducible; ``None`` keeps the estimator's default behaviour.

    Returns
    -------
    The value returned by the estimator's ``fit_predict`` (one label per row).

    Raises
    ------
    ValueError
        If ``globals.CLUSTERING_METHOD`` is neither ``'KMEANS'`` nor
        ``'SPECTRAL'``. (Previously this surfaced as an UnboundLocalError.)
    """
    # Build the estimator first so a misconfigured method fails before any work.
    if globals.CLUSTERING_METHOD == 'KMEANS':
        model = KMeans(n_clusters=NUM_OF_SPECIES, init='k-means++',
                       random_state=random_state)
    elif globals.CLUSTERING_METHOD == 'SPECTRAL':
        model = SpectralClustering(n_clusters=NUM_OF_SPECIES, eigen_solver='arpack',
                                   affinity="nearest_neighbors",
                                   random_state=random_state)
    else:
        raise ValueError(
            "Unsupported CLUSTERING_METHOD %r; expected 'KMEANS' or 'SPECTRAL'"
            % (globals.CLUSTERING_METHOD,)
        )

    if globals.SCALING:
        # Standardise the columns before clustering.
        scaler = preprocessing.StandardScaler()
        X_scaled = scaler.fit_transform(group_dist)
        print(X_scaled)  # kept: the notebook shows the scaled matrix as cell output
    else:
        X_scaled = group_dist

    return model.fit_predict(X_scaled)

Example invocation of `compute_dist()` followed by `cluster_groups()`:
```python
corpus_m = gensim.matutils.corpus2dense(corpus, len(dictionary.keys())).T
kmer_group_dist = compute_dist( corpus_m, GL, SL, only_seed=False )
z_kmer_grp_cl = cluster_groups( kmer_group_dist )
```

In [20]:
# Densify the corpus, then transpose so that rows can be selected with the
# index lists in GL (compute_dist indexes `dist[members, :]`).
corpus_m = gensim.matutils.corpus2dense(
    corpus,
    num_terms=len(dictionary.keys()),
).transpose()

In [21]:
# Seed lists are left empty: with only_seed=False, compute_dist() reads `groups` only.
SL = []

kmer_group_dist = compute_dist(
    dist=corpus_m,
    groups=GL,
    seeds=SL,
    only_seed=False,
)

In [22]:
# Display the aggregated matrix returned by compute_dist() (one row per entry of GL).
kmer_group_dist

array([[24. ,  9. , 10. , ...,  0. ,  0. ,  0. ],
       [29. ,  8. ,  6. , ...,  0. ,  0. ,  0. ],
       [ 6. ,  2. ,  4. , ...,  0. ,  0. ,  0. ],
       ...,
       [36. ,  7.5, 13. , ...,  0. ,  0. ,  0. ],
       [ 7. ,  3. ,  5. , ...,  0. ,  0. ,  0. ],
       [33. , 10. ,  9. , ...,  0. ,  0. ,  0. ]], dtype=float32)

In [23]:
# Dimensions of the aggregated matrix (rows, columns).
kmer_group_dist.shape

(1101, 263)

In [24]:
# Cluster the aggregated rows; z_kmer_grp_cl holds the value returned by
# cluster_groups() (the estimator's fit_predict output).
z_kmer_grp_cl = cluster_groups(kmer_group_dist)

[[ 0.49923995  1.2308713   0.6810736  ... -0.03015113 -0.03015113
  -0.03015113]
 [ 0.9406396   0.8763093  -0.27496645 ... -0.03015113 -0.03015113
  -0.03015113]
 [-1.0897987  -1.251063   -0.7529865  ... -0.03015113 -0.03015113
  -0.03015113]
 ...
 [ 1.558599    0.69902825  1.3981037  ... -0.03015113 -0.03015113
  -0.03015113]
 [-1.0015187  -0.89650095 -0.51397645 ... -0.03015113 -0.03015113
  -0.03015113]
 [ 1.2937593   1.5854332   0.4420636  ... -0.03015113 -0.03015113
  -0.03015113]]
