In [1]:
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


# Task 1: Recommender System Challenge 

## Import packages

In [2]:
!pip install implicit
!pip install lightfm

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/bc/07/c0121884722d16e2c5beeb815f6b84b41cbf22e738e4075f1475be2791bc/implicit-0.4.4.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 4.9MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.4-cp37-cp37m-linux_x86_64.whl size=3406409 sha256=133a2a150d5612dd6011f0b86944bd054dc10d598df118b8b2c91f6ffa98db78
  Stored in directory: /root/.cache/pip/wheels/bf/d4/ec/fd4f622fcbefb7521f149905295b2c26adecb23af38aa28217
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.4
Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/5e/fe/8864d723daa8e5afc74080ce510c30f7ad52facf6a157d4b42dec83dfab4/lightfm-1.16.tar.gz (310kB)
[K     |████████████████████████████████| 317kB 6.6MB/s 
Building wheels for collected packages: li

In [3]:
import implicit
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from lightfm import LightFM

## Load data

In [9]:
# read data
test_df = pd.read_csv('flickr_test_data.csv')
train_df = pd.read_csv('flickr_train_data.csv')
valid_df = pd.read_csv('flickr_validation_data.csv')

### NDCG function
This function is used to get the NDCG score of the validation dataset, this score may have a positive relationship with the score in Kaggle.

In [10]:
# there is no rating, we treat all the rate is 1
def NDCG(valid_gt, valid_perd):
  DCG = []
  # get all the use-items
  GT = valid_gt[valid_gt['rating'] == 1]
  for i in range(len(GT)):
    # the use_id and correspoding item 
    # (only one item per person in valid data set)
    user_id = GT.iloc[i]['user_id']
    item_id = GT.iloc[i]['item_id']
    # find the items we recommended to user i
    user_item = list(valid_perd[valid_perd['user_id'] == i]['item_id'])
    # if it shows in the 15, we get the DCG
     # if not we just take 0
    if item_id in user_item:
      rank = user_item.index(item_id) + 1
      # rank 1
      if rank == 1:
        DCG.append(1)
      # rest
      else:
        DCG.append(1/math.log(rank, 2))
    else: 
      DCG.append(0)
  # the average DCG of each person is the NDCG of the recommendations
  # (trate the ground true is 1)
  return np.mean(DCG)

### Recommend funciton for ALS and LMF

In [10]:
def recommend(user_id, sparse_user_item, user_vecs, item_vecs, test_df):
    # Get the interactions scores from the sparse person content matrix
    user_interactions = sparse_user_item[user_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    user_interactions = user_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    user_interactions[user_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = user_vecs[user_id,:].dot(item_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = user_interactions * rec_vector_scaled

    # Start empty list to store items and scores
    items = []
    scores = []
    test_item_id = test_df[test_df['user_id'] == user_id]['item_id']
    for i in test_item_id:
        items.append(i)
        scores.append(recommend_vector[i])
    # record the score of all 100 items
    recommendations = pd.DataFrame({'user_id': user_id, 'item_id': items, 'score': scores})
    # get the top 15 recommend items
    recommendations = recommendations.sort_values(by='score', ascending = False)[:15]
    return recommendations


## ALS

Among all the above recommendation system models, ALS with adjusted parameters has the best performance, reaching 0.253 in the NDCG of the verification set, and getting a score of 0.22 in Kaggle in the predicted test data. The larger k is, the more accurate it will be, but the longer the calculation time will be. However, at the same time, the data set we get is not large. Blindly increasing the number of hidden variables will only lead to overfitting, which may reduce the accuracy of our recommendation.

In [181]:
#convert to sparse matrix
sparse_item_user = sparse.csr_matrix((train_df['rating'].astype(float), (train_df['item_id'], train_df['user_id'])))
sparse_user_item = sparse.csr_matrix((train_df['rating'].astype(float), (train_df['user_id'], train_df['item_id'])))

In [182]:
# The rate in which we'll increase our confidence in a preference with more interactions.
alpha = 60
data = (sparse_item_user * alpha).astype('double')

In [190]:
# create and fit the model
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=350, iterations=50)
model.fit(data)



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




### Test the NDCG score in validation dataset

In [191]:
# Get the trained person and content vectors. We convert them to csr matrices
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

user_id_list = []
item_id_list = []

for user_id in set(valid_df['user_id']):
  recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs, valid_df)
  user_id_list += list(recommendations['user_id'])
  item_id_list += list(recommendations['item_id'])
valid_rec = pd.DataFrame({'user_id': user_id_list, 'item_id': item_id_list})

print('The NDCG score in validation dataset is: ', NDCG(valid_df, valid_rec))

The NDCG score in validation dataset is:  0.25314880128637063


### Apply the model in test data

In [192]:
# Get the trained person and content vectors. We convert them to csr matrices
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

user_id_list = []
item_id_list = []

for user_id in set(test_df['user_id']):
  recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs, test_df)
  user_id_list += list(recommendations['user_id'])
  item_id_list += list(recommendations['item_id'])

ouput_df = pd.DataFrame({'user_id': user_id_list, 'item_id': item_id_list})
ouput_df.to_csv('data_df.csv',index=False)

## LMF

Compared with ALS and FM, they both carry out implicit matrix decomposition and can solve the sparse matrix decomposition problem through different algorithms. Theoretically, FM runs faster than ALS because the time complexity of the FM training step and prediction step is linear. Compared with ALS, FM's model is more general, and it can be applied to any situation with real numbers. These two models are similar, they all based on matrix decomposition, the reason of LMF only take 0.15 is the use of parameter may different between ALS.


In [12]:
# create and fit the model
model = implicit.lmf.LogisticMatrixFactorization(factors=32, regularization=350, iterations=60)
model.fit(data)

100%|██████████| 60/60 [00:03<00:00, 15.51it/s]


### Test the NDCG score in validation dataset

In [13]:
# Get the trained person and content vectors. We convert them to csr matrices
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

user_id_list = []
item_id_list = []

for user_id in set(valid_df['user_id']):
  recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs, valid_df)
  user_id_list += list(recommendations['user_id'])
  item_id_list += list(recommendations['item_id'])
valid_rec = pd.DataFrame({'user_id': user_id_list, 'item_id': item_id_list})

print('The NDCG score in validation dataset is: ', NDCG(valid_df, valid_rec))

The NDCG score in validation dataset is:  0.1479774464720333


## Light FM

The algorithm principle of Light-FM with only use training dataset is the same as the MF method. By testing different loss functions in Light-FM, we can see that LMF and Light-FM models have similar performance. I think that if I can adjust the parameters to a correct range, these two models can have the same performance as the ALS model.
The score obtained by LightFM using features of users and items is not ideal. The highest NDCG in the validation dataset with loss function BPR is only 0.059, which is inconsistent with my expectation. It may be because all our data sets are not large enough, and the prediction using features can only represent part of the time period. Or because the features does not match the training data, which will reduce the accuracy of our prediction.


In [None]:
#model = LightFM(loss='warp-kos', k=15)
#model = LightFM(loss='logistic') #0.158227
#model = LightFM(loss='bpr')#0.15682
#model.fit(data.T, user_features = user_features, item_features = item_features, epochs=10, num_threads=10)

CPU times: user 7.05 s, sys: 6.77 ms, total: 7.06 s
Wall time: 3.57 s


<lightfm.lightfm.LightFM at 0x7f7f25ff69d0>

### LightMF using train dataset

In [28]:
def test(loss):
  model = LightFM(no_components = 25, loss=loss)# 25:0.178 real 0.15565
  model.fit(data.T, epochs=30, num_threads=2)
  user_ids = list(set(valid_df['user_id']))
  item_ids = list(valid_df.groupby('user_id')['item_id'].apply(list))
  #user_ids = list(set(test_df['user_id']))
  #item_ids = list(test_df.groupby('user_id')['item_id'].apply(list))

  user_id_list = []
  item_id_list = []

  for i in user_ids:
    # get the predicted score of each item
    predictions = model.predict(i, item_ids[i], num_threads=4)

    recommendations = pd.DataFrame({'user_id': i, 'item_id': item_ids[i], 'score': predictions})
    recommendations = recommendations.sort_values(by='score', ascending = False)[:15] 
    # print(recommendations)
    user_id_list += list(recommendations['user_id'])
    item_id_list += list(recommendations['item_id'])

  valid_rec = pd.DataFrame({'user_id': user_id_list, 'item_id': item_id_list})
  print('NDCG score of LightFM using error function', loss , 'is :', NDCG(valid_df, valid_rec))

#### The NDCG score of different error function

In [29]:
loss = ['warp', 'logistic', 'warp-kos', 'bpr']
for i in loss:
  test(i)

NDCG score of LightFM using warp is : 0.17979786284622803
NDCG score of LightFM using logistic is : 0.1477172451104682
NDCG score of LightFM using warp-kos is : 0.1526017947356816
NDCG score of LightFM using bpr is : 0.17435181416276796


### LightMF using train data, item feature and user feature

In [30]:
# load the feature data
item_fea = pd.read_csv('flickr_item_fea.csv')
user_fea = pd.read_csv('flickr_user_fea.csv')

user_ids = list(set(valid_df['user_id']))
item_ids = list(valid_df.groupby('user_id')['item_id'].apply(list))
# this part is for predict base on test data
#user_ids = list(set(test_df['user_id']))
#item_ids = list(test_df.groupby('user_id')['item_id'].apply(list))

# trasform the feature dataframe into sparse matrix
user_features = sparse.csr_matrix(user_fea.values)
item_features = sparse.csr_matrix(item_fea.values)

In [31]:
def test_fea(loss):
  # The rate in which we'll increase our confidence in a preference with more interactions.
  alpha = 2 
  data = (sparse_item_user * alpha).astype('double')
  # create and fit the model
  model = LightFM(loss=loss)
  model.fit(data.T, user_features = user_features, item_features = item_features, epochs=10, num_threads=10)

  user_id_list = []
  item_id_list = []

  for i in user_ids:
    predictions = model.predict(i, item_ids[i], user_features=user_features,
                                item_features=item_features, num_threads=4)
    recommendations = pd.DataFrame({'user_id': i, 'item_id': item_ids[i], 'score': predictions})
    recommendations = recommendations.sort_values(by='score', ascending = False)[:15] 

    user_id_list += list(recommendations['user_id'])
    item_id_list += list(recommendations['item_id'])
  # get the recommendations
  valid_rec = pd.DataFrame({'user_id': user_id_list, 'item_id': item_id_list})
  print('NDCG score of LightFM using error function', loss , 'is :', NDCG(valid_df, valid_rec))

#### The NDCG score of different error function

In [33]:
for i in loss:
  test_fea(i)

NDCG score of LightFM using error function warp is : 0.04878550597605831
NDCG score of LightFM using error function logistic is : 0.05012234405657364
NDCG score of LightFM using error function warp-kos is : 0.050443260893844336
NDCG score of LightFM using error function bpr is : 0.059241248193599674


## Neural Network Model

The accuracy of the neural network is only about 0.6 and is very unstable, sometimes the training error is 0.4, but in the validation set NDCG is only 0.1, this may be because I use only two hidden layers neural network, and only 10 hidden neurons, compared to the training set in the neural network is small, it is also the cause of the result is not stable. Another reason is that neural network needs a large amount of data for training, and our data set is too small to complete the training of the large-scale neural network, which is also the reason why I did not choose to use a neural network.


In [11]:
# Note here there is no matrix multiplication, we could potentially make the embeddings of different sizes.
# Here we could get better results by keep playing with regularization.
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [12]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(train_df.user_id.values) # .cuda()
        items = torch.LongTensor(train_df.item_id.values) #.cuda()
        ratings = torch.FloatTensor(train_df.rating.values) #.cuda()
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) 
    y_hat = test_loss(model, unsqueeze)

In [13]:
# this function give the loss in validation data
def test_loss(model, unsqueeze=False):
    model.eval()
    # turn the matix to tensor
    users = torch.LongTensor(valid_df.user_id.values) #.cuda()
    items = torch.LongTensor(valid_df.item_id.values) #.cuda()
    ratings = torch.FloatTensor(valid_df.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # predict the score of each item
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [14]:
# this function take the model and output the 15 recommendations of valid data
def nn_recom(model, unsqueeze=False):
  user_id_list = []
  item_id_list = []
  for i in range(num_users):
    items = torch.LongTensor(valid_df[valid_df['user_id'] ==i].item_id.values)
    users = torch.LongTensor(valid_df[valid_df['user_id'] ==i].user_id.values)
    scores = model(users, items)
    # record the score of all 100 items
    recommendations = pd.DataFrame({'user_id': users.tolist(), 'item_id': items.tolist(), 'score': scores.tolist()})
    # get the top 15 recommend items
    recommendations = recommendations.sort_values(by='score', ascending = False)[:15]
    user_id_list += list(recommendations['user_id'])
    item_id_list += list(recommendations['item_id'])
  valid_rec = pd.DataFrame({'user_id': user_id_list, 'item_id': item_id_list})
  return valid_rec

In [15]:
# get the number of users and the number of items
num_users = len(set(list(train_df.user_id)))
num_items = len(set(list(train_df.item_id)))

### The NDCG score of NN in valid dataset

In [16]:
# create model
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
# train model
train_epocs(model, epochs=10, lr=0.1,  wd=3, unsqueeze=True)
# get the 15 recommendations
predict = nn_recom(model)

0.6616938710212708
3.9662060737609863
0.5383036732673645
1.099510908126831
0.8639771342277527
0.44130101799964905
0.1430162638425827
0.10928338021039963
0.36615920066833496
0.6584334373474121
test loss 0.085 


In [17]:
NDCG(valid_df, predict)

0.06204013904567362

## Conclusion

After comparing the NDCG score of the validation set and the half test set on Kagge, at present, ALS has the best performance with the NDCG score of 0.22. However, I think MF and Light-Fm only use the training set, can get a good score after adjusting the parameters because they are both based on the matrix decomposition method. For the Light-FM using user-item features, we cannot prove that the features are related to the training set. However, I think it will save time if features are used to predict in a large-scale recommendation system, and the accuracy can be improved by combining the user and item features with the training data. In the neural network model, due to our lack of data and use of a simple neural network, we can only get a low NDCG score.


# Task 2

## Import packages

In [21]:
!pip install Node2Vec
from node2vec import Node2Vec



In [22]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import scipy as sp
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
import pandas as pd
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform
import torch
from sklearn.metrics.cluster import normalized_mutual_info_score
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f261a005150>

## Read data

In [40]:
# get the document and same into a dictionary
docs = open('docs.txt')
docs_dic = {}
for i in docs.readlines():
  docs_dic[int(i.split(' ', 1)[0])] = i.split(' ', 1)[1].strip()
docs.close()

In [41]:
# show the dict
list(docs_dic.items())[:5]

[(12828558,
  'Assessing Local Institutional Capacity, Data Availability, and Outcomes by'),
 (66779408,
  'THE PROSPECTS FOR INTERNET TELEPHONY IN EUROPE AND LATIN AMERICA TPP 127 Telecom Modeling and Policy Analysis'),
 (38902949,
  'Economic Shocks, Safety Nets, and Fiscal Constraints: Social Protection for the Poor in Latin America'),
 (33450563, 'Reform, Growth, and Poverty in Vietnam'),
 (57470294,
  'Households and Economic Growth in Latin America and the Caribbean')]

In [42]:
# get the ture label of each node
labels_txt = open('labels.txt')
node_list = []
lable_list = []
for i in labels_txt.readlines():
  node_list += [int(i.strip().split(' ')[0])]
  lable_list+= [int(i.strip().split(' ')[1])]
labels_df = pd.DataFrame({'node': node_list, 'true_label': lable_list})       
labels_txt.close()

# get the true lable list
true_label = list(labels_df['true_label'])

In [43]:
# we get the k clusters we need to clustering from true label
k = len(set(list(labels_df['true_label'])))

## K-means fucniton

In [67]:
# take the embedding matrix as the input
# output the predicted lables by kmeans 
def kmeans(X):
  kmeans = KMeans(init='k-means++', n_clusters=k, n_init=50)
  kmeans.fit_predict(X)
  centroids = kmeans.cluster_centers_
  # get the clustering lables
  labels = kmeans.labels_
  return labels

## Create graph

In [45]:
G = nx.read_adjlist('adjedges.txt', create_using = nx.DiGraph(), nodetype = int)
# make the graph to undirected graph
G = nx.to_undirected(G)

In [94]:
print('There are', len(G.nodes()), 'nodes,', 'and', len(G.edges()), 'edges in graph G.')

There are 36928 nodes, and 54328 edges in graph G.


In [92]:
# check how many sub-graph in G
i = 0
for subg in nx.connected_components(G):
  i += 1
print('There are', i, 'numbers of sub-graph in G.')

There are 10440 numbers of sub-graph in G.


There are 36928 nodes in the graph, but the are only 18720 nodes we need, we need to extract them form the embedding, then use kmeans to predict the label to do the clusering.

In [46]:
# add the node id then left join two dataframe then remove the id 
# as the embedding matrix to apply kmeans
def getdocnode(embedding_vec):
  vec = pd.DataFrame(embedding_vec)
  # all the nodes in the graph
  all_nodes = pd.DataFrame(G.nodes, columns=['node_id'])
  # link the node and predicted label
  all_nodes_vec = pd.concat([all_nodes, vec],axis=1)
  doc_ids = pd.DataFrame(list(docs_dic.keys()), columns=['node_id'])
  # get the node we use(documents)
  node_vec = doc_ids.merge(all_nodes_vec, on = 'node_id', how = 'left').iloc[:,1:]
  return node_vec

 ## Spectral Clustering

### Get the eigenvalues and eigenvectors of the matrix

In [47]:
# get the laplacian matrix
L = nx.laplacian_matrix(G).astype(float)
# compute eigenvalues and eigenvectors of the matrix.
w,v = sp.sparse.linalg.eigsh(L, k = 5, which='SM')
X = w * v

### Apply kmeans

In [48]:
# take only nodes shows in the document id
node_vec_sc = getdocnode(X)

In [49]:
# get the lables predicted by kmeans
labels_sc = kmeans(node_vec_sc)

### The nmi score

In [50]:
# calculate the nmi score
nmi_sc = normalized_mutual_info_score(true_label, labels_sc)
print('the nmi score of Spectral Clustering is: ', nmi_sc)

the nmi score of Spectral Clustering is:  0.056940878168318824


## Node clustering

### node2vec

In [51]:
#pre-compute the probabilities and generate walks :
node2vec = Node2Vec(G, dimensions=64, walk_length=10, num_walks=5, workers=16)
# nodes embedding
model_node2vec = node2vec.fit(window=10, min_count=2, batch_words=10)

HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=36928.0, style=P…




### Apply kmeans

In [52]:
# get the embedding matrix we use
node_vec = getdocnode(model_node2vec.wv.vectors)
# get the lables predicted by kmeans
labels_node2vec = kmeans(node_vec)

### The nmi score

In [54]:
# comput the nmi score
nmi_sc = normalized_mutual_info_score(true_label, list(labels_node2vec))
print('The nmi score of Node embedding with k-means is: ', nmi_sc)

The nmi score of Node embedding with k-means is:  0.3024426634326591


## Text clustering

### Tokenize and preprocessing

In [55]:
# to store the tokens of each title
titles = []
for text in list(docs_dic.values()):
    # get only word in the text, this step can remove number
    tokens = tokenizer.tokenize(text.lower())
    # stemming
    stem = [stemmer.stem(w) for w in tokens]
    titles.append(stem)

In [56]:
# show the tokens
titles[0]

['assess',
 'local',
 'institut',
 'capac',
 'data',
 'avail',
 'and',
 'outcom',
 'by']

### word2vec

This function is use to turn word embedding to sentence embedding.

In [70]:
def sent_vectorizer(sent, model):
    sent_vec =[]
    numw = 0
    for w in sent:
        try:
            if numw == 0:
                sent_vec = model[w]
            else:
                sent_vec = np.add(sent_vec, model[w])
            numw+=1
        except:
            pass
    return np.asarray(sent_vec) / numw

In [71]:
# train the word2vec model
model_word2vec = Word2Vec(titles, min_count=0)

In [72]:
# the embedding matrix of document
text_vec_list = []
for sentence in titles:
    text_vec_list.append(sent_vectorizer(sentence, model_word2vec))
# turn the array in array to a dataframe
text_vec_df = pd.DataFrame(text_vec_list)
# turn the NaN values to number
text_vec = np.nan_to_num(text_vec_df)

  import sys
  if __name__ == '__main__':


### Apply kmeans

In [60]:
# apply kmeans
labels_text = kmeans(text_vec)

### The nmi score

In [61]:
# comput the nmi score
nmi_sc = normalized_mutual_info_score(true_label, list(labels_text))
print('The nmi score of text clustering with k-means is: ', nmi_sc)

The nmi score of text clustering with k-means is:  0.08210659584960936


## Text clustering + Node clustering
We have the embedding matrix base on the graph with embedding size 64.

In [62]:
node_vec.shape

(18720, 64)

And the embedding matrix using title of each document with embedding size 100.

In [63]:
text_vec_df.shape

(18720, 100)

Now we can just column bind those two dataframe and the apply kmeans.

In [64]:
text_and_node_vec = pd.concat([text_vec_df, node_vec],axis=1)
text_and_node_vec.shape

(18720, 164)

In [68]:
# turn the NaN values to number
text_and_node_vec = np.nan_to_num(text_and_node_vec)
label_comb = kmeans(text_and_node_vec)

### The nmi score

In [69]:
nmi_sc = normalized_mutual_info_score(true_label, label_comb)
print('The nmi score of the combination of text clustering and node clustering is: ', nmi_sc)

The nmi score of the combination of text clustering and node clustering is:  0.2993665902939315
