# 1. Import libraries

In [None]:
import numpy as np
import networkx as nx
import pandas as pd
import scipy
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [None]:
from google.colab import drive
import os, io


# Mount Google Drive at /content/drive (the path passed to drive.mount)
drive.mount('/content/drive')

# Make the project folder the working directory so that relative paths used
# later (e.g. the .gexf files written with nx.write_gexf) resolve inside it
os.chdir('/content/drive/MyDrive/prompt_recc')

Mounted at /content/drive


## 2. Import data

## 2 (a) Import cosine similarity matrices as NumPy arrays

In [None]:
# Load the pre-computed cosine similarity matrices of models 1 and 2.
# Each file is a comma-separated matrix; the files are read in the listed order.
model1_Tf_sim, model1_CV_sim, model2_Tf_sim, model2_CV_sim = (
    np.loadtxt(csv_path, delimiter = ',')
    for csv_path in (
        '/content/drive/MyDrive/prompt_recc/model1_Tf_sim.csv',
        '/content/drive/MyDrive/prompt_recc/model1_CV_sim.csv',
        '/content/drive/MyDrive/prompt_recc/model2_Tf_sim.csv',
        '/content/drive/MyDrive/prompt_recc/model2_CV_sim.csv',
    )
)

In [None]:
# Load the pre-computed cosine similarity matrices of the three model-3 variants
# (word2vec, glove, bert), one comma-separated file each, in that order.
model3_word2vec_sim, model3_glove_sim, model3_bert_sim = (
    np.loadtxt(csv_path, delimiter = ',')
    for csv_path in (
        '/content/drive/MyDrive/prompt_recc/model3_word2vec_sim.csv',
        '/content/drive/MyDrive/prompt_recc/model3_glove_sim.csv',
        '/content/drive/MyDrive/prompt_recc/model3_bert_sim.csv',
    )
)

## 2 (b) Import cosine similarity matrix as a DataFrame

In [None]:
# Read Harshika's similarity matrix from csv into a DataFrame; it is
# preprocessed in the following cells before a graph is built from it.
harshika_csv_path = '/content/drive/MyDrive/prompt_recc/similarity_matrix_harshika_model.csv'
df = pd.read_csv(harshika_csv_path)

# Inspect the column types
df.dtypes


Unnamed: 0      int64
0             float64
1             float64
2             float64
3             float64
               ...   
4995          float64
4996          float64
4997          float64
4998          float64
4999          float64
Length: 5001, dtype: object

In [None]:
# Drop the id column ("Unnamed: 0").
# The column is removed by name, without inplace, and with errors='ignore',
# so re-running this cell is a no-op instead of silently dropping the first
# real similarity column (which a positional, in-place drop would do).
df = df.drop(columns = 'Unnamed: 0', errors = 'ignore')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,1.0,0.335868,0.340639,0.333695,0.319815,0.336406,0.319133,0.33948,0.288963,0.347205,...,0.307803,0.397171,0.280548,0.351371,0.312033,0.351212,0.351535,0.333856,0.368328,0.316776
1,0.335868,1.0,0.293735,0.350205,0.257142,0.389351,0.332074,0.39752,0.295801,0.418522,...,0.394574,0.360643,0.324604,0.395757,0.353504,0.43643,0.37494,0.367746,0.366214,0.317529
2,0.340639,0.293735,1.0,0.337351,0.34295,0.307485,0.3156,0.310919,0.311884,0.279518,...,0.304241,0.299493,0.267555,0.302531,0.278214,0.31426,0.322356,0.332732,0.313556,0.391975
3,0.333695,0.350205,0.337351,1.0,0.316604,0.353427,0.380974,0.352612,0.387483,0.322822,...,0.386006,0.310978,0.279548,0.321343,0.331737,0.347738,0.361557,0.349593,0.402199,0.318103
4,0.319815,0.257142,0.34295,0.316604,1.0,0.268562,0.307682,0.25103,0.309974,0.255252,...,0.265501,0.272668,0.298859,0.268792,0.275969,0.297623,0.283812,0.281741,0.325842,0.373789


In [None]:
# convert dataframe to numpy
harshika_sim = df.to_numpy()

# A similarity matrix has to be square. Fail here, with a clear message, if the
# id column is still present or a data column was dropped by mistake, rather
# than building a graph from misaligned rows and columns later on.
assert harshika_sim.ndim == 2 and harshika_sim.shape[0] == harshika_sim.shape[1], (
    f"expected a square similarity matrix, got shape {harshika_sim.shape}"
)
harshika_sim

array([[1.        , 0.33586833, 0.34063928, ..., 0.33385556, 0.3683284 ,
        0.31677635],
       [0.33586833, 1.        , 0.29373521, ..., 0.36774582, 0.36621418,
        0.31752912],
       [0.34063928, 0.29373521, 1.        , ..., 0.33273187, 0.31355561,
        0.3919748 ],
       ...,
       [0.33385556, 0.36774582, 0.33273187, ..., 1.        , 0.35550381,
        0.34482629],
       [0.3683284 , 0.36621418, 0.31355561, ..., 0.35550381, 1.        ,
        0.27278235],
       [0.31677635, 0.31752912, 0.3919748 , ..., 0.34482629, 0.27278235,
        1.        ]])

## 2 (c) Import YOLO results and then calculate the similarity matrix

In [None]:
# Read the YOLO feature extraction results from csv; they are preprocessed
# in the following cells.
yolo_results_csv_path = '/content/drive/MyDrive/prompt_recc/Yolo/YOLO_results.csv'
df3 = pd.read_csv(yolo_results_csv_path)
df3.head()

Unnamed: 0.1,Unnamed: 0,filenames,scores,classes,class label
0,0,im_in_0.png,[ [0.98582095 0.3349454 0.30242687 0. ...,[ [ 0. 77. 59. 0. 0. 0. 0. 0. 0. 0. 0....,[person bicycle car]
1,1,im_in_1.png,[ [0.8281204 0. 0. 0. 0. ...,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[person]
2,2,im_in_10.png,[ [0.6871961 0.44457653 0. 0. ...,[ [ 0. 59. 0. 0. 0. 0. 0. 0. 0. 0. 0....,[person bicycle]
3,3,im_in_100.png,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[]
4,4,im_in_1000.png,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[]


In [None]:
# extract id from filenames
# (work on an independent copy so the frame read from csv, df3, is not modified)
df_yolo = df3.copy(deep = True)
def extract_id(row):
  """Return the integer image id encoded in a row's ``filenames`` value.

  The filename is split on ``'_'`` and the third piece is taken, then
  everything from the first ``'.'`` on is discarded, e.g.
  ``'im_in_1000.png'`` -> ``1000``.

  Parameters
  ----------
  row : mapping-like (e.g. a DataFrame row passed by ``DataFrame.apply``)
      Must provide a ``'filenames'`` entry holding the file name string.

  Returns
  -------
  int
      The numeric id parsed from the file name.

  Raises
  ------
  IndexError
      If the file name contains fewer than two underscores.
  ValueError
      If the extracted piece is not an integer.
  """
  # Look the value up by column label: integer-position access such as
  # ``row[1]`` on a label-indexed Series is deprecated in pandas 2.x and
  # silently depends on the column order of the csv.
  filename = row['filenames']
  id_with_extension = filename.split('_')[2]
  return int(id_with_extension.split('.')[0])
# Run extract_id on every row and keep the result in a new 'id' column
df_yolo['id'] = df_yolo.apply(extract_id, axis = 'columns')

# convert string of numpy array, scores column, to numpy array
def extract_np_array(row):
  """Parse a row's ``scores`` string into a 1-D float numpy array.

  The string is the printed form of a numpy array, e.g.
  ``'[ [0.98582095 0.3349454 0. ]]'``. All square brackets are removed and
  the remaining whitespace-separated tokens are converted to floats.

  Parameters
  ----------
  row : mapping-like (e.g. a DataFrame row passed by ``DataFrame.apply``)
      Must provide a ``'scores'`` entry holding the array string.

  Returns
  -------
  numpy.ndarray
      1-D array of dtype float; empty if the string holds no numbers.

  Raises
  ------
  ValueError
      If a token cannot be converted to float.
  """
  # Label lookup instead of the deprecated, order-dependent ``row[2]``.
  scores = row['scores']
  # Remove every bracket instead of slicing off a fixed number of characters:
  # the fixed slice ``[3:-2]`` only works for the exact "[ [" / "]]" layout and
  # would cut into the first number for e.g. "[[1. 2.]]".
  without_brackets = scores.replace('[', ' ').replace(']', ' ')
  # str.split() with no argument splits on any whitespace run, newlines included.
  return np.array(without_brackets.split(), dtype = float)

# Run extract_np_array on every row and keep the result in a new 'np_scores' column
df_yolo['np_scores'] = df_yolo.apply(extract_np_array, axis = 'columns')
df_yolo.head()


Unnamed: 0.1,Unnamed: 0,filenames,scores,classes,class label,id,np_scores
0,0,im_in_0.png,[ [0.98582095 0.3349454 0.30242687 0. ...,[ [ 0. 77. 59. 0. 0. 0. 0. 0. 0. 0. 0....,[person bicycle car],0,"[0.98582095, 0.3349454, 0.30242687, 0.0, 0.0, ..."
1,1,im_in_1.png,[ [0.8281204 0. 0. 0. 0. ...,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[person],1,"[0.8281204, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,2,im_in_10.png,[ [0.6871961 0.44457653 0. 0. ...,[ [ 0. 59. 0. 0. 0. 0. 0. 0. 0. 0. 0....,[person bicycle],10,"[0.6871961, 0.44457653, 0.0, 0.0, 0.0, 0.0, 0...."
3,3,im_in_100.png,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[],100,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,im_in_1000.png,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[ [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0...,[],1000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Keep only the id and np_scores columns, sort the rows by id, and renumber
# the index in the sorted order.
# Done as one chain without inplace=True: sorting a column selection in place
# triggers pandas' SettingWithCopyWarning, and the chain also makes the cell
# safe to re-run.
df_yolo2 = (
    df_yolo[['id', 'np_scores']]
    .sort_values(by = ['id'])
    .reset_index(drop = True)
)
df_yolo2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_yolo2.sort_values(by = ['id'], inplace = True)


Unnamed: 0,id,np_scores
0,0,"[0.98582095, 0.3349454, 0.30242687, 0.0, 0.0, ..."
1,1,"[0.8281204, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,2,"[0.61112, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
3,3,"[0.2824024, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,4,"[0.8000197, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
# Number of YOLO features per image: length of the score array in the row labelled 0
len(df_yolo2.loc[0, 'np_scores'])

50

In [None]:
# Stack the per-image score arrays into one array (one row per image)
yolo_emb = np.asarray(df_yolo2['np_scores'].tolist())

# Pairwise cosine similarity between all rows, computed on a CSR sparse
# version of the array.
# NOTE(review): the displayed result contains an all-zero row, diagonal
# included - presumably an image whose score vector is all zeros; confirm
# that this is intended before thresholding.
feat_sparse = sparse.csr_matrix(yolo_emb)
yolo_sim = cosine_similarity(feat_sparse)
yolo_sim

array([[1.        , 0.90925971, 0.90925971, ..., 0.94918364, 0.        ,
        0.90925971],
       [0.90925971, 1.        , 1.        , ..., 0.8870462 , 0.        ,
        1.        ],
       [0.90925971, 1.        , 1.        , ..., 0.8870462 , 0.        ,
        1.        ],
       ...,
       [0.94918364, 0.8870462 , 0.8870462 , ..., 1.        , 0.        ,
        0.8870462 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.90925971, 1.        , 1.        , ..., 0.8870462 , 0.        ,
        1.        ]])

# 3. Build a graph
### Let threshold = 0.7. For example, if cosine similarity(i, j) >= 0.7, then there is a link between i and j whose weight w is that cosine similarity.

In [None]:
def create_graph(correlation_matrix, threshold, include_isolated=False):
    """Build an undirected weighted graph from a square similarity matrix.

    For every pair ``i < j`` whose entry ``correlation_matrix[i][j]`` is
    ``>= threshold`` an edge ``(i, j)`` is added with that entry as its
    ``weight``. Only the strict upper triangle is read, so the diagonal never
    produces self-loops and the lower triangle is ignored.

    Parameters
    ----------
    correlation_matrix : array-like of shape (n, n)
        Pairwise similarity values; row/column position is used as node id.
    threshold : float
        Inclusive lower bound an entry must reach to become an edge.
    include_isolated : bool, default False
        If True, all ``n`` nodes are added even when they have no edge.
        With the default, nodes without any qualifying edge are absent from
        the graph, which affects node-count based statistics such as density
        and average degree.

    Returns
    -------
    networkx.Graph
        Graph whose edges carry a ``weight`` attribute.

    Raises
    ------
    ValueError
        If a non-empty ``correlation_matrix`` is not a square 2-D array.
    """
    matrix = np.asarray(correlation_matrix)
    G = nx.Graph()

    # An empty input yields an empty graph.
    if matrix.size == 0:
        return G
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError(
            f"correlation_matrix must be square, got shape {matrix.shape}"
        )

    if include_isolated:
        G.add_nodes_from(range(matrix.shape[0]))

    # Vectorised replacement for the nested Python loops: select all (i, j)
    # with i < j and value >= threshold in one pass. np.nonzero returns the
    # positions in row-major order, i.e. the same order the loops visited them.
    rows, cols = np.nonzero(np.triu(matrix >= threshold, k=1))
    # .tolist() turns numpy scalars into plain Python int/float for node ids
    # and weights.
    G.add_weighted_edges_from(
        zip(rows.tolist(), cols.tolist(), matrix[rows, cols].tolist())
    )
    return G

In [None]:
# Build one graph per text similarity matrix, all with the same threshold.
# The matrices are processed in the listed order.
thresh = 0.7
(
    graph_model1_Tf,
    graph_model1_CV,
    graph_model2_Tf,
    graph_model2_CV,
    graph_model3_w2v,
    graph_model3_glo,
    graph_model3_bert,
) = (
    create_graph(sim_matrix, thresh)
    for sim_matrix in (
        model1_Tf_sim,
        model1_CV_sim,
        model2_Tf_sim,
        model2_CV_sim,
        model3_word2vec_sim,
        model3_glove_sim,
        model3_bert_sim,
    )
)

In [None]:
# Graph for Harshika's similarity matrix, using the 0.7 threshold
thresh = 0.7
graph_harshika = create_graph(correlation_matrix = harshika_sim, threshold = thresh)


In [None]:
# For YOLO, 0.7 gives a graph with 0.99 density.
# So, we increase threshold only for YOLO and CLIP
# (only the YOLO graph is built in this cell).
thresh = 0.9
graph_yolo = create_graph(correlation_matrix = yolo_sim, threshold = thresh)


# 4. Save the graph as gexf file

In [None]:
# Save every graph as a gexf file.
# The file names are relative, so the files are written to the current
# working directory.
for _graph, _gexf_name in (
    (graph_model1_Tf, "graph_model1_Tf.gexf"),
    (graph_model1_CV, "graph_model1_CV.gexf"),
    (graph_model2_Tf, "graph_model2_Tf.gexf"),
    (graph_model2_CV, "graph_model2_CV.gexf"),
    (graph_model3_w2v, "graph_model3_w2v.gexf"),
    (graph_model3_glo, "graph_model3_glo.gexf"),
    (graph_model3_bert, "graph_model3_bert.gexf"),
    (graph_yolo, "graph_yolo_09.gexf"),
    (graph_harshika, "graph_harshika.gexf"),
):
    nx.write_gexf(_graph, _gexf_name)

# 5. Open gexf file in Gephi software



### 1. Download Gephi from [this link](https://gephi.org/users/download/). A quick-start tutorial is also available [here](https://gephi.org/tutorials/gephi-tutorial-quick_start.pdf).
### 2. Open Gephi and then open a gexf file.
### 3. Open the 'Statistics' window to calculate the average degree, average weighted degree, graph density, and average clustering coefficient.
### 4. Run 'Modularity' under community detection. The default settings were used: randomized, weights used, a resolution of 1, and classes starting at 0. Gephi implements the Louvain method for community detection, as described in the tutorial.
### 5. Go to Appearance tab > Nodes > Partition module. Then choose modularity class as the attribute to color nodes by their communities.
### 6. Set the size of the nodes: go to Appearance tab > Nodes > Ranking. Then choose degree as the attribute to differentiate each node's size by its degree. The min and max node sizes were varied depending on the graph's density, the number of nodes, and the number of edges.
### 7. Go to Layout, choose the random layout with a space size of 1000, and then hit the run button.
### 8. Then, in the same "Layout" tab, choose the Yifan Hu Proportional layout and play with the optimal distance to obtain the desired graph layout. It might group nodes of the same community into clusters, which is what we want.
### 9. If some communities are scattered, manually drag the nodes so that nodes with the same color form a cluster.
### 10. Go to the 'Preview' tab and select show labels with a font size of 4 in the Preview settings. Then click the refresh button and export the graph as a pdf file. Exporting failed for graphs with high density due to lack of memory. For those graphs a screenshot was taken instead, which has much lower quality than the pdf file.