# 1. Install and import libraries

In [None]:
!pip install Pillow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
import os, io
from PIL import Image
import scipy
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
drive.mount('/content/drive')
parent_folder = '/content/drive/MyDrive/prompt_recc'
os.chdir(parent_folder)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. Load data

In [None]:
# Load raw data to see images later
df = pd.read_csv("prompt_with_img_path.csv")
df.head()

Unnamed: 0,id,prompt,seed,step,cfg,sampler,width,height,user_name,timestamp,image_nsfw,prompt_nsfw,image_path
0,0,"a renaissance portrait of dwayne johnson, art ...",2480545905,50,16.0,k_euler_ancestral,512,768,e9dfc969d22cb9c5621ad075b3826c28f18ef3840c6dda...,2022-08-20 05:28:00+00:00,0.163488,0.000793,/content/drive/MyDrive/prompt_recc/Hyelim_firs...
1,1,"portrait of a dancing eagle woman, beautiful b...",2250159284,50,9.0,k_lms,512,640,aa60a36693bf9e079b421c1200fd42ed94061fb02078ad...,2022-08-20 05:28:00+00:00,0.27665,0.00309,/content/drive/MyDrive/prompt_recc/Hyelim_firs...
2,2,"epic 3 d, become legend shiji! gpu mecha contr...",4292948605,50,7.0,k_lms,512,768,3e774662f24c78b0590d57365d755d7dacde8f33dbdc56...,2022-08-20 05:28:00+00:00,0.090421,0.000533,/content/drive/MyDrive/prompt_recc/Hyelim_firs...
3,3,an airbrush painting of cyber war machine scen...,2374713726,50,12.0,k_lms,512,768,c2bd0b60caa0aa04d0a7e5972fefc60b21ece352d88353...,2022-08-20 05:29:00+00:00,0.078309,0.000597,/content/drive/MyDrive/prompt_recc/Hyelim_firs...
4,4,concept art of a silent hill monster. painted ...,2320897141,50,6.0,k_lms,640,512,08ac4153ab7e13baa55ee74213eb117a29a0024267b29e...,2022-08-20 05:29:00+00:00,0.086802,0.083516,/content/drive/MyDrive/prompt_recc/Hyelim_firs...


In [None]:
# Load text only data with words count
prompt = pd.read_csv("clean_text_only.csv")
prompt.head()

Unnamed: 0,id,prompt,no_emojis,after_stemming,word_count
0,0,"a renaissance portrait of dwayne johnson, art ...","a renaissance portrait of dwayne johnson, art ...",renaiss portrait dwayn johnson art style rembr...,27
1,1,"portrait of a dancing eagle woman, beautiful b...","portrait of a dancing eagle woman, beautiful b...",portrait danc eagl woman beauti blond hair lak...,36
2,2,"epic 3 d, become legend shiji! gpu mecha contr...","epic 3 d, become legend shiji! gpu mecha contr...",epic becom legend shiji gpu mecha control tele...,26
3,3,an airbrush painting of cyber war machine scen...,an airbrush painting of cyber war machine scen...,airbrush paint cyber war machin scene area des...,15
4,4,concept art of a silent hill monster. painted ...,concept art of a silent hill monster. painted ...,concept art silent hill monster paint edward h...,8


# 3. Feature extraction from TfidVectorizer, or CountVectorizer

## 3 (a) TfidVectorizer

In [None]:
# Build TfidVectorizer
v = TfidfVectorizer()
x = v.fit_transform(prompt['after_stemming'])
embedding = x.toarray()

In [None]:
# Convert features from TfidVectorizer
df_Tf = pd.DataFrame(x.toarray(), columns=v.get_feature_names_out())
df_Tf.head()

Unnamed: 0,aardman,aaron,ab,abandon,abbott,abdomen,abel,abercrombi,abomin,aborigin,...,zuckerberg,zull,zulu,zurbaran,ʻaʻa,беляева,сat,светлана,サイハーハンク,ハイネ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Check is there is any null values for each feature
df_Tf.isnull().values.any()

False

In [None]:
# Check scales for each feature
max_values = df_Tf.max().to_numpy()
print(max(max_values))
print(min(max_values))

1.0
0.1448367459280097


Since each feature has a maximum value in range from 0 to 1, we don't need standardization.

## 3 (b) CounterVectorizer

In [None]:
# Build CountVectorizer
v1 = CountVectorizer()
x1 = v1.fit_transform(prompt['after_stemming'])
embedding1 = x1.toarray()

In [None]:
# Convert features from CounterVectorizer
df_CV = pd.DataFrame(embedding1, columns=v1.get_feature_names_out())
df_CV.head()

Unnamed: 0,aardman,aaron,ab,abandon,abbott,abdomen,abel,abercrombi,abomin,aborigin,...,zuckerberg,zull,zulu,zurbaran,ʻaʻa,беляева,сat,светлана,サイハーハンク,ハイネ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Check scales for each feature
max_values1 = df_CV.max().to_numpy()
print(max(max_values1))
print(min(max_values1))

14
1


The scale varies in order of 1 to 10. Therefore, we apply min-max standardization for each column.

In [None]:
# Apply min-max standardization
df_CV = df_CV.apply(lambda x: (x-x.min())/(x.max() - x.min()), axis = 1)


In [None]:
# Check the scale again
max_values1 = df_CV.max().to_numpy()
print(max(max_values1))
print(min(max_values1))

1.0
0.14285714285714285


In [None]:
# Check is there is any null values for each feature
df_CV.isnull().values.any()

True

In [None]:
# Check is there is any null values for each feature
df_CV.fillna(0, inplace = True)
df_CV.isnull().values.any()

False

In [None]:
#count = np.count_nonzero(df_temp, axis = 0)
#stemmed_words = v.get_feature_names_out()

#import matplotlib.pyplot as plt
#%matplotlib inline
#fig = plt.figure()
#ax = fig.add_axes([0,0,1,1])

#ax.bar(stemmed_words,count)
#plt.show()

# 4. Add 'word count' feature for each Vectorizer 

## 4 (a) Min-Max standardization for world_count

In [None]:
# Extract data
df3 = prompt[['id', 'word_count']]

# min-max normalization for word_count
df3['scaled_word_count'] = (df3['word_count'] - df3['word_count'].min())/(df3['word_count'].max() - df3['word_count'].min())
df3.drop(['word_count'], axis = 1, inplace = True)
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['scaled_word_count'] = (df3['word_count'] - df3['word_count'].min())/(df3['word_count'].max() - df3['word_count'].min())
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(['word_count'], axis = 1, inplace = True)


Unnamed: 0,id,scaled_word_count
0,0,0.553191
1,1,0.744681
2,2,0.531915
3,3,0.297872
4,4,0.148936


## 4 (b) Add word_count column to each TfidVectorizer and CounterVectorizer feature

In [None]:
# Copy data frames
df4 = df_Tf.copy()
df5 = df_CV.copy()

# Create index column for merging 
df4['id'] = df4.index
df5['id'] = df5.index
first_column = df4.pop('id')
first_column1 = df5.pop('id')
df4.insert(0, 'id', first_column)
df5.insert(0, 'id', first_column1)

# Merge word count column to TfidVectorizer
df_Tf_wc = df4.merge(df3, how = 'left')

# Merge word count column to CounterVectorizer
df_CV_wc = df5.merge(df3, how = 'left')

# drop id column to calculate cosine similarities in the next step
df_Tf_wc.drop(['id'], axis = 1, inplace = True)
df_CV_wc.drop(['id'], axis = 1, inplace = True)

# check after the merge
df_Tf_wc.head()

Unnamed: 0,aardman,aaron,ab,abandon,abbott,abdomen,abel,abercrombi,abomin,aborigin,...,zull,zulu,zurbaran,ʻaʻa,беляева,сat,светлана,サイハーハンク,ハイネ,scaled_word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553191
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.744681
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531915
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.297872
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148936


In [None]:
# check after the merge for counter vectorizer features
df_CV_wc.head()

Unnamed: 0,aardman,aaron,ab,abandon,abbott,abdomen,abel,abercrombi,abomin,aborigin,...,zull,zulu,zurbaran,ʻaʻa,беляева,сat,светлана,サイハーハンク,ハイネ,scaled_word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.553191
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.744681
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531915
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.297872
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148936


In [None]:
# Replace na in word_count to 0
df_Tf_wc['scaled_word_count'].fillna(0, inplace = True)
df_CV_wc['scaled_word_count'].fillna(0, inplace = True)

In [None]:
# Check if there are any null values
print(df_Tf_wc[df_Tf_wc['scaled_word_count'].isnull()])
print(df_CV_wc[df_CV_wc['scaled_word_count'].isnull()])

Empty DataFrame
Columns: [aardman, aaron, ab, abandon, abbott, abdomen, abel, abercrombi, abomin, aborigin, abramov, absolut, abstract, absurdli, abyss, acacia, academ, academia, academic, acadia, accent, accept, accessori, accid, accord, accur, acid, ackerman, acril, across, acryl, act, action, actor, actress, actshulli, ad, adam, adamu, addam, adebanji, adeptu, adequ, adern, adida, adler, admir, adob, adolf, adolph, ador, adream, adriaen, adrian, adriana, adult, advanc, adventur, advert, advertis, aenami, aerosmith, aesthet, aether, af, afat, affandi, afghan, afraid, africa, african, afro, afrofutur, afropunk, afshar, agar, agav, age, aggress, agoni, agress, agusia, ai, aid, aidaprima, air, airbrush, airi, airplan, airship, aivazovski, aix, aizom, akb, akihiko, akira, al, alabama, alad, aladdin, ...]
Index: []

[0 rows x 6553 columns]
Empty DataFrame
Columns: [aardman, aaron, ab, abandon, abbott, abdomen, abel, abercrombi, abomin, aborigin, abramov, absolut, abstract, absurdli, abyss

# 5. Calculate similarities between prompts

## 5 (a) Define a function to calculate similarities

In [None]:
# Reference: https://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
def calculate_cosine_sim(dataframe):
  features = dataframe.to_numpy()
  feat_sparse = sparse.csr_matrix(features)
  similarities = cosine_similarity(feat_sparse)
  return similarities

## 5 (b) Obtain cosine similarities matrix for each feature extraction method

In [None]:
# Obtain cosine similarity matrix for TfidVectorizer
sim_Tf = calculate_cosine_sim(df_Tf)

# Obtain cosine similarity matrix for CounterVectorizer
sim_CV = calculate_cosine_sim(df_CV)

# Obtain cosine similarity matrix for TfidVectorizer with word count
Tf_wc_sim = calculate_cosine_sim(df_Tf_wc)

# Obtain cosine similarity matrix for CounterVectorizer with word count
CV_wc_sim = calculate_cosine_sim(df_CV_wc)

# 6. Qualitative test

Since we don't have ranks, we can't compare models by comparing RMSE. Therefore, we select random 5 prompts and then obtain top 3 recommendations for each model. We evaluate models by rating their recommendation in range of 1 to 4 for each sampled prompt.

## 6 (a) Define a function to display top-N similar prompts 

In [None]:
# create dictionary map prompt_id to prompt
temp = prompt[['id', 'prompt']]
temp.head()

Unnamed: 0,id,prompt
0,0,"a renaissance portrait of dwayne johnson, art ..."
1,1,"portrait of a dancing eagle woman, beautiful b..."
2,2,"epic 3 d, become legend shiji! gpu mecha contr..."
3,3,an airbrush painting of cyber war machine scen...
4,4,concept art of a silent hill monster. painted ...


In [None]:
# create a function to select top-N prompts 
def similar_prompts(topN, sim_mat, ID):
  """
  --- Inputs ---
  topN: the number of prompts to be recommended
  sim_mat: similarity matrix. This should be square matrix
  ID: id of prompt
  --- Output ---
  original prompt
  topN similar prompts
  """
  temp_prompt = sim_mat[ID]
  index_after_sort = np.argsort(temp_prompt)[:topN]
  recommendation = []
  print('The input prompt: ', id_prompt_dic[ID])
  print('The corresponding image for the prompt: ')
  display(Image.open(df['image_path'][ID]))
  for i in index_after_sort:
    print('Recommended prompt: ', id_prompt_dic[i])
    display(Image.open(df['image_path'][i]))
    recommendation.append((i, id_prompt_dic[i]))
  return id_prompt_dic[ID], recommendation

## 6 (b) Pick the random 5 prompts 

In [None]:
random_prompt = [11, 428, 1212, 3654, 1178]
top_N = 3

## 6 (c) top-N recommendations from TfidVectorizer features for the given random prompts

In [None]:
for id in random_prompt:
  similar_prompts(top_N, sim_Tf, id)

Output hidden; open in https://colab.research.google.com to view.

## 6 (d) top-N recommendations from CounterVectorizer features for the given random prompts

In [None]:
for id in random_prompt:
  similar_prompts(top_N, sim_CV, id)

Output hidden; open in https://colab.research.google.com to view.

## 6 (e) top-N recommendations from TfidVectorizer with word count features for the given random prompts

In [None]:
for id in random_prompt:
  similar_prompts(top_N, Tf_wc_sim, id)

Output hidden; open in https://colab.research.google.com to view.

## 6 (f) top-N recommendations from CounterVectorizer with word count features for the given random prompts

In [None]:
for id in random_prompt:
  similar_prompts(top_N, CV_wc_sim, id)

Output hidden; open in https://colab.research.google.com to view.

## 6 (g) Calculate the average score

In [127]:
# The scores are created by Hyelim Yang 
# Therefore, they're subjective
score_Tf = [2.5, 1.5, 4, 2.5, 4]
score_CV = [2.5, 1.5, 2.5, 4, 1]
score_Tf_wc = [1, 3, 2.5, 1, 2.5]
score_CV_wc = [4, 4, 1, 2.5, 2.5]
avg_score = [np.mean(score_Tf), np.mean(score_CV), np.mean(score_Tf_wc), np.mean(score_CV_wc)]
print('The average score for each model: ', avg_score)



The average score for each model:  [2.9, 2.3, 2.0, 2.8]


We can divide the average socres into two groups easily. One group has average scores of higher than 2.5 and the other has average scores of lower than 2.5. Hence, we select the first and last model to create ensemble model.

# 6 (g)  Create ensemble model

In [128]:
# calculate weight for each model
w1 = avg_score[0]/(avg_score[0] + avg_score[-1])
w2 = avg_score[-1]/(avg_score[0] + avg_score[-1])
print(w1)
print(w2)

final_model = sim_Tf*w1 + CV_wc_sim*w2
final_model

0.5087719298245614
0.4912280701754386


array([[1.        , 0.1832032 , 0.01341509, ..., 0.01771291, 0.09821616,
        0.1565143 ],
       [0.1832032 , 1.        , 0.03102719, ..., 0.01600446, 0.08979652,
        0.15622678],
       [0.01341509, 0.03102719, 1.        , ..., 0.00698216, 0.00346473,
        0.05653204],
       ...,
       [0.01771291, 0.01600446, 0.00698216, ..., 1.        , 0.00457474,
        0.00268485],
       [0.09821616, 0.08979652, 0.00346473, ..., 0.00457474, 1.        ,
        0.04046457],
       [0.1565143 , 0.15622678, 0.05653204, ..., 0.00268485, 0.04046457,
        1.        ]])

# 7. Test the ensemble model

In [129]:
for id in random_prompt:
  similar_prompts(top_N, final_model, id)

Output hidden; open in https://colab.research.google.com to view.

# 8. Create a graph using the final model

We create a graph using the similarity matrix, which is a square and symmetric. We set a threshold of 0.7, meaning that if the similarity between i and j is greater and equal to 0.7, put edge between i and j. If the similarity between i and j is less than 0.7, there is no edge between i and j. The resulted graph would be undirected weighted graph where weight of an edge would be the similarity. 

In [130]:
import networkx as nx
def create_graph(correlation_matrix, threshold):
    G = nx.Graph()
    for i in range(len(correlation_matrix)):
        for j in range(i+1, len(correlation_matrix)):
            if correlation_matrix[i][j] >= threshold:
                G.add_edge(i, j, weight=correlation_matrix[i][j])
    return G

In [131]:
thresh = 0.7
graph_model1_final = create_graph(final_model, thresh)
nx.write_gexf(graph_model1_final, parent_folder + '/gexf files/' + "graph_model1_f.gexf")