# Product Matching Using Graphs

Install the Neo4j [Graph Data Science Client](https://neo4j.com/docs/graph-data-science-client/current/installation/) and import modules

In [1]:
%%capture
import pandas as pd
import numpy as np
import getpass
import os

try:
    from graphdatascience import GraphDataScience
except:
    %pip install graphdatascience
    from graphdatascience import GraphDataScience

Set the connection credentials and the OpenAI API key

In [14]:
# Prompt for the connection settings; an empty answer falls back to the default
# quoted in each prompt.
connectionUrl = input("Neo4j Database Url (press enter to use \"neo4j://localhost:7687\"): ") or "neo4j://localhost:7687"
username = input("Username (press enter for \"neo4j\"): ") or "neo4j"
# Secrets are read without echo and mirrored into the process environment.
password = getpass.getpass(prompt='Password: ')
os.environ["NEO4J_PASSWORD"] = password
database = input("Database name (press enter for \"neo4j\"): ") or "neo4j"
# Needed for the last part: Product Matching using OpenAI text embedding
openai_api_key = getpass.getpass(prompt='OpenAI API key: ')
os.environ["OPENAI_API_KEY"] = openai_api_key

Verify the database connection and return the Graph Data Science library version

In [15]:
# Open the client session, select the target database, then print the Graph
# Data Science library version as a connectivity check.
neo4j_auth = (username, password)
gds = GraphDataScience(connectionUrl, auth=neo4j_auth)
gds.set_database(database)
gds_version = gds.version()
print(gds_version)

2.4.6


## 0. Data
Data has been downloaded from [openfoodfacts.org](https://world.openfoodfacts.org/cgi/search.pl?action=display&search_terms=ice&tagtype_0=countries&tag_contains_0=contains&tag_0=uk&tagtype_1=languages&tag_contains_1=contains&tag_1=en&sort_by=unique_scans_n&page_size=20)

Search criteria: the terms `bread`, `ice` and `peas` (one export per term), filtered to products whose countries contain "uk" and whose languages contain "en".


In [9]:
# Read the three tab-separated Open Food Facts exports, one per search term.
export_paths = (
    'openfoodfacts_export_bread_UK_en.csv',
    'openfoodfacts_export_ice_UK_en.csv',
    'openfoodfacts_export_peas_UK_en.csv',
)
bread_df, ice_df, peas_df = (pd.read_csv(path, sep='\t') for path in export_paths)

In [10]:
# Stack the three exports row-wise under a fresh 0..n-1 index.
search_frames = [bread_df, ice_df, peas_df]
all_df = pd.concat(search_frames, axis=0, ignore_index=True)
all_df

Unnamed: 0,code,lc,product_name_da,product_name_de,product_name_en,product_name_es,product_name_fi,product_name_fr,product_name_it,product_name_nl,...,nutriscore_grade_producer,customer_service_fr,sources_fields:org-gs1:gln,sources_fields:org-gs1:gpcCategoryCode,sources_fields:org-gs1:gpcCategoryName,sources_fields:org-gs1:isAllergenRelevantDataProvided,sources_fields:org-gs1:lastChangeDateTime,sources_fields:org-gs1:partyName,sources_fields:org-gs1:productionVariantDescription,sources_fields:org-gs1:publicationDateTime
0,4088600107646,en,,,Original Rye Crispbread,,,,,,...,,,,,,,,,,
1,5070000210605,en,,,Keto bread,,,,,,...,,,,,,,,,,
2,29296484,en,,,Fruit and treacle bread,,,,,,...,,,,,,,,,,
3,5057967395071,en,,,White Sourdough,,,,,,...,,,,,,,,,,
4,5060235980480,en,,,White Sourdough,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2426,5024121380419,en,,,Organix cheese pea snaps,,,,,,...,,,,,,,,,,
2427,5060413751109,en,,,Yushoi lightly salted pea snacks,,,,,,...,,,,,,,,,,
2428,656320,en,,,Tenderstem Broccoli Carrots & Sugar snap peas,,,,,,...,,,,,,,,,,
2429,116022,en,,,British Giant Marrowfat Peas,,,,,,...,,,,,,,,,,


In [11]:
# Columns of the export that are kept for the graph model, grouped by theme.
_identity_cols = ['code', 'product_name_en', 'generic_name_en', 'quantity', 'serving_size']
_tag_cols = ['packaging_tags', 'brands_tags', 'categories_tags', 'labels_tags',
             'countries_tags', 'stores_tags']
_composition_cols = ['ingredients_text_en', 'allergens_tags', 'traces_tags', 'packaging_1_shape']
_off_cols = ['link', 'off:food_groups', 'off:food_groups_tags', 'off:nova_groups_tags',
             'off:nutriscore_grade', 'off:nutriscore_score']
min_col = _identity_cols + _tag_cols + _composition_cols + _off_cols

In [12]:
# Keep every row but only the columns listed in min_col.
df = all_df.loc[:, min_col]

In [13]:
# Products without an English name cannot be matched on; drop those rows.
has_english_name = df['product_name_en'].notna()
df = df[has_english_name]
df

Unnamed: 0,code,product_name_en,generic_name_en,quantity,serving_size,packaging_tags,brands_tags,categories_tags,labels_tags,countries_tags,...,ingredients_text_en,allergens_tags,traces_tags,packaging_1_shape,link,off:food_groups,off:food_groups_tags,off:nova_groups_tags,off:nutriscore_grade,off:nutriscore_score
0,4088600107646,Original Rye Crispbread,,250 g,10 g,,savour-bakes,"en:plant-based-foods-and-beverages,en:plant-ba...","en:vegetarian,en:source-of-fibre,en:high-fibres",en:united-kingdom,...,"rye flour, salt",en:gluten,"en:gluten,en:sesame-seeds,en:soybeans",,,en:bread,"en:cereals-and-potatoes,en:bread",en:3-processed-foods,a,-4.0
1,5070000210605,Keto bread,,,,,,"en:flax-seed-bread,en:flax-seed-bread",,en:united-kingdom,...,"Water, Sunflower seeds, Flaxseed (13%), Tapioc...",,,,,,,en:3-processed-foods,c,7.0
2,29296484,Fruit and treacle bread,,,,,,"en:plant-based-foods-and-beverages,en:plant-ba...",,en:united-kingdom,...,"wheat flour, calcium carbonate, iron, niacin, ...",en:gluten,,,,en:bread,"en:cereals-and-potatoes,en:bread",en:3-processed-foods,b,2.0
3,5057967395071,White Sourdough,,400g,,"en:card-sleeve,en:ldpe-film","tesco,walkers","en:plant-based-foods-and-beverages,en:plant-ba...",en:vegetarian,en:united-kingdom,...,"wheat flour (wheat flour, calcium carbonate, i...",en:gluten,en:en-eggs-en-gluten-en-milk-en-may-contain-se...,en:sleeve,,en:bread,"en:cereals-and-potatoes,en:bread",en:3-processed-foods,a,-2.0
4,5060235980480,White Sourdough,,1 kg,,,bertinet-bakery,en:sourdough-white-sliced-bread,,en:united-kingdom,...,,,,,,,,unknown,c,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425,274012,"Pea, mint, feta",,,,,sainsbury-s,,,"en:france,en:united-kingdom",...,"Peas (52%), Water, Spinach, Soured Cream (Cows...",en:milk,,,,,,en:4-ultra-processed-food-and-drink-products,,
2426,5024121380419,Organix cheese pea snaps,,,,,,,,en:united-kingdom,...,,,,,,,,unknown,,
2427,5060413751109,Yushoi lightly salted pea snacks,,,21.0g,,yushoi,,,en:united-kingdom,...,,,,,,,,unknown,,
2428,656320,Tenderstem Broccoli Carrots & Sugar snap peas,,,80g,,m-s,,,en:united-kingdom,...,,,,,,,,unknown,,


### 0.1 Data Loading

In [None]:
# Declare `code` as the node key of :Product; the statement is a no-op when the
# constraint already exists.
product_key_constraint = "create constraint if not exists for (p:Product) require (p.code) is node key"
gds.run_cypher(product_key_constraint)

In [None]:
#load product
# One :Product node per row. MERGE matches on the node-key property `code`
# only: matching on {code, name} would attempt a second CREATE (and hit the
# node-key constraint) whenever a code appears again with a different name.
# The remaining fields, name included, are plain property updates.
gds.run_cypher(
"""
unwind $df as df
merge (p:Product {code: df['code']})
set p.name = df['product_name_en'],
    p.genericName = df['generic_name_en'],
    p.quantity = df['quantity'],
    p.servingSize = df['serving_size']
""",
params = {'df':df.to_dict(orient='records')}
)

In [None]:
#load brands
# One HAS_BRAND relationship per comma-separated brand tag; rows without brand
# tags are left out of the payload.
brand_rows = df.dropna(subset=['brands_tags']).to_dict(orient='records')
load_brands_query = """
unwind $df as df
with df, split(df['brands_tags'],",") as brands
unwind brands as brand
match (p:Product {code: df['code']})
merge (b:Brand {name:brand})
merge (p)-[:HAS_BRAND]->(b)
"""
gds.run_cypher(load_brands_query, params={'df': brand_rows})

In [None]:
#load packaging
# "en:" is removed from the tag string before it is split on commas; each
# resulting value becomes a :Packaging node linked with HAS_PACKAGING.
packaging_rows = df.dropna(subset=['packaging_tags']).to_dict(orient='records')
load_packaging_query = """
unwind $df as df
with df, split(replace(df['packaging_tags'],"en:",""),",") as packagings
unwind packagings as packaging
match (p:Product {code: df['code']})
merge (pa:Packaging {name:packaging})
merge (p)-[:HAS_PACKAGING]->(pa)
"""
gds.run_cypher(load_packaging_query, params={'df': packaging_rows})

In [None]:
#load categories
# Each comma-separated category tag (with "en:" removed) becomes a :Category
# node linked to the product with HAS_CATEGORY.
category_rows = df.dropna(subset=['categories_tags']).to_dict(orient='records')
load_categories_query = """
unwind $df as df
with df, split(replace(df['categories_tags'],"en:",""),",") as categories
unwind categories as category
match (p:Product {code: df['code']})
merge (ca:Category {name:category})
merge (p)-[:HAS_CATEGORY]->(ca)
"""
gds.run_cypher(load_categories_query, params={'df': category_rows})

In [None]:
#load labels
# Each comma-separated label tag (with "en:" removed) becomes a :Label node
# linked to the product with HAS_LABEL.
label_rows = df.dropna(subset=['labels_tags']).to_dict(orient='records')
load_labels_query = """
unwind $df as df
with df, split(replace(df['labels_tags'],"en:",""),",") as labels
unwind labels as label
match (p:Product {code: df['code']})
merge (l:Label {name:label})
merge (p)-[:HAS_LABEL]->(l)
"""
gds.run_cypher(load_labels_query, params={'df': label_rows})

In [None]:
#load countries
# Each comma-separated country tag (with "en:" removed) becomes a :Country node
# linked to the product with HAS_COUNTRY.
country_rows = df.dropna(subset=['countries_tags']).to_dict(orient='records')
load_countries_query = """
unwind $df as df
with df, split(replace(df['countries_tags'],"en:",""),",") as countries
unwind countries as country
match (p:Product {code: df['code']})
merge (c:Country {name:country})
merge (p)-[:HAS_COUNTRY]->(c)
"""
gds.run_cypher(load_countries_query, params={'df': country_rows})

In [None]:
#load stores
# One HAS_STORE relationship per comma-separated store tag; rows without store
# tags are left out of the payload.
store_rows = df.dropna(subset=['stores_tags']).to_dict(orient='records')
load_stores_query = """
unwind $df as df
with df, split(df['stores_tags'],",") as stores
unwind stores as store
match (p:Product {code: df['code']})
merge (s:Store {name:store})
merge (p)-[:HAS_STORE]->(s)
"""
gds.run_cypher(load_stores_query, params={'df': store_rows})

In [None]:
# load allergens
# Each comma-separated allergen tag (with "en:" removed) becomes an :Allergen
# node linked to the product with HAS_ALLERGEN.
allergen_rows = df.dropna(subset=['allergens_tags']).to_dict(orient='records')
load_allergens_query = """
unwind $df as df
with df, split(replace(df['allergens_tags'],"en:",""),",") as allergens
unwind allergens as allergen
match (p:Product {code: df['code']})
merge (a:Allergen {name:allergen})
merge (p)-[:HAS_ALLERGEN]->(a)
"""
gds.run_cypher(load_allergens_query, params={'df': allergen_rows})

In [None]:
# load foodGroup1
# Values of the off:food_groups column are stored as :Category nodes and linked
# with HAS_GROUP1, so they share the label used for the category tags.
group1_rows = df.dropna(subset=['off:food_groups']).to_dict(orient='records')
load_group1_query = """
unwind $df as df
with df, split(replace(df['off:food_groups'],"en:",""),",") as foodgroups
unwind foodgroups as foodgroup
match (p:Product {code: df['code']})
merge (c:Category {name:foodgroup})
merge (p)-[:HAS_GROUP1]->(c)
"""
gds.run_cypher(load_group1_query, params={'df': group1_rows})

In [None]:
# load foodGroup2
# Values of the off:food_groups_tags column are stored as :Category nodes and
# linked with HAS_GROUP2.
group2_rows = df.dropna(subset=['off:food_groups_tags']).to_dict(orient='records')
load_group2_query = """
unwind $df as df
with df, split(replace(df['off:food_groups_tags'],"en:",""),",") as foodgroups
unwind foodgroups as foodgroup
match (p:Product {code: df['code']})
merge (c:Category {name:foodgroup})
merge (p)-[:HAS_GROUP2]->(c)
"""
gds.run_cypher(load_group2_query, params={'df': group2_rows})

In [None]:
# import ingredients as free text
# The raw ingredient list is copied onto the matching :Product node as the
# `ingredients` property; products without an ingredient text are skipped.
ingredient_text_rows = df.dropna(subset=['ingredients_text_en']).to_dict(orient='records')
set_ingredients_query = """
unwind $df as df
match (p:Product {code: df['code']})
set p.ingredients = df.ingredients_text_en
"""
gds.run_cypher(set_ingredients_query, params={'df': ingredient_text_rows})

In [None]:
# Ugly clean: missing CSV values reach the database as NaN; remove each such
# property. Every property is tested and cleared under its own name, so a NaN
# quantity or servingSize no longer wipes a valid genericName.
for nan_property in ("genericName", "quantity", "servingSize"):
    gds.run_cypher(
        f"""MATCH (p:Product ) where toString(p.{nan_property})="NaN" set p.{nan_property} = NULL"""
    )

## 1. EDA

In [None]:
# Node counts per label, taken from the `labels` column of apoc.meta.stats and
# shown as a one-row table.
d = gds.run_cypher("""
call apoc.meta.stats
YIELD labels
""")
label_counts = d['labels'][0]
pd.DataFrame([label_counts])

In [None]:
# Categories ranked by the number of products attached through HAS_CATEGORY
# (20 largest shown).
top_categories_query = """
MATCH (p:Product)-[:HAS_CATEGORY]->(c:Category)
RETURN c.name as category, count(p) as productCount
ORDER BY productCount DESC
"""
gds.run_cypher(top_categories_query).head(20)

In [None]:
# Food groups ranked by the number of products attached through HAS_GROUP1
# (20 largest shown).
top_group1_query = """
MATCH (p:Product)-[:HAS_GROUP1]->(c:Category)
RETURN c.name as category, count(p) as productCount
ORDER BY productCount DESC
"""
gds.run_cypher(top_group1_query).head(20)

In [None]:
# Number of products shared by each (brand, store) pair, largest first
# (20 pairs shown).
brand_store_query = """
    MATCH (b:Brand)--(p:Product)--(s:Store)
    RETURN b.name as brand, s.name as store, count(p) as productCount order by productCount desc
"""
gds.run_cypher(brand_store_query).head(20)

In [None]:
# Number of products per brand, largest first.
products_per_brand_query = """
    MATCH (b:Brand)--(p:Product)
    RETURN b.name, count(p) as productCount order by productCount desc
"""
gds.run_cypher(products_per_brand_query)

In [None]:
# Label coverage per brand: total and distinct labels, distinct labelled
# products, and the average number of labels per labelled product.
# count() returns integers and integer division truncates in Cypher, so the
# numerator is converted with toFloat() to keep the fractional part of `ratio`.
gds.run_cypher("""
    MATCH (b:Brand)--(p:Product)--(l:Label)
    RETURN b.name as brand,
        count(l) as totalLabelCount,
        count(distinct l) as uniqueLabel,
        count(distinct p) as uniqueProduct,
        toFloat(count(l)) / count(distinct p) as ratio
    order by ratio desc
""").head(20)

## 2. Find Similar Products

### 2.1. based on categories - Cypher

In [None]:
# Show one product together with the names of every node directly connected to it.
product_context_query = """
    MATCH (p:Product {code:1216486})--(c)
    with p, collect (c.name) as context
    return p.name, p.genericName, p.quantity, p.ingredients, context
"""
gds.run_cypher(product_context_query)

In [None]:
# similar product to "Garden Peas"
# Products reachable in exactly two hops over allergen / group / label
# relationships, scored by the number of matched paths (top 15).
similar_to_peas_query = """
MATCH (p:Product {code:1216486})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL*2]-(sim:Product)
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as score order by score desc limit 15
"""
gds.run_cypher(similar_to_peas_query)

In [None]:
# similar product to Magnum batonnet classic
# Same two-hop scoring as above, with the source product looked up by name.
similar_to_batonnet_query = """
MATCH (p:Product {name:"Batonnet Classic"})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL*2]-(sim:Product)
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as score order by score desc limit 15
"""
gds.run_cypher(similar_to_batonnet_query)

In [None]:
# similar product to Magnum batonnet classic that is vegan
# The candidate must additionally carry the "vegan" label.
similar_vegan_query = """
MATCH (p:Product {name:"Batonnet Classic"})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL*2]-(sim:Product)-[:HAS_LABEL]->(:Label {name:"vegan"})
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as Score order by Score desc limit 15
"""
gds.run_cypher(similar_vegan_query)

## How to improve this basic product matching?
- Extract more descriptive labels
    - ingredients, size, weights, nutriscore...
    - entity extraction with NLP libraries or LLMs like OpenAI
- Use Graph Data Science
    - node similarity
    - Graph embeddings + KNN
- Curate, clean and organize labels
    - Taxonomies and ontologies
    - Can be done manually (experts) or using automatic procedures (Ontologies, GML node classification or link prediction)

    

### Import ingredients as nodes

In [None]:
# ingredients
# Keep only products that have an ingredient text. The explicit copy makes
# ing_df an independent frame, so adding columns to it later neither raises
# SettingWithCopyWarning nor depends on how pandas derived it from df.
ing_df = df.dropna(subset=['ingredients_text_en']).copy()
ing_df.ingredients_text_en

In [None]:
def _clean_ingredients(raw: pd.Series) -> pd.Series:
    """Normalise free-text ingredient lists into lower-case, comma-separated names.

    Bracketed details "(...)" and "[...]" are removed, then underscores, full
    stops and doubled separators are cleaned up.
    """
    cleaned = raw
    # Remove innermost groups until none are left. A greedy r"\(.*\)" would
    # delete everything between the first "(" and the last ")", including the
    # ingredients listed between two separate groups.
    for group_pattern in (r"\([^()]*\)", r"\[[^\[\]]*\]"):
        while cleaned.str.contains(group_pattern, regex=True, na=False).any():
            cleaned = cleaned.str.replace(group_pattern, "", regex=True)
    # The remaining replacements are literal; regex=False is spelled out because
    # the default of `regex` differs between pandas versions.
    return (
        cleaned
        .str.replace("_", "", regex=False)      # remove "_"
        .str.replace("  ", " ", regex=False)
        .str.replace(" ,", ",", regex=False)
        .str.lower()
        .str.replace(".", "", regex=False)
        .str.replace(", ,", ",", regex=False)
        .str.replace(",,", ",", regex=False)
    )


# assign() builds a new frame, so no column is written into a filtered view.
ing_df = ing_df.assign(ing=_clean_ingredients(ing_df['ingredients_text_en']))

In [None]:
# Frequency of each comma-separated ingredient token (20 most frequent shown).
ingredient_tokens = ing_df.ing.str.split(",", expand=True).stack()
pd.DataFrame(ingredient_tokens.value_counts()).head(20)

In [None]:
# import ingredients as nodes
# The cleaned list is split on "," and every item is trimmed, so ingredients
# separated by a bare comma are no longer merged into one node and padded
# names do not create near-duplicate nodes. Empty items are skipped.
gds.run_cypher(
"""
unwind $df as df
with df, [item in split(df['ing'], ",") | trim(item)] as ingredients
unwind ingredients as ingredient
with df, ingredient
where ingredient <> ""
match (p:Product {code: df['code']})
merge (i:Ingredient {name:ingredient})
merge (p)-[:HAS_INGREDIENT]->(i)
""",
params = {'df':ing_df.dropna(subset=['ing']).to_dict(orient='records')}
)

### 2.2. Find similar products with ingredients

In [None]:
# Use ingredients
# Products sharing ingredients with product 1216486; the score is the number
# of shared ingredients (20 best shown).
shared_ingredients_query = """
MATCH (p:Product {code:1216486})-[:HAS_INGREDIENT]-(i:Ingredient)-[:HAS_INGREDIENT]-(sim:Product)
WITH p, sim, collect(i.name) as ingredients
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, ingredients, size(ingredients) as score order by score desc
"""
gds.run_cypher(shared_ingredients_query).head(20)

In [None]:
# similar product to Magnum batonnet classic
# Scored by the number of ingredients shared with the source product (top 15).
batonnet_ingredients_query = """
MATCH (p:Product {name:"Batonnet Classic"})-[:HAS_INGREDIENT]-(i:Ingredient)-[:HAS_INGREDIENT]-(sim:Product)
WITH p, sim, collect(i.name) as ingredients
return sim.code, sim.name, sim.genericName, ingredients, size(ingredients) as score order by score desc limit 15
"""
gds.run_cypher(batonnet_ingredients_query)


In [None]:
# similar product to Magnum batonnet classic using all labels
# Two-hop paths over allergen, group, label and ingredient relationships,
# scored by the number of matched paths (top 15).
batonnet_all_query = """
MATCH (p:Product {name:"Batonnet Classic"})-[r:HAS_ALLERGEN|HAS_GROUP1|HAS_GROUP2|HAS_LABEL|HAS_INGREDIENT*2]-(sim:Product)
return sim.code, sim.name, sim.genericName, sim.quantity,sim.servingSize, count(r) as Score order by Score desc limit 15
"""
gds.run_cypher(batonnet_all_query)

## 3. Product Matching using Similarities

### 3.1 fastRP + Knn on allergen, category and label

In [None]:
# For this demo, similarities are computed between bread products only:
# add a :Bread label to every product whose group1 category is "bread".
tag_bread_query = """
    match (p:Product)-[:HAS_GROUP1]->(c:Category {name:"bread"})
    set p:Bread
"""
gds.run_cypher(tag_bread_query)

In [None]:
# Project bread products with their categories, labels and allergens into the
# in-memory graph 'simcat'; every relationship type is projected as undirected.
simcat_rel_types = ('HAS_ALLERGEN', 'HAS_CATEGORY', 'HAS_GROUP2', 'HAS_LABEL')
g, _ = gds.graph.project(
    'simcat',
    ['Bread', 'Category', 'Label', 'Allergen'],
    {rel_type: {'orientation': 'UNDIRECTED'} for rel_type in simcat_rel_types},
)

In [None]:
# It is possible to use nodeSimilarity directly on the projected graph `g`.
# This cell streams an example of its results.
gds.nodeSimilarity.stream(g)

In [None]:
# But fastRP + KNN will be used in the rest of the demo.
# This is an example of fastRP embeddings (128 dimensions) streamed for graph `g`.
gds.fastRP.stream(g,embeddingDimension=128)

In [None]:
# Algorithms in Graph Data Science can be chained: the embeddings are stored
# under the 'fastRP' property named by mutateProperty, which the KNN call that
# follows reads through nodeProperties.
gds.fastRP.mutate(g,embeddingDimension=128, mutateProperty='fastRP')

In [None]:
# Link each Bread node to its 10 nearest neighbours on the 'fastRP' property and
# write the result as SIMILAR_TO relationships with a `score` property.
gds.knn.write(
    g,
    nodeLabels=['Bread'],
    topK=10,
    nodeProperties=['fastRP'],
    writeRelationshipType='SIMILAR_TO',
    writeProperty='score',
)

SIMILAR_TO relationships can then be shown in Bloom. Use Louvain in Bloom to display clusters of bread types.

### 3.2 Product reconciliation

In [None]:
# Example query: pairs of similar breads that belong to the same brand,
# highest similarity first (20 pairs shown).
same_brand_query = """
MATCH (b:Brand)--(b1:Bread)-[r:SIMILAR_TO]->(b2:Bread)--(b)
RETURN b1.name, b2.name, b.name, r.score as score order by score desc
"""
gds.run_cypher(same_brand_query).head(20)

### 3.3. Improve Product classification by learning Taxonomy from data

In [None]:
# Compute co-occurrence
# For each ordered pair of categories that share products, `index` is the number
# of shared products divided by the number of products of the source category.
# MERGE + SET instead of CREATE keeps the cell re-runnable: a second run updates
# the existing relationship rather than adding a duplicate.
gds.run_cypher("""
MATCH (c:Category)
WITH c, count{ (c)<-[:HAS_CATEGORY]-() } as totalCount
MATCH (c)<-[:HAS_CATEGORY]-(p)-[:HAS_CATEGORY]->(relatedCategory)
WITH c, relatedCategory, toFloat(count(p)) as countp, totalCount
MERGE (c)-[co:CO_OCCURS]->(relatedCategory)
SET co.index = countp / totalCount
""")

In [None]:
# Infer same-as relationships
# Two categories that co-occur with index 1 in both directions are linked with
# SAME_AS; the ID comparison keeps one ordering of each pair.
same_as_query = """
MATCH (c1)-[co1:CO_OCCURS {index:1}]->(c2),
    (c2)-[co2:CO_OCCURS {index:1}]->(c1)
WHERE ID(c1) > ID(c2)
MERGE (c1)-[:SAME_AS]-(c2)
"""
gds.run_cypher(same_as_query)

In [None]:
# Infer narrower-than relationships
# c1 is narrower than c2 when c1 co-occurs with c2 at index 1 while the reverse
# index is below 1.
narrower_than_query = """
MATCH (c1)-[:CO_OCCURS {index:1}]->(c2),
    (c2)-[co2:CO_OCCURS]->(c1)
WHERE co2.index < 1
MERGE (c1)-[:NARROWER_THAN]->(c2)
"""
gds.run_cypher(narrower_than_query)

In [None]:
# Reduce transitive narrower-than relationships
# A direct NARROWER_THAN link is deleted when the same two categories are also
# connected by a NARROWER_THAN path of two or more hops.
transitive_reduction_query = """
MATCH (c1)-[:NARROWER_THAN*2..]->(c3),
    (c1)-[d:NARROWER_THAN]->(c3)
DELETE d
"""
gds.run_cypher(transitive_reduction_query)

Analysis can then be done in Bloom to improve Taxonomy / Categories

### 3.4 Improved Product Matching using FastRP + KNN

In [None]:
# Project bread products with categories, labels, allergens and ingredients,
# plus the inferred taxonomy links, into the in-memory graph 'simall3'.
# Every relationship type is projected as undirected.
simall3_rel_types = (
    'HAS_ALLERGEN', 'HAS_CATEGORY', 'HAS_GROUP1', 'HAS_GROUP2',
    'HAS_LABEL', 'NARROWER_THAN', 'HAS_INGREDIENT', 'SAME_AS',
)
g2, _ = gds.graph.project(
    'simall3',
    ['Bread', 'Category', 'Label', 'Allergen', 'Ingredient'],
    {rel_type: {'orientation': 'UNDIRECTED'} for rel_type in simall3_rel_types},
)

In [None]:
# Compute FastRP embeddings for graph g2 and keep them under the 'fastRP' property.
# NOTE(review): 1028 is an unusual embedding dimension — confirm it is intentional.
gds.fastRP.mutate(g2,embeddingDimension=1028, mutateProperty='fastRP')

In [None]:
# Link each Bread node to its 10 nearest neighbours on the 'fastRP' property and
# write the result as SIMILAR_TO_ALL relationships with a `score` property.
gds.knn.write(
    g2,
    nodeLabels=['Bread'],
    topK=10,
    nodeProperties=['fastRP'],
    writeRelationshipType='SIMILAR_TO_ALL',
    writeProperty='score',
)

In [None]:
# Neighbours of one product over SIMILAR_TO_ALL, highest score first.
# The code is matched as an integer, like the other code lookups in this
# notebook (e.g. code:1216486); a quoted string never equals a numeric property,
# so the previous string literal returned no rows.
gds.run_cypher("""
    MATCH (p:Product {code:5060195901334})-[r:SIMILAR_TO_ALL]-(sim)
    RETURN p.name, sim.name, r.score as score order by score desc
""")

## 4. Product Matching using free text (product description)

### 4.1 Using Lucene Full-text search index

In [None]:
# List the Lucene analyzers available for full-text indexes
gds.run_cypher("""CALL db.index.fulltext.listAvailableAnalyzers""")

In [None]:
# Create a full-text index on the string properties of :Product
# (name, genericName and the free-text ingredients).
# IF NOT EXISTS keeps the cell re-runnable, matching the constraint creation
# earlier in the notebook; without it a second run fails because the index exists.
gds.run_cypher("""
    CREATE FULLTEXT INDEX namesAndIng IF NOT EXISTS
    FOR (n:Product) ON EACH [n.name, n.genericName, n.ingredients]
""")

In [None]:
# Products can be searched with free-text queries; this one looks for "wheat"
# in any indexed property (20 rows shown).
wheat_search_query = """
    CALL db.index.fulltext.queryNodes("namesAndIng","wheat")
    YIELD node, score
    RETURN node.name, node.genericName, node.ingredients, score
"""
gds.run_cypher(wheat_search_query).head(20)

In [None]:
# Exact search for the misspelt term "bred" in genericName: returns no result.
exact_bred_query = """
    CALL db.index.fulltext.queryNodes("namesAndIng",'genericName:bred')
    YIELD node, score
    RETURN node.name, node.genericName, score
"""
gds.run_cypher(exact_bred_query)

In [None]:
# Fuzzy search: the trailing "~" makes the same misspelt term a fuzzy query.
fuzzy_bred_query = """
    CALL db.index.fulltext.queryNodes("namesAndIng",'genericName:bred~')
    YIELD node, score
    RETURN node.name, node.genericName, score
"""
gds.run_cypher(fuzzy_bred_query)

In [None]:
# search and match
# Full-text hits for the phrase "chocolate ice cream" in the name, extended with
# the brand of each hit when one exists.
phrase_with_brand_query = """
    CALL db.index.fulltext.queryNodes("namesAndIng",'name:"chocolate ice cream"')
    YIELD node, score
    OPTIONAL MATCH (node)-[:HAS_BRAND]-(b:Brand)
    RETURN node.name, node.genericName, node.ingredients, b.name, score
"""
gds.run_cypher(phrase_with_brand_query)

### 4.2. Using OpenAI text embeddings

This part requires:
- an OpenAI key
- [APOC extended](https://github.com/neo4j-contrib/neo4j-apoc-procedures) version > 5.8

In [None]:
# Embeddings are computed only for breads whose genericName is longer than
# 20 characters; those nodes get the extra label :Bgname.
tag_bgname_query = """
    MATCH (b:Bread) where size(b.genericName) > 20 
    set b:Bgname 
    return count(b)
"""
gds.run_cypher(tag_bgname_query)


In [None]:
# Compute OpenAI embeddings
# The genericName of each :Bgname node is sent to apoc.ml.openai.embedding and
# the returned vector is stored on the node as `openAiEmbedding`.
embed_generic_names_query = """
    MATCH (b:Bgname)
    WITH b, b.genericName as genName
    CALL apoc.ml.openai.embedding([genName],$apiKey) 
    YIELD embedding
    SET b.openAiEmbedding = embedding
"""
embedding_params = {'apiKey':openai_api_key}
gds.run_cypher(embed_generic_names_query, embedding_params)

In [None]:
# OpenAI embeddings dimension: 1536 (checked on five nodes)
embedding_size_query = """
    MATCH (b:Bgname) return size(b.openAiEmbedding) limit 5
"""
gds.run_cypher(embedding_size_query)

In [None]:
# KNN on OpenAIEmbeddings
# Project the :Bgname nodes with their openAiEmbedding property, using '*' as
# the relationship projection.
bgname_openai_projection = {"Bgname": {"properties": "openAiEmbedding"}}
g3, _ = gds.graph.project("TextEmbed2", bgname_openai_projection, '*')

In [None]:
# Link each node of g3 to its 10 nearest neighbours on the OpenAI embedding and
# write the result as SIMILAR_OPENAI relationships with a `score` property.
gds.knn.write(
    g3,
    topK=10,
    nodeProperties=['openAiEmbedding'],
    writeRelationshipType='SIMILAR_OPENAI',
    writeProperty='score',
)

In [None]:
# Best SIMILAR_OPENAI pairs: scores of 1 are filtered out and the ID comparison
# keeps one ordering of each pair (top 20).
openai_pairs_query = """
    MATCH (n:Bgname)-[r:SIMILAR_OPENAI]->(m)
    where r.score < 1 AND id(n)>id(m)
    return r.score as score, n.genericName, m.genericName
    order by score desc limit 20
"""
gds.run_cypher(openai_pairs_query)

In [None]:
# Comparison between the different similarity scores: each SIMILAR_OPENAI pair
# is shown next to its SIMILAR_TO_ALL score when that relationship exists.
compare_scores_query = """
    MATCH (n:Bgname)-[r:SIMILAR_OPENAI]->(m)
    where r.score < 1 AND id(n)>id(m)
    OPTIONAL MATCH (n)-[r2:SIMILAR_TO_ALL]->(m)
    return r2.score as nodescore, r.score as score, n.genericName, m.genericName
    order by score desc limit 20
"""
gds.run_cypher(compare_scores_query)

### 4.3. It is possible to concatenate embeddings for improved similarities

In [None]:
# Project the same node labels and relationship types as 'simall3' into a new
# in-memory graph named 'simall5'; every relationship type is undirected.
simall5_rel_types = (
    'HAS_ALLERGEN', 'HAS_CATEGORY', 'HAS_GROUP1', 'HAS_GROUP2',
    'HAS_LABEL', 'NARROWER_THAN', 'HAS_INGREDIENT', 'SAME_AS',
)
g4, _ = gds.graph.project(
    'simall5',
    ['Bread', 'Category', 'Label', 'Allergen', 'Ingredient'],
    {rel_type: {'orientation': 'UNDIRECTED'} for rel_type in simall5_rel_types},
)

In [None]:
# Compute FastRP embeddings on g4 and write them to the `FastRP_all` property.
# NOTE(review): 1028 + 1536 gives the 2564 quoted for the concatenated vector
# further down, so the unusual dimension looks intentional — confirm.
gds.fastRP.write(g4,embeddingDimension=1028, writeProperty="FastRP_all")

In [None]:
# Concatenate the FastRP and OpenAI embeddings of each :Bgname node into a
# single `totalEmbeddings` list property.
concat_embeddings_query = """
    MATCH (b:Bgname)
    SET b.totalEmbeddings = b.FastRP_all + b.openAiEmbedding
"""
gds.run_cypher(concat_embeddings_query)

In [None]:
# Embedding size is now 2564 (checked on one node)
total_size_query = """
    MATCH (b:Bgname)
    RETURN size(b.totalEmbeddings) limit 1
"""
gds.run_cypher(total_size_query)

In [None]:
# KNN on Concatenated Embeddings
# Project the :Bgname nodes with their totalEmbeddings property, using '*' as
# the relationship projection.
bgname_total_projection = {"Bgname": {"properties": "totalEmbeddings"}}
g5, _ = gds.graph.project("AllEmbed2", bgname_total_projection, '*')

In [None]:
# Link each node of g5 to its 10 nearest neighbours on the concatenated
# embedding and write SIMILAR_TOTAL_EMB relationships with a `score` property.
gds.knn.write(
    g5,
    topK=10,
    nodeProperties=['totalEmbeddings'],
    writeRelationshipType='SIMILAR_TOTAL_EMB',
    writeProperty='score',
)

Show results in Bloom

### 4.4 Using OpenAI embeddings for online product matching

In [None]:
# Embed the free-text question, then rank every :Bgname node by the cosine
# similarity between the question embedding and its stored OpenAI embedding.
question = "bake at home french baguette"

online_match_query = """

    CALL apoc.ml.openai.embedding([$question],$apiKey) 
    YIELD embedding
    MATCH (b:Bgname)
    WITH b, gds.similarity.cosine(embedding, b.openAiEmbedding) AS score
    RETURN b.name, score order by score desc
"""
online_match_params = {'apiKey':openai_api_key, 'question':question}
gds.run_cypher(online_match_query, online_match_params)

In [None]:
# Results of OpenAI embedding matching can be chained with regular Cypher:
# here only :Bgname nodes that carry the "vegan" label are ranked.
question = "bake at home french baguette"

vegan_match_query = """

    CALL apoc.ml.openai.embedding([$question],$apiKey) 
    YIELD embedding
    MATCH (b:Bgname)-[:HAS_LABEL]->(:Label {name:"vegan"})
    WITH b, gds.similarity.cosine(embedding, b.openAiEmbedding) AS score
    RETURN b.name, score order by score desc
"""
vegan_match_params = {'apiKey':openai_api_key, 'question':question}
gds.run_cypher(vegan_match_query, vegan_match_params)

## NEXT 
- Any embedding, such as an image embedding, can be stored on an external node (so several images per product are possible)

- Customers can be imported as nodes, with (:Customer)-[:PURCHASED]-(:Product) or (:Customer)-[:VIEWED]-(:Product) relationships

    - Similar segmentation can be done on Customers (vegan, halal...) to improve customer recommendation

    - SIMILAR_TO relationships can be computed between customers to allow hyper-personalisation
    
    - Using apoc.ml.openai.completion, product descriptions can be personalised on the fly