Loading Data...

In [2]:
import requests, json

# Download the demo news records. The HTTP status is checked before decoding so
# a failed download raises an HTTP error instead of a confusing JSON parse error.
news_data_response = requests.get("https://thigm85.github.io/data/mind/mind_demo_fields_parsed.json")
news_data_response.raise_for_status()
data = json.loads(news_data_response.text)
# Display the first record to see which fields each article carries.
data[0]

{'abstract': "Shop the notebooks, jackets, and more that the royals can't live without.",
 'title': 'The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By',
 'subcategory': 'lifestyleroyals',
 'news_id': 'N3112',
 'category': 'lifestyle',
 'url': 'https://www.msn.com/en-us/lifestyle/lifestyleroyals/the-brands-queen-elizabeth,-prince-charles,-and-prince-philip-swear-by/ss-AAGH0ET?ocid=chopendata',
 'date': 20191103,
 'clicks': 0,
 'impressions': 0}

In [3]:
# Number of article records that were loaded.
len(data)

28603

Initializing App & Schema...

In [4]:
from vespa.package import ApplicationPackage

# Create the application package named "news"; the following cells add
# fields, fieldsets, rank profiles and further schemas to it.
app_package = ApplicationPackage(name="news")

In [5]:
from vespa.package import Field

# Fields of the news schema: title and abstract are indexed with BM25 enabled,
# url is indexed without BM25, and the remaining fields are summary attributes
# (news_id additionally with fast-search).
news_fields = [
    Field(name="news_id", type="string", indexing=["summary", "attribute"], attribute=["fast-search"]),
    Field(name="category", type="string", indexing=["summary", "attribute"]),
    Field(name="subcategory", type="string", indexing=["summary", "attribute"]),
    Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
    Field(name="abstract", type="string", indexing=["index", "summary"], index="enable-bm25"),
    Field(name="url", type="string", indexing=["index", "summary"]),
    Field(name="date", type="int", indexing=["summary", "attribute"]),
    Field(name="clicks", type="int", indexing=["summary", "attribute"]),
    Field(name="impressions", type="int", indexing=["summary", "attribute"]),
]
app_package.schema.add_fields(*news_fields)


In [6]:
from vespa.package import FieldSet

# Group title and abstract into the "default" fieldset, which the keyword
# queries in this notebook search with `default contains ...`.
default_fieldset = FieldSet(name="default", fields=["title", "abstract"])
app_package.schema.add_field_set(default_fieldset)


Deploying with Docker...

In [7]:
from vespa.deployment import VespaDocker

# Deploy the application package through VespaDocker; `app` is the handle the
# following cells use for feeding and querying.
vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=app_package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Application is up!
Finished deployment.


Feeding data to Vespa...

In [8]:
# Feed every article into the "news" schema, using its news_id as document id.
# The responses used to be discarded; non-200 responses are now collected in
# `news_feed_failures` so failed feeds are visible instead of silently ignored
# (the loop still continues past a failure).
news_feed_failures = []
for article in data:
    res = app.feed_data_point(
        data_id=article["news_id"],
        fields=article,
        schema="news"
    )
    if res.status_code != 200:
        news_feed_failures.append((article["news_id"], res.status_code))


Querying the app...


Search over indexed fields using keywords...

In [9]:
# Keyword search: match 'music' in the default fieldset and show the top hit.
music_yql = "select * from sources * where default contains 'music'"
res = app.query(body={"yql": music_yql})
res.hits[0]

{'id': 'id:news:news::N14152',
 'relevance': 0.25641557752127125,
 'source': 'news_content',
 'fields': {'sddocname': 'news',
  'documentid': 'id:news:news::N14152',
  'news_id': 'N14152',
  'category': 'music',
  'subcategory': 'musicnews',
  'title': 'Music is hot in Nashville this week',
  'abstract': 'Looking for fun, entertaining music events to check out in Nashville this week? Here are top picks with dates, times, locations and ticket links.',
  'url': 'https://www.msn.com/en-us/music/musicnews/music-is-hot-in-nashville-this-week/ar-BBWImOh?ocid=chopendata',
  'date': 20191101,
  'clicks': 0,
  'impressions': 3}}

In [11]:
# Return only title and abstract; require 'music' in the title and 'festival'
# in the default fieldset.
festival_yql = (
    "select title, abstract from sources * "
    "where title contains 'music' AND default contains 'festival'"
)
res = app.query(body={"yql": festival_yql})
res.hits[0]

{'id': 'index:news_content/0/988f76793a855e48b16dc5d3',
 'relevance': 0.19593085505063085,
 'source': 'news_content',
 'fields': {'title': "At Least 3 Injured In Stampede At Travis Scott's Astroworld Music Festival",
  'abstract': "A stampede Saturday outside rapper Travis Scott's Astroworld musical festival in Houston, left three people injured. Minutes before the gates were scheduled to open at noon, fans began climbing over metal barricades and surged toward the entrance, according to local news reports."}}

In [12]:
# Match on the document type name (sddocname) and return only the title.
news_type_yql = "select title from sources * where sddocname contains 'news'"
res = app.query(body={"yql": news_type_yql})
res.hits[0]

{'id': 'index:news_content/0/698f73a87a936f1c773f2161',
 'relevance': 0.0,
 'source': 'news_content',
 'fields': {'title': 'The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By'}}

In [13]:
# Filter on the date field with `contains` and return title and date.
single_date_yql = "select title, date from sources * where date contains '20191110'"
res = app.query(body={"yql": single_date_yql})
res.hits[0]

{'id': 'index:news_content/0/debbdfe653c6d11f71cc2353',
 'relevance': 0.0017429193899782135,
 'source': 'news_content',
 'fields': {'title': 'These Cranberry Sauce Recipes Are Perfect for Thanksgiving Dinner',
  'date': 20191110}}

In [14]:
# Combine a keyword match on the default fieldset with the date filter.
weather_yql = (
    "select title, abstract, date from sources * "
    "where default contains 'weather' AND date contains '20191110'"
)
res = app.query(body={"yql": weather_yql})
res.hits[0]

{'id': 'index:news_content/0/bb88325ae94d888c46538d0b',
 'relevance': 0.27025156546141466,
 'source': 'news_content',
 'fields': {'title': 'Weather forecast in St. Louis',
  'abstract': "What's the weather today? What's the weather for the week? Here's your forecast.",
  'date': 20191110}}

In [25]:
# Range filter on the date field (20191108 through 20191110).
# The request is passed as `body=` like every other query in this notebook,
# rather than positionally.
res = app.query(body={"yql" : "select title,abstract from sources * where date <= 20191110 AND date >= 20191108"})
res.hits[0]

{'id': 'index:news_content/0/c41a873213fdcffbb74987c0',
 'relevance': 0.0017429193899782135,
 'source': 'news_content',
 'fields': {'title': 'How to report weather-related closings, delays',
  'abstract': 'When there are active closings, view them here. WXII 12 News receives a number of phone calls and e-mails from viewers with questions. Sign up for our Newsletters To report a closure, please visit wxii.reportclosing.com The weather closing system is a viewer-operated system. Employees of WXII-TV and WXII12.com DO NOT enter the information in the system. That comes straight from the school/business/institution. Before you can enter information,...'}}

In [31]:
# 'music' keyword query returning title and date; count the hits in the response.
music_dates_yql = "select title, date from sources * where default contains 'music'"
res = app.query(body={"yql": music_dates_yql})
len(res.hits)

10

In [32]:
# First two hits of the previous query.
res.hits[:2]

[{'id': 'index:news_content/0/5f1b30d14d4a15050dae9f7f',
  'relevance': 0.25641557752127125,
  'source': 'news_content',
  'fields': {'title': 'Music is hot in Nashville this week',
   'date': 20191101}},
 {'id': 'index:news_content/0/6a031d5eff95264c54daf56d',
  'relevance': 0.23351089409559303,
  'source': 'news_content',
  'fields': {'title': 'Apple Music Replay highlights your favorite tunes of the year',
   'date': 20191105}}]

In [33]:
# Same 'music' query, with the matches ordered by descending date.
music_by_date_yql = (
    "select title, date from sources * "
    "where default contains 'music' order by date desc"
)
res = app.query(body={"yql": music_by_date_yql})
res.hits[:2]

[{'id': 'index:news_content/0/934a8d976ff8694772009362',
  'relevance': 0.0,
  'source': 'news_content',
  'fields': {'title': 'Korg Minilogue XD update adds key triggers for synth sequences',
   'date': 20191113}},
 {'id': 'index:news_content/0/4feca287fdfa1d027f61e7bf',
  'relevance': 0.0,
  'source': 'news_content',
  'fields': {'title': 'Tom Draper, Black Music Industry Pioneer, Dies at 79',
   'date': 20191113}}]

In [34]:
# Grouping query: `limit 0` asks for no regular hits; documents are grouped by
# category, at most 3 groups ordered by descending count, each outputting its count.
category_grouping_yql = "select * from sources * where sddocname contains 'news' limit 0 | all(group(category) max(3) order(-count())each(output(count())))"
res = app.query(body={"yql": category_grouping_yql})
res.hits[0]

{'id': 'group:root:0',
 'relevance': 1.0,
 'continuation': {'this': ''},
 'children': [{'id': 'grouplist:category',
   'relevance': 1.0,
   'label': 'category',
   'continuation': {'next': 'BGAAABEBGBC'},
   'children': [{'id': 'group:string:news',
     'relevance': 1.0,
     'value': 'news',
     'fields': {'count()': 9115}},
    {'id': 'group:string:sports',
     'relevance': 0.6666666666666666,
     'value': 'sports',
     'fields': {'count()': 6765}},
    {'id': 'group:string:finance',
     'relevance': 0.3333333333333333,
     'value': 'finance',
     'fields': {'count()': 1886}}]}]}

Use news popularity to recommend news with ranking...

In [35]:
from vespa.package import RankProfile, Function

# "popularity" rank profile: the popularity function is clicks / impressions
# (0 when there are no impressions); the first phase adds 10x that ratio to
# nativeRank over title and abstract.
popularity_function = Function(
    name="popularity",
    expression="if (attribute(impressions) > 0, attribute(clicks) / attribute(impressions), 0)",
)
popularity_profile = RankProfile(
    name="popularity",
    inherits="default",
    functions=[popularity_function],
    first_phase="nativeRank(title, abstract) + 10 * popularity",
)
app_package.schema.add_rank_profile(popularity_profile)

In [36]:
# Redeploy the updated application package (now including the popularity profile).
app = vespa_docker.deploy(application_package=app_package)


Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Application is up!
Finished deployment.


In [37]:
# Show the deployment message of the redeployed app.
app.deployment_message

Query using the new popularity signal...

In [38]:
# The 'music' keyword query again, this time ranked with the "popularity" profile.
popularity_query_body = {
    "yql": "select * from sources * where default contains 'music'",
    "ranking": "popularity",
}
res = app.query(body=popularity_query_body)
res.hits[0]

{'id': 'id:news:news::N5870',
 'relevance': 5.156596018746151,
 'source': 'news_content',
 'fields': {'sddocname': 'news',
  'documentid': 'id:news:news::N5870',
  'news_id': 'N5870',
  'category': 'music',
  'subcategory': 'musicnews',
  'title': 'Country music group Alabama reschedules their Indy show until next October 2020',
  'abstract': 'INDIANAPOLIS, Ind.   Fans of the highly acclaimed country music group Alabama, scheduled to play Bankers Life Fieldhouse Saturday night, will have to wait until next year to see the group. The group famous for such notable songs like "If You\'re Gonna Play in Texas", "Love In The First Degree", and "She and I", made the announcement that their 50th Anniversary Tour is being rescheduled till ...',
  'url': 'https://www.msn.com/en-us/music/musicnews/country-music-group-alabama-reschedules-their-indy-show-until-next-october-2020/ar-BBWB0d7?ocid=chopendata',
  'date': 20191108,
  'clicks': 1,
  'impressions': 2}}

In [39]:
from vespa.package import Schema, Document, Field

# Add a "user" schema holding a user_id (fast-search attribute) and a
# 51-dimensional float embedding tensor.
user_id_field = Field(
    name="user_id",
    type="string",
    indexing=["summary", "attribute"],
    attribute=["fast-search"],
)
user_embedding_field = Field(
    name="embedding",
    type="tensor<float>(d0[51])",
    indexing=["summary", "attribute"],
)
user_schema = Schema(
    name="user",
    document=Document(fields=[user_id_field, user_embedding_field]),
)
app_package.add_schema(user_schema)

In [40]:
# List the names of the schemas now present in the application package.
[schema.name for schema in app_package.schemas]


['news', 'user']

In [41]:
from vespa.package import Field, HNSW

# Add an embedding tensor field to the news schema, with an HNSW index using
# euclidean distance; it has the same tensor type as the user embedding.
news_embedding_field = Field(
    name="embedding",
    type="tensor<float>(d0[51])",
    indexing=["attribute", "index"],
    ann=HNSW(distance_metric="euclidean"),
)
app_package.get_schema(name="news").add_fields(news_embedding_field)

In [42]:
from vespa.package import RankProfile

# "recommendation" rank profile: rank news by closeness on the embedding field.
recommendation_profile = RankProfile(
    name="recommendation",
    inherits="default",
    first_phase="closeness(field, embedding)",
)
app_package.get_schema(name="news").add_rank_profile(recommendation_profile)

In [43]:
from vespa.package import QueryTypeField

# Declare the tensor type of the `user_embedding` query feature; it matches the
# type of the embedding fields.
user_embedding_query_field = QueryTypeField(
    name="ranking.features.query(user_embedding)",
    type="tensor<float>(d0[51])",
)
app_package.query_profile_type.add_fields(user_embedding_query_field)


In [44]:
from vespa.deployment import VespaDocker

# Look up the existing container by the name/id "news" and redeploy the
# application package with the user schema, embedding field, recommendation
# profile and query type field added above.
vespa_docker = VespaDocker.from_container_name_or_id("news")
app = vespa_docker.deploy(application_package=app_package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Application is up!
Finished deployment.


In [45]:
# Show the deployment message of the redeployed app.
app.deployment_message

In [46]:
import requests, json

# Download the user and news embeddings. Each HTTP status is checked before
# decoding so a failed download raises an HTTP error instead of a JSON parse error.
user_embeddings_response = requests.get("https://thigm85.github.io/data/mind/mind_demo_user_embeddings_parsed.json")
user_embeddings_response.raise_for_status()
user_embeddings = json.loads(user_embeddings_response.text)

news_embeddings_response = requests.get("https://thigm85.github.io/data/mind/mind_demo_news_embeddings_parsed.json")
news_embeddings_response.raise_for_status()
news_embeddings = json.loads(news_embeddings_response.text)

In [47]:
# Feed every user embedding record into the "user" schema, keyed by user_id.
# Non-200 responses are collected in `user_feed_failures` instead of being
# silently ignored; the loop still continues past a failure.
user_feed_failures = []
for user_embedding in user_embeddings:
    response = app.feed_data_point(
        schema="user",
        data_id=user_embedding["user_id"],
        fields=user_embedding
    )
    if response.status_code != 200:
        user_feed_failures.append((user_embedding["user_id"], response.status_code))


In [48]:
# Update each already-fed news document with its embedding (only the
# "embedding" field is sent). Non-200 responses are collected in
# `embedding_update_failures` instead of being silently ignored.
embedding_update_failures = []
for news_embedding in news_embeddings:
    response = app.update_data(
        schema="news",
        data_id=news_embedding["news_id"],
        fields={"embedding": news_embedding["embedding"]}
    )
    if response.status_code != 200:
        embedding_update_failures.append((news_embedding["news_id"], response.status_code))

In [49]:
def parse_embedding(hit_json):
    """Return the embedding values of a query hit as a new list.

    Args:
        hit_json: A hit dict that contains
            ``hit_json["fields"]["embedding"]["values"]``.

    Returns:
        A new list with the values in their original order.

    Raises:
        KeyError: If the ``fields``, ``embedding`` or ``values`` key is missing.
    """
    # list() copies the values directly; no need to preallocate and fill by index.
    return list(hit_json["fields"]["embedding"]["values"])

def query_user_embedding(user_id):
    """Fetch the embedding stored for ``user_id`` in the ``user`` source.

    Args:
        user_id: Value matched against the ``user_id`` field.

    Returns:
        The embedding of the first hit, as returned by ``parse_embedding``.

    Raises:
        IndexError: If the query returns no hits for ``user_id``.
    """
    result = app.query(body={"yql": "select * from sources user where user_id contains '{}'".format(user_id)})
    hits = result.hits
    if not hits:
        # Same exception type that indexing an empty hit list would raise,
        # but with a message that names the missing user.
        raise IndexError("no user document found for user_id {!r}".format(user_id))
    return parse_embedding(hits[0])


In [50]:
# First five components of the embedding stored for user U63195.
query_user_embedding(user_id="U63195")[:5]

[0.0,
 -0.1694680005311966,
 -0.0703359991312027,
 -0.03539799898862839,
 0.14579899609088898]

In [51]:
# Nearest-neighbor query over the news `embedding` field against the
# `user_embedding` query tensor, with targetHits set to 10.
yql = (
    "select title, category from sources news where "
    "({targetHits:10}nearestNeighbor(embedding, user_embedding))"
)

In [52]:
# Run the nearest-neighbor query for user U63195 with the "recommendation"
# rank profile; the user embedding is passed as a stringified list.
user_vector = query_user_embedding(user_id="U63195")
ann_query_body = {
    "yql": yql,
    "hits": 10,
    "ranking.features.query(user_embedding)": str(user_vector),
    "ranking.profile": "recommendation",
}
result = app.query(body=ann_query_body)

In [53]:
# First two recommended news hits.
result.hits[0:2]

[{'id': 'index:news_content/0/aca03f4ba2274dd95b58db9a',
  'relevance': 0.1460561756063909,
  'source': 'news_content',
  'fields': {'category': 'music',
   'title': 'Broadway Star Laurel Griggs Suffered Asthma Attack Before She Died at Age 13'}},
 {'id': 'index:news_content/0/bd02238644c604f3a2d53364',
  'relevance': 0.14591827245062294,
  'source': 'news_content',
  'fields': {'category': 'tv',
   'title': "Rip Taylor's Cause of Death Revealed, Memorial Service Scheduled for Later This Month"}}]

In [54]:
# Nearest-neighbor query as before, additionally restricted to the 'sports' category.
yql = (
    "select title, category from sources news where "
    "({targetHits:10}nearestNeighbor(embedding, user_embedding)) AND "
    "category contains 'sports'"
)


In [55]:
# Run the category-filtered nearest-neighbor query for user U63195 with the
# "recommendation" rank profile.
user_vector = query_user_embedding(user_id="U63195")
filtered_query_body = {
    "yql": yql,
    "hits": 10,
    "ranking.features.query(user_embedding)": str(user_vector),
    "ranking.profile": "recommendation",
}
result = app.query(body=filtered_query_body)


In [56]:
# First two hits of the category-filtered recommendation query.
result.hits[0:2]

[{'id': 'index:news_content/0/375ea340c21b3138fae1a05c',
  'relevance': 0.14417346200569972,
  'source': 'news_content',
  'fields': {'category': 'sports',
   'title': 'Charles Rogers, former Michigan State football, Detroit Lions star, dead at 38'}},
 {'id': 'index:news_content/0/2b892989020ddf7796dae435',
  'relevance': 0.14404365847394848,
  'source': 'news_content',
  'fields': {'category': 'sports',
   'title': "'Monday Night Football' commentator under fire after belittling criticism of 49ers kicker for missed field goal"}}]

In [57]:
from vespa.package import Schema, Document, Field

# Add a "category_ctr" schema, flagged as a global document, whose only field
# `ctrs` is a float tensor keyed by category.
ctrs_field = Field(
    name="ctrs",
    type="tensor<float>(category{})",
    indexing=["attribute"],
    attribute=["fast-search"],
)
category_ctr_schema = Schema(
    name="category_ctr",
    global_document=True,
    document=Document(fields=[ctrs_field]),
)
app_package.add_schema(category_ctr_schema)


In [58]:
# Give news documents a reference field pointing at a category_ctr document.
category_ctr_ref_field = Field(
    name="category_ctr_ref",
    type="reference<category_ctr>",
    indexing=["attribute"],
)
app_package.get_schema("news").add_fields(category_ctr_ref_field)


In [59]:
from vespa.package import ImportedField

# Import the referenced document's `ctrs` field into the news schema under the
# name `global_category_ctrs`, via the category_ctr_ref reference field.
global_ctrs_import = ImportedField(
    name="global_category_ctrs",
    reference_field="category_ctr_ref",
    field_to_import="ctrs",
)
app_package.get_schema("news").add_imported_field(global_ctrs_import)


In [60]:
# Add a per-document tensor keyed by category, with the same tensor type as `ctrs`.
category_tensor_field = Field(
    name="category_tensor",
    type="tensor<float>(category{})",
    indexing=["attribute"],
)
app_package.get_schema("news").add_fields(category_tensor_field)


In [61]:
from vespa.package import Function

# Rank profile that multiplies embedding closeness by a category CTR score.
# category_ctr sums the product of the document's category_tensor with the
# imported global CTR tensor. The summary features return the inputs and both
# function values with each hit.
category_ctr_function = Function(
    name="category_ctr",
    expression="sum(attribute(category_tensor) * attribute(global_category_ctrs))",
)
nearest_neighbor_function = Function(
    name="nearest_neighbor",
    expression="closeness(field, embedding)",
)
ctr_recommendation_profile = RankProfile(
    name="recommendation_with_global_category_ctr",
    inherits="recommendation",
    functions=[category_ctr_function, nearest_neighbor_function],
    first_phase="nearest_neighbor * category_ctr",
    summary_features=[
        "attribute(category_tensor)",
        "attribute(global_category_ctrs)",
        "category_ctr",
        "nearest_neighbor",
    ],
)
app_package.get_schema("news").add_rank_profile(ctr_recommendation_profile)

In [62]:
from vespa.deployment import VespaDocker

# Look up the existing container by the name/id "news" and redeploy the
# application package with the category_ctr schema, the new news fields and
# the new rank profile.
vespa_docker = VespaDocker.from_container_name_or_id("news")
app = vespa_docker.deploy(application_package=app_package)


Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Waiting for application to come up, 0/300 seconds.
Waiting for application to come up, 5/300 seconds.
Waiting for application to come up, 10/300 seconds.
Waiting for application to come up, 15/300 seconds.
Waiting for application to come up, 20/300 seconds.
Application is up!
Finished deployment.


In [63]:
import requests, json

# Download the global per-category CTR tensor. The HTTP status is checked before
# decoding so a failed download raises an HTTP error instead of a JSON parse error.
global_category_ctr_response = requests.get("https://data.vespa-cloud.com/blog/news/global_category_ctr_parsed.json")
global_category_ctr_response.raise_for_status()
global_category_ctr = json.loads(global_category_ctr_response.text)
global_category_ctr


{'ctrs': {'cells': [{'address': {'category': 'entertainment'},
    'value': 0.029266420380943244},
   {'address': {'category': 'autos'}, 'value': 0.028475809103747123},
   {'address': {'category': 'tv'}, 'value': 0.05374837981352176},
   {'address': {'category': 'health'}, 'value': 0.03531784305129329},
   {'address': {'category': 'sports'}, 'value': 0.05611187986670051},
   {'address': {'category': 'music'}, 'value': 0.05471192953054426},
   {'address': {'category': 'news'}, 'value': 0.04420778372641991},
   {'address': {'category': 'foodanddrink'}, 'value': 0.029256852366228187},
   {'address': {'category': 'travel'}, 'value': 0.025144552013730358},
   {'address': {'category': 'finance'}, 'value': 0.03231013195899643},
   {'address': {'category': 'lifestyle'}, 'value': 0.04423279317474416},
   {'address': {'category': 'video'}, 'value': 0.04006693315980292},
   {'address': {'category': 'movies'}, 'value': 0.03335647459420146},
   {'address': {'category': 'weather'}, 'value': 0.045321

In [64]:
# Feed the CTR tensor as the category_ctr document with data id "global".
response = app.feed_data_point(schema="category_ctr", data_id="global", fields=global_category_ctr)

In [65]:
# Download the per-article update records (reference to the global CTR document
# plus the article's category tensor). The HTTP status is checked before
# decoding so a failed download raises an HTTP error instead of a JSON parse error.
news_category_ctr_response = requests.get("https://data.vespa-cloud.com/blog/news/news_category_ctr_update_parsed.json")
news_category_ctr_response.raise_for_status()
news_category_ctr = json.loads(news_category_ctr_response.text)
news_category_ctr[0]


{'id': 'N3112',
 'fields': {'category_ctr_ref': 'id:category_ctr:category_ctr::global',
  'category_tensor': {'cells': [{'address': {'category': 'lifestyle'},
     'value': 1.0}]}}}

In [66]:
# Apply each update record to its news document. Non-200 responses are
# collected in `category_update_failures` instead of being silently ignored.
category_update_failures = []
for data_point in news_category_ctr:
    response = app.update_data(schema="news", data_id=data_point["id"], fields=data_point["fields"])
    if response.status_code != 200:
        category_update_failures.append((data_point["id"], response.status_code))

In [67]:
def parse_embedding(hit_json):
    """Return the embedding values of a query hit as a new list.

    Args:
        hit_json: A hit dict that contains
            ``hit_json["fields"]["embedding"]["values"]``.

    Returns:
        A new list with the values in their original order.

    Raises:
        KeyError: If the ``fields``, ``embedding`` or ``values`` key is missing.
    """
    # list() copies the values directly; no need to preallocate and fill by index.
    return list(hit_json["fields"]["embedding"]["values"])

def query_user_embedding(user_id):
    """Fetch the embedding stored for ``user_id`` in the ``user`` source.

    Args:
        user_id: Value matched against the ``user_id`` field.

    Returns:
        The embedding of the first hit, as returned by ``parse_embedding``.

    Raises:
        IndexError: If the query returns no hits for ``user_id``.
    """
    result = app.query(body={"yql": "select * from sources user where user_id contains '{}'".format(user_id)})
    hits = result.hits
    if not hits:
        # Same exception type that indexing an empty hit list would raise,
        # but with a message that names the missing user.
        raise IndexError("no user document found for user_id {!r}".format(user_id))
    return parse_embedding(hits[0])
# Nearest-neighbor recommendation for user U33527, ranked with the profile that
# also factors in the global category CTR.
yql = (
    "select * from sources news where "
    "({targetHits:10}nearestNeighbor(embedding, user_embedding))"
)
user_vector = query_user_embedding(user_id="U33527")
ctr_query_body = {
    "yql": yql,
    "hits": 10,
    "ranking.features.query(user_embedding)": str(user_vector),
    "ranking.profile": "recommendation_with_global_category_ctr",
}
result = app.query(body=ctr_query_body)

In [68]:
# Top hit, including the summary features requested by the rank profile.
result.hits[0]


{'id': 'id:news:news::N5316',
 'relevance': 0.008369192847921151,
 'source': 'news_content',
 'fields': {'sddocname': 'news',
  'documentid': 'id:news:news::N5316',
  'news_id': 'N5316',
  'category': 'sports',
  'subcategory': 'football_nfl',
  'title': "Matthew Stafford's status vs. Bears uncertain, Sam Martin will play",
  'abstract': "Stafford's start streak could be in jeopardy, according to Ian Rapoport.",
  'url': "https://www.msn.com/en-us/sports/football_nfl/matthew-stafford's-status-vs.-bears-uncertain,-sam-martin-will-play/ar-BBWwcVN?ocid=chopendata",
  'date': 20191112,
  'clicks': 0,
  'impressions': 1,
  'summaryfeatures': {'attribute(category_tensor)': {'type': 'tensor<float>(category{})',
    'cells': {'sports': 1.0}},
   'attribute(global_category_ctrs)': {'type': 'tensor<float>(category{})',
    'cells': {'entertainment': 0.029266420751810074,
     'autos': 0.0284758098423481,
     'tv': 0.05374838039278984,
     'health': 0.03531784191727638,
     'sports': 0.0561118

In [69]:
# Stop the container. Removal is left commented out, so the stopped container
# is not removed by this cell.
vespa_docker.container.stop()
#vespa_docker.container.remove()
