In [9]:
from absl import logging

import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import feedparser

# TF-Hub handle of the Universal Sentence Encoder (version 4, per the URL).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the module once at cell execution; `embed` calls this object directly.
model = hub.load(module_url)
# Confirmation message so the reader can see which module was loaded.
print ("module %s loaded" % module_url)
def embed(input):
  """Call the module-level `model` on `input` and return its result unchanged.

  Note: the parameter name shadows the builtin `input`; it is kept because it
  is part of this function's call signature.
  """
  result = model(input)
  return result


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [10]:
def plot_similarity(labels, features, rotation):
  """Draw a heatmap of the inner products of `features` with itself.

  Args:
    labels: Tick labels used on both heatmap axes.
    features: Array-like passed to `np.inner` on both sides
      (assumes one feature vector per label -- TODO confirm with callers).
    rotation: Rotation applied to the x-axis tick labels.

  Returns:
    None. The heatmap is drawn through seaborn as a side effect.
  """
  similarity = np.inner(features, features)
  sns.set(font_scale=1.2)
  # Colour scale is pinned to the [0, 1] range.
  heatmap_options = dict(
      xticklabels=labels,
      yticklabels=labels,
      vmin=0,
      vmax=1,
      cmap="YlOrRd",
  )
  axes = sns.heatmap(similarity, **heatmap_options)
  axes.set_xticklabels(labels, rotation=rotation)
  axes.set_title("Semantic Textual Similarity")

def run_and_plot(messages_):
  """Embed `messages_`, plot their similarity heatmap, and return the embeddings.

  The messages themselves are used as the heatmap labels, with the x-axis
  tick labels rotated by 90.
  """
  embeddings = embed(messages_)
  plot_similarity(messages_, embeddings, 90)
  return embeddings


In [14]:
'''Connect to redis database, if it is running'''
import redis

# decode_responses=True asks the client to return decoded strings.
client = redis.Redis(
    host='localhost',
    port=6379,
    db=0,
    decode_responses=True,
)
# Round-trip to the server and show the result of the ping.
print(client.ping())

True


In [103]:
'''Execute the model and begin to compare similarities'''

# NOTE: `titles` is built by the feed-parsing cell (In [102]); that cell has
# to be executed before this one even though it appears later in the file.
messages = titles
logging.set_verbosity(logging.ERROR)
message_embeddings = embed(messages)

# Convert the model output to plain Python lists so each row can be sliced and printed.
embedding_rows = np.array(message_embeddings).tolist()
for i, message_embedding in enumerate(embedding_rows):
    vec_dimension = len(message_embedding)

    # Fall back to a placeholder line if a row has no matching message.
    header = (
        "Message: {}".format(messages[i])
        if i < len(messages)
        else "No corresponding message for embedding at index {}".format(i)
    )
    print(header)
    print("Embedding size: {}".format(vec_dimension))
    # Only the first three components are shown to keep the output short.
    message_embedding_snippet = ", ".join(map(str, message_embedding[:3]))
    print("Embedding: [{}, ...]\n".format(message_embedding_snippet))


Message: Ilia Topuria urged to get fighting amid  title belt tour: ‘It’s not coming off great, it’s just weird’
Embedding size: 512
Embedding: [-0.00786014087498188, -0.009746174328029156, 0.06660237163305283, ...]

Message: Charles Oliveira weighs up welterweight leap after  300 defeat: ‘Why not move up and do a big fight?’
Embedding size: 512
Embedding: [-0.059396855533123016, -0.05188683792948723, -0.03848329186439514, ...]

Message: Dana White blasts  star Jon Jones over recent drug testing fiasco: ‘He’s literally always in trouble’
Embedding size: 512
Embedding: [-0.005037330556660891, -0.028578992933034897, -0.0041779810562729836, ...]

Message: Tom Aspinall rips ‘Terrible’ start times for  304 card in Manchester: ‘I think it’s just not fair on the fans’
Embedding size: 512
Embedding: [0.03304662927985191, -0.07407993078231812, -0.004538500215858221, ...]

Message: Joe Rogan set to miss  301 card in Brazil, Paul Felder drafts in for South America fighting return
Embedding size: 5

In [104]:
'''Clear all keys in the redis database'''
# FLUSHALL is destructive, so issue it exactly once and branch on that single
# result (previously the command was sent twice: once bare, once in the `if`).
if client.flushall():
    print("All keys have been cleared")
else:
    print("Error clearing keys")


All keys have been cleared


In [105]:
'''Add all articles(messages) to the redis database as JSON objects'''
import json

# Store every entry of `ents` under article:<n>; the pipeline batches the
# writes so they are sent together on execute().
pipeline = client.pipeline()
for i, article in enumerate(ents, start=1):
    redis_key = f"article:{i}"
    pipeline.json().set(redis_key, '$', article)
pipeline.execute()

# Lexicographic order (article:1, article:10, ...); only used to pair each key
# with its own title below, so the exact order does not matter.
keys = sorted(client.keys('article:*'))

# mget with a JSONPath yields one list of matches per key. Keep each title
# paired with the key it came from, and skip documents with no title, so an
# embedding can never be written onto the wrong article.
title_matches = client.json().mget(keys, '$.title')
keyed_titles = [
    (key, matches[0])
    for key, matches in zip(keys, title_matches)
    if matches
]
tit = [title for key, title in keyed_titles]

# Skip the model call entirely when there is nothing to embed.
title_embeddings = np.array(embed(tit)).tolist() if tit else []

pipeline = client.pipeline()
for (key, title), embedding in zip(keyed_titles, title_embeddings):
    pipeline.json().set(key, '$.title_embedding', embedding)
pipeline.execute()

# Show one stored document as a sanity check (prints "null" if article:10 does not exist).
print(json.dumps(client.json().get('article:10'), indent=2))

{
  "published": "2024-04-30T15:00:00-04:00",
  "published_parsed": [
    2024,
    4,
    30,
    19,
    0,
    0,
    1,
    121,
    0
  ],
  "updated": "2024-04-30T15:00:00-04:00",
  "updated_parsed": [
    2024,
    4,
    30,
    19,
    0,
    0,
    1,
    121,
    0
  ],
  "title": "Copa Combate winner Ramiro Jimenez re-signs with Combate Global to new multi-fight deal",
  "title_detail": {
    "type": "text/plain",
    "language": "en",
    "base": "https://www.mmafighting.com/rss/current.xml",
    "value": "Copa Combate winner Ramiro Jimenez re-signs with Combate Global to new multi-fight deal"
  },
  "content": [
    {
      "type": "text/html",
      "language": "en",
      "base": "https://www.mmafighting.com/rss/current.xml",
      "value": "<figure>\n      <img alt=\"\" src=\"https://cdn.vox-cdn.com/thumbor/IBebhCfAHUKPCvfLSqWkhsNGqGg=/0x0:1436x957/1310x873/cdn.vox-cdn.com/uploads/chorus_image/image/73318260/Ramiro_Jimenez_vs_Pablo_Burgos_1316__1_.0.jpg\" />\n        <

In [106]:
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

INDEX_NAME = 'idx:articles_vss'
DOC_PREFIX = 'article:'

try:
    # check to see if index exists
    client.ft(INDEX_NAME).info()
    print('Index already exists!')
except redis.exceptions.ResponseError:
    # Only a server-side error reply (e.g. unknown index) means "create it".
    # Connection failures and programming errors now propagate instead of
    # being mistaken for a missing index by a bare `except:`.

    # schema: text fields pulled from each JSON document, plus the title
    # embedding exposed as a vector field named `vector`.
    schema = (
        TextField('$.link', no_stem=True, as_name='link'),
        TextField('$.author', no_stem=True, as_name='author'),
        TextField('$.summary', no_stem=True, as_name='summary'),
        TextField('$.id', no_stem=True, as_name='id'),
        TextField('$.published', no_stem=True, as_name='published'),
        TextField('$.title', no_stem=True, as_name='title'),
        VectorField('$.title_embedding',
            'FLAT', {
                'TYPE': 'FLOAT32',
                # Must equal the length of the stored title embeddings
                # (the embedding cell's output reports size 512).
                'DIM': 512,
                'DISTANCE_METRIC': 'COSINE',
            },  as_name='vector'
        ),
    )

    # index Definition: cover every JSON document whose key starts with DOC_PREFIX
    definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON)

    # create Index
    client.ft(INDEX_NAME).create_index(fields=schema, definition=definition)


In [107]:

'''Print indexing status'''
info = client.ft(INDEX_NAME).info()

num_docs = info['num_docs']
indexing_failures = info['hash_indexing_failures']
total_indexing_time = info['total_indexing_time']
# Parse as float before scaling: int() raises ValueError on a string such as
# "0.5" and truncates a float fraction to 0, so a partially built index was
# either a crash or reported as 0 percent. round() keeps the printed value an
# integer, e.g. "100 percent".
percent_indexed = round(float(info['percent_indexed']) * 100)


print(f"{num_docs} documents ({percent_indexed} percent) indexed with {indexing_failures} failures in {float(total_indexing_time):.2f} milliseconds")



23 documents (100 percent) indexed with 0 failures in 9.33 milliseconds


In [108]:
import sys

q_text = "Alexandre Pantoja faces off with Steve Erceg before world title fight"
# Embed the single query sentence and serialise it as raw float32 bytes,
# the same element type the index declares for its vector field.
q_embedding = embed([q_text])
q_vector = np.array(q_embedding, dtype=np.float32).tobytes()
print(f"Query vector: {q_vector}")

# KNN search for the 3 nearest title vectors; the distance is exposed as
# `vector_score` and used as the sort key.
query = Query('(*)=>[KNN 3 @vector $query_vector AS vector_score]')
query = query.sort_by('vector_score')
query = query.return_fields('vector_score', 'title', 'link', 'author', 'summary')
query = query.dialect(2)

query_params = {'query_vector': q_vector}
res = client.ft(INDEX_NAME).search(query, query_params)


for doc in res.docs:
    print(f'Document: {doc}\n')


Query vector: b'1a=;U\xac-\xbd\xac\xe6\x8c=ax\x01\xbd\xc6\x96\x82\xbb\x00Js=\xf4Jt=~\xb8\xae\xbb\x93\x80\xa9\xbb\x1c\xfe\xb3\xbcz\x96(\xbd\x97/7=R2\x9d\xbczqY<\xefw.\xbc\x96\x04\x06\xbd\xcd\n\x9c<U\x06\x8e\xbb\xa0E\x7f\xbc\x89(L=1\xecm\xbd\xc3\x88\xc8<Z\xc4_<\t\xa4!=\x95\xb1\x1e=\x96-\x19\xbc]q\xa2;oP\x82\xbcK\x0e\xe0\xbc\x9b\xc0\xa5\xbd3Q\x1c<\xd2\xb2\x8b<].\x85=fq\x99\xbd+\xf5\xcc<1\x94<<.#W\xbcK\\\x81=0\x90\xd1<\xe3\xf3\xef\xbcQ\x87H=k``\xbc\xe9\x1b\xf2\xbc\x0e^\x9c\xbd\xf3\x12\xab=\xc8x\xde<\n\xd0\x98\xbc\x98\x8a \xbdd\x06\x17\xbdEQ\xea<\xc2\xb5\xbb=\xbc\\\xa5\xbdp\x1aH=\xd8\xba\x08\xbdT\xfeT\xbd\n\xbfA\xbdW\xe1-\xbb\x14\x961=\xab)!=\xb7_\x00<k\x12\xc0=\x9d^\x8d\xbd\xca\x11\xc0\xbd\x8d\x1b\xe6<T\xfe\x93=aU\xeb<\xcb&S<]\x00\xa7<\xc19\xdd\xbc\x1b\xb7\x8e\xbd\xc6QR\xbcN\\K=b6\x99<\xf4=\x84=\xd8x\x98=v\xa2\x9e\xbc\x10\xd4&=\xbatW=,Y\x86=\x83\xc6z\xba\x86#\xa5\xbd6M\x93<F$T<\x19\xc7\xd5<\xef\x97\xda<\xfer\xef\xba\x1c\xef\xe3<\x8b\xe2\xb9=\x94\xbf\xeb\xbb\x11\xe01=\x8eI\x1e\xbd\xd0{\xdd\

In [102]:
'''Pull articles from the list of feeds and compile a global list of articles and titles'''

# RSS/Atom feed URLs, processed in this order.
feeds = [
    "http://lowkickmma.com/feed",
    "https://www.mmafighting.com/rss/index.xml",
    "https://www.bloodyelbow.com/rss",
]

# Accumulators filled by the feed loop: cleaned titles and their source entries.
titles, ents = [], []
# 1-based feed counter, advanced by the feed loop.
i = 0

def clean_html(raw_html):
    """Return `raw_html` with every `<...>` span removed.

    The match is non-greedy and does not cross newlines, so a tag that is
    split over several lines is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)

def format_event_name(title: str):
    """Return `title` with each "UFC on ESPN 55" replaced by "UFC Vegas 91"."""
    old_name, new_name = "UFC on ESPN 55", "UFC Vegas 91"
    return title.replace(old_name, new_name)

# Walk the feeds in order, collecting entries into `ents` and their cleaned
# titles into `titles`. `i` is the 1-based position of the current feed.
for i, f in enumerate(feeds, start=1):
    print(f"\n{f}")
    NewsFeed = feedparser.parse(f)
    entries = NewsFeed.entries
    for e in entries:
        # Cumulative cap on len(titles): 7 after the first feed, 15 after the
        # second, 23 after the third.
        if len(titles) >= i * 8 - 1:
            break
        title_lower = e.title.lower()
        if "play-by-play" in title_lower:
            print("skipping")
            continue
        # The previous `"ufc" and "prediction" and "pick" and "odds" in ...`
        # only tested "odds" (the other operands are truthy string literals).
        # Require every keyword, as the `and` chain was written to express.
        if all(word in title_lower for word in ("ufc", "prediction", "pick", "odds")):
            print("skipping")
            continue
        # .get() tolerates entries without a summary instead of raising.
        summary = e.get("summary")
        if summary:                           #Clean html tags from body/summary
            # Item assignment, so the cleaned text is stored in the entry's
            # own mapping data rather than as a separate instance attribute.
            e["summary"] = clean_html(summary)
            print(e["summary"])
        t = format_event_name(e.title).replace("UFC", "")
        print(t)
        ents.append(e)
        titles.append(t)


http://lowkickmma.com/feed
Ilia Topuria urged to get fighting amid  title belt tour: ‘It’s not coming off great, it’s just weird’
Charles Oliveira weighs up welterweight leap after  300 defeat: ‘Why not move up and do a big fight?’
Dana White blasts  star Jon Jones over recent drug testing fiasco: ‘He’s literally always in trouble’
Tom Aspinall rips ‘Terrible’ start times for  304 card in Manchester: ‘I think it’s just not fair on the fans’
Joe Rogan set to miss  301 card in Brazil, Paul Felder drafts in for South America fighting return
Ex- commentator blasts Ronda Rousey amid criticizm of media: ‘The people behind the scenes can’t stand you’
Arman Tsarukyan’s fight purse withheld by NSAC after fan incident at  300, faces disciplinary action

https://www.mmafighting.com/rss/index.xml
Daniel Cormier gives advice to Ilia Topuria after ‘not coming off great’ in booking first title defense
Claressa Shields’ takeaway from first 3 MMA fights: ‘I have potential to be an MMA champion’
Copa C