# Vector Database

TODO: Luke needs to write this section.

In [2]:
from sklearn.datasets import fetch_openml
from difflib import get_close_matches
import numpy as np
import pandas as pd

# When True, re-download both OpenML datasets and rebuild the joined frame;
# when False, load the previously saved CSV instead.
DOWNLOAD_AND_REPROCESS_DATA = False

if DOWNLOAD_AND_REPROCESS_DATA:

    base_movies_and_ratings = fetch_openml(data_id=43603).frame
    base_movie_revenue = fetch_openml(data_id=43113).frame

    title_possibilites = base_movie_revenue.title.values.tolist()

    def get_match(e: str):
        """Return the position in ``title_possibilites`` of the best fuzzy match for *e*, or None."""
        matches = get_close_matches(e, title_possibilites)
        if not matches:
            return None
        # get_close_matches orders results best-first, so take the first one.
        return title_possibilites.index(matches[0])

    base_movies_and_ratings['lookup_index'] = base_movies_and_ratings.Title.apply(get_match)

    # 'lookup_index' is matched against the index of base_movie_revenue.
    # NOTE(review): assumes that frame has a default 0..n-1 index so list
    # positions and index labels coincide — confirm.
    joined_data = base_movies_and_ratings.join(
        base_movie_revenue[['revenue', 'status']], on='lookup_index', how='inner'
    ).drop(columns=['Revenue_(Millions)'])

    # Clipped revenue divided by the mean of the (unclipped) revenue,
    # rounded to two decimals and scaled to a percentage.
    joined_data['revenue_pct_mean'] = (
        joined_data.revenue.clip(lower=10_000_000, upper=300_000_000) / joined_data.revenue.mean()
    ).round(2) * 100
    joined_data['high_gross'] = joined_data['revenue_pct_mean'] > 100
    joined_data['revenue_log1p'] = np.log1p(joined_data.revenue)
else:
    joined_data = pd.read_csv("Producers Chair Dataset.csv")


# Human-readable label -> target column name in joined_data.
# Defined once here so both branches above share the same mapping.
TARGET_COLUMNS = {
    'Straight Revenue': 'revenue',
    'Revenue as PCT Above a Clipped Mean': 'revenue_pct_mean',
    'Log of Revenue': 'revenue_log1p',
    'High Gross Classifier': 'high_gross',
}


# Feature column names.
FEATURE_COLUMNS = ['Title', 'Year',
       'Description', 'Genre',  'Actors', 'Director',]

In [3]:
from fastembed import TextEmbedding
import pandas as pd

from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings




def embed_and_index_documents(documents: pd.DataFrame, column: str = 'title', embedding_model=None):
    """
    Embed one text column of a dataframe and build a FAISS vector store from it.
    Credit: Google Gemini which created this function

    Args:
        documents: A pandas dataframe containing the text.
        column: The name of the column with the text data.
        embedding_model: The embeddings object handed to ``FAISS.from_documents``.
            Required; it defaults to None only so that it can legally follow
            ``column`` in the signature.

    Returns:
        The FAISS vector store built from the loaded documents.

    Raises:
        ValueError: If ``embedding_model`` is not supplied.
    """
    if embedding_model is None:
        raise ValueError("embedding_model is required")

    loader = DataFrameLoader(documents, page_content_column=column)
    docs = loader.load()
    return FAISS.from_documents(docs, embedding_model)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 64437.63it/s]


In [4]:
# When True, the index is rebuilt from joined_data and saved; when False, it is loaded from disk.
CREATE_VECTOR_DB = False
# Path used both to save and to load the FAISS index.
VECTOR_DB_FILE = "faiss_index"

In [6]:
# The embedding model is needed on both paths: to embed the titles when the
# index is built, and to be passed to FAISS.load_local when a saved index is
# loaded. Creating it only in the build branch left it undefined on load.
embedding_model = FastEmbedEmbeddings(model_name='jinaai/jina-embeddings-v2-base-en')

if CREATE_VECTOR_DB:
    index = embed_and_index_documents(joined_data, column='Title', embedding_model=embedding_model)
    index.save_local(VECTOR_DB_FILE)
else:
    # NOTE(security): allow_dangerous_deserialization=True permits loading a
    # pickled docstore; only load index files produced by this notebook.
    index = FAISS.load_local(VECTOR_DB_FILE, embedding_model, allow_dangerous_deserialization=True)

In [13]:
import networkx as nx


def get_network_graph(query: str, index=index, docs_to_return=40):
    """
    Build a two-hop neighbourhood graph around *query* from the vector index.

    The query is linked to its ``docs_to_return`` nearest documents, and each
    of those documents is linked to its own ``docs_to_return`` nearest
    documents. The graph is positioned with a spring layout weighted by the
    ``similarity`` edge attribute.

    Args:
        query: Text to search the index with; also the central node.
        index: Vector store exposing ``similarity_search_with_score``.
        docs_to_return: Number of hits requested from every search.

    Returns:
        A DataFrame with one row per edge and the columns ``node1``, ``node2``,
        ``similarity``, ``x1``, ``x2``, ``y1``, ``y2``, ``name``, ``is_query``
        and ``connects_to_query``.
    """
    graph = nx.Graph()
    first_hop = index.similarity_search_with_score(query, k=docs_to_return)

    # Self-loop on the query so it yields the single row flagged is_query.
    graph.add_edge(query, query, similarity=1)

    # NOTE(review): the score from a FAISS store is presumably a distance
    # (lower = closer), yet it is used below as an attraction weight — confirm
    # this weighting is intended.
    for hit, hit_score in first_hop:
        graph.add_edge(query, hit.page_content, similarity=hit_score)
        second_hop = index.similarity_search_with_score(hit.page_content, k=docs_to_return)
        for neighbour, neighbour_score in second_hop:
            graph.add_edge(hit.page_content, neighbour.page_content, similarity=neighbour_score)

    positions = nx.drawing.spring_layout(graph, weight='similarity')

    records = []
    for source, target, attrs in nx.to_edgelist(graph):
        source_is_query = source == query
        target_is_query = target == query
        records.append({
            'node1': source,
            'node2': target,
            'similarity': attrs['similarity'],
            "x1": positions[source][0],
            "x2": positions[target][0],
            "y1": positions[source][1],
            "y2": positions[target][1],
            'name': source + "  |  " + target,
            "is_query": source_is_query and target_is_query,
            "connects_to_query": source_is_query or target_is_query,
        })

    return pd.DataFrame(records)

def get_difflib_graph(query: str, index=index, docs_to_return=40):
    """
    Build a two-hop title graph around *query* using difflib string matching.

    The query is linked to its closest titles in ``joined_data.Title``, and
    each matched title is linked to its own closest titles. Every edge gets
    the same ``similarity`` value of 0.2, and the spring layout is unweighted.

    Args:
        query: Text to match against the titles; also the central node.
        index: Unused; accepted so the signature mirrors ``get_network_graph``.
        docs_to_return: Unused; accepted for the same reason.

    Returns:
        A DataFrame with one row per edge and the columns ``node1``, ``node2``,
        ``similarity``, ``x1``, ``x2``, ``y1``, ``y2``, ``name``, ``is_query``
        and ``connects_to_query``.
    """
    candidate_titles = joined_data.Title.values

    G = nx.Graph()
    # cutoff=0.1 is well below difflib's default of 0.6, so loose matches count.
    titles = get_close_matches(query, candidate_titles, cutoff=0.1)
    # Self-loop on the query so it yields the single row flagged is_query.
    G.add_edge(query, query, similarity=0.2)
    for t in titles:
        G.add_edge(query, t, similarity=0.2)
        # Second hop: neighbours of the matched title itself. Searching for
        # `query` here (as before) repeated the same lookup on every pass.
        for neighbour in get_close_matches(t, candidate_titles):
            G.add_edge(t, neighbour, similarity=0.2)

    layout = nx.drawing.spring_layout(G)
    edge_rows = [
        {
            'node1': node1,
            'node2': node2,
            'similarity': attrs['similarity'],
            "x1": layout[node1][0],
            "x2": layout[node2][0],
            "y1": layout[node1][1],
            "y2": layout[node2][1],
            'name': node1 + "  |  " + node2,
            "is_query": node1 == query and node2 == query,
            "connects_to_query": node1 == query or node2 == query,
        }
        for node1, node2, attrs in nx.to_edgelist(G)
    ]

    return pd.DataFrame(edge_rows)


import altair as alt

def make_vector_embed_chart(query: str, difflib=False)-> alt.LayerChart:
    """
    Draw the neighbourhood graph of *query* as a layered Altair chart.

    Args:
        query: Text whose neighbourhood is charted.
        difflib: When truthy the edges come from ``get_difflib_graph``,
            otherwise from ``get_network_graph``.

    Returns:
        A layered chart made of points at (x1, y1), rules from (x1, y1) to
        (x2, y2) and points at (x2, y2), with axes hidden.
    """
    build_edges = get_difflib_graph if difflib else get_network_graph
    df = build_edges(query)

    # Shared encoding: every layer reads the same edge endpoints and colours
    # rows by whether they are the query's own row.
    query_palette = alt.Scale(range=["#357edd", "#B942CB"], domain=[False, True])
    base = alt.Chart(df).encode(
        x=alt.X("x1:Q"),
        y=alt.Y("y1:Q"),
        x2=alt.X2("x2:Q"),
        y2=alt.Y2("y2:Q"),
        color=alt.Color("is_query:N", legend=None, scale=query_palette),
    )

    start_points = base.mark_point(filled=True, color="#64C169", fillOpacity=1).encode(
        size=alt.Size("is_query:N", scale=alt.Scale(range=[100, 400]), legend=None),
        tooltip="name:N",
    )
    connectors = base.mark_rule(strokeWidth=0.4, stroke='gray')
    end_points = base.mark_point(filled=True, fillOpacity=1, size=100).encode(
        x=alt.X("x2:Q"), y=alt.Y("y2:Q"), tooltip="name:N"
    )

    chart = start_points + connectors + end_points
    chart.title = f"Vector Embedding Map for {query}"
    chart.background = "#69F7BE"
    chart.width = 650
    chart.height = 400

    without_axes = chart.configure_axis(labels=False, title=None, grid=False, ticks=False)
    return without_axes.configure_title(fontSize=20, color="#DB5D57")





In [19]:
# Chart the vector-search neighbourhood of a sample title.
TITLE = "Three Nights in LA"
make_vector_embed_chart(query=TITLE)

  for col_name, dtype in df.dtypes.iteritems():


In [20]:
# Same title, but with the edges produced by difflib string matching.
make_vector_embed_chart(query=TITLE, difflib=True)

  for col_name, dtype in df.dtypes.iteritems():


In [None]:
# Scratch cell: compute a spring layout weighted by the 'similarity' attribute.
# spring_layout has no `faiss_db` parameter, so that argument was dropped.
# NOTE(review): G is not assigned at notebook scope in the cells shown here —
# presumably it came from an earlier session; confirm before running.
layout = nx.drawing.spring_layout(G, weight='similarity')

In [None]:
import altair as alt

# Scratch cell: scatter every laid-out node, with the node name as tooltip.
# NOTE(review): `layout` and `query` are presumably left over from earlier
# cells — confirm they exist before running.
df = pd.DataFrame.from_dict(layout, orient='index', columns=['x1', 'y1'])
center_node = layout[query]

# Placeholder second endpoint at the origin for every node.
df = df.assign(x2=0, y2=0)

base = alt.Chart(df.reset_index()).encode(
    x=alt.X('x1:Q'),
    y=alt.Y('y1:Q'),
    tooltip=alt.Tooltip('index:N'),
)
c = base.mark_point()
c

In [None]:
# Inspect the edges of the scratch graph G.
G.edges

In [None]:
# Inspect df with its index moved into a regular column.
df.reset_index()

In [None]:
# Inspect the computed layout.
layout