In [1]:
import math
from typing import Optional
from functools import partial
import json
import os 
import numpy as np 
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

from transformers import AutoTokenizer

from model import MambaEmbedModel

import datasets

In [2]:
# Load the pretrained "state-spaces/mamba-2.8b" checkpoint through the
# project's MambaEmbedModel wrapper, requesting float16 on the CUDA device.
model = MambaEmbedModel.from_pretrained(
    "state-spaces/mamba-2.8b",
    device="cuda",
    dtype=torch.float16,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Smoke test: encode one sentence. `encode` returns a pair, unpacked here as
# (outputs, ssm_state_lst).
outputs, ssm_state_lst = model.encode(
    "Last night I was drinking alone in my room."
)

In [4]:
# Toy retrieval example: one query and three candidate passages.
# NOTE(review): "Napolean" is misspelled. The string is what gets encoded, so
# correcting it would change the embedding and the similarity scores below.
query = "What is the life story of Napolean Bonaparte?"
# doc1: passage about Napoleon Bonaparte (the on-topic document).
doc1 = "Napoleon Bonaparte (born Napoleone di Buonaparte;[1][b] 15 August 1769 – 5 May 1821), later known by his regnal name Napoleon I, was a French emperor and military commander who rose to prominence during the French Revolution and led successful campaigns during the Revolutionary Wars. He was the leader of the French Republic as First Consul from 1799 to 1804, then of the French Empire as Emperor of the French from 1804 until 1814, and briefly again in 1815. His political and cultural legacy endures as a celebrated and controversial leader. He initiated many enduring reforms, but has been criticized for his authoritarian rule. He is considered one of the greatest military commanders in history and his wars and campaigns are still studied at military schools worldwide. However, historians still debate the degree to which he was responsible for the Napoleonic Wars, in which between three and six million people died"
# doc2: passage about the French Revolution (related topic).
doc2 = "The French Revolution[a] was a period of political and societal change in France that began with the Estates General of 1789, and ended with the coup of 18 Brumaire in November 1799 and the formation of the French Consulate. Many of its ideas are considered fundamental principles of liberal democracy,[1] while its values and institutions remain central to modern French political discourse.[2]"
# doc3: passage about bibliography (unrelated topic).
doc3 = """Bibliography (from Ancient Greek: βιβλίον, romanized: biblion, lit. 'book' and -γραφία, -graphía, 'writing'), as a discipline, is traditionally the academic study of books as physical, cultural objects; in this sense, it is also known as bibliology[1] (from Ancient Greek: -λογία, romanized: -logía). English author and bibliographer John Carter describes bibliography as a word having two senses: one, a list of books for further study or of works consulted by an author (or enumerative bibliography); the other one, applicable for collectors, is "the study of books as physical objects" and "the systematic description of books as objects" or descriptive bibliography"""

In [None]:
# Scratch inputs for a second experiment (an Ollama how-to question).
# NOTE(review): this rebinds `query` and `doc1`, which an earlier cell already
# defined, and leaves `doc1` empty. On a top-to-bottom run every later cell
# would therefore encode these values instead -- confirm that is intended.
query = "How do I run a model locally on my laptop with Ollama?"
doc1 = ""

In [11]:
# Encode the query and the three documents in order. From each
# (outputs, state_list) pair only the state list is kept; the outputs land in
# the throwaway name `_`.
(
    (_, query_state_lst),
    (_, doc1_state_lst),
    (_, doc2_state_lst),
    (_, doc3_state_lst),
) = (model.encode(text) for text in (query, doc1, doc2, doc3))

In [12]:
# Inspect the first entry of the query's state list.
query_state_lst[0]

array([ 4.0262230e-03, -3.9898408e-05, -6.2477594e-04, ...,
        1.3457025e-04,  2.3413151e-04,  2.3073424e-03], dtype=float32)

In [43]:
i = 1  # NOTE(review): not referenced by any cell visible here
# Pool each text's state list into one vector by averaging over the list
# entries (axis 0).
# NOTE(review): despite the `_last` suffix these are means over all entries,
# not the final entry.
q_last, d1_last, d2_last, d3_last = (
    np.mean(state_lst, axis=0)
    for state_lst in (
        query_state_lst,
        doc1_state_lst,
        doc2_state_lst,
        doc3_state_lst,
    )
)

In [45]:
import numpy as np

def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two vectors.

    Each input is divided by its L2 norm and the dot product of the resulting
    unit vectors is returned. Intended for 1-D vectors of equal length.

    Args:
        vector_a: First vector (NumPy array or any numeric array-like).
        vector_b: Second vector, same length as ``vector_a``.

    Returns:
        The cosine of the angle between the two vectors. If either vector has
        zero norm the angle is undefined; ``0.0`` is returned in that case
        rather than the NaN (and RuntimeWarning) a division by zero would
        produce.
    """
    # Accept plain lists/tuples too; existing arrays pass through unchanged,
    # so their dtype (and therefore the result's precision) is preserved.
    vector_a = np.asarray(vector_a)
    vector_b = np.asarray(vector_b)

    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)

    # A zero vector has no direction, so normalising it would divide by zero.
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Dot product of the two unit vectors.
    return np.dot(vector_a / norm_a, vector_b / norm_b)

In [46]:
# Similarity between the pooled query vector and the pooled doc1 vector.
cosine_similarity(q_last,d1_last)

0.5531247

In [47]:
# Similarity between the pooled query vector and the pooled doc2 vector.
cosine_similarity(q_last,d2_last)

0.7076628

In [48]:
# Similarity between the pooled query vector and the pooled doc3 vector.
cosine_similarity(q_last,d3_last)

0.63831043

In [49]:
# Inspect the pooled query vector.
q_last

array([-0.00147485,  0.00256407,  0.00043956, ..., -0.00150927,
        0.00414287, -0.00115663], dtype=float32)

In [2]:
# Load the "train" split of the "miracl/hagrid" dataset via Hugging Face
# `datasets`.
hagrid = datasets.load_dataset(
    "miracl/hagrid",
    split="train",
)

Using the latest cached version of the module from /home/jiatongy/.cache/huggingface/modules/datasets_modules/datasets/miracl--hagrid/eacbf943f121db3e04762c358f4498a9891ce13798404e46854992ac0216327d (last modified on Wed Mar 20 16:09:09 2024) since it couldn't be found locally at miracl/hagrid, or remotely on the Hugging Face Hub.


In [5]:
# Show the dataset summary (feature names and row count).
hagrid

Dataset({
    features: ['query_id', 'query', 'quotes', 'answers'],
    num_rows: 1922
})

In [4]:
# Query text of the first example.
hagrid[0]['query']

'When is a language considered dead?'

In [9]:
# Second quote (list index 1) attached to the first example.
hagrid[0]['quotes'][1]

{'idx': 1,
 'docid': '161708#0',
 'text': 'An endangered language, or moribund language, is a language that is at risk of falling out of use as its speakers die out or shift to speaking another language. Language loss occurs when the language has no more native speakers and becomes a "dead language". If no one can speak the language at all, it becomes an "extinct language". A dead language may still be studied through recordings or writings, but it is still dead or extinct unless there are fluent speakers. Although languages have always become extinct throughout human history, they are currently dying at an accelerated rate because of globalization, neocolonialism and linguicide (language killing).'}

In [8]:
# Answers attached to the first example.
hagrid[0]['answers']

[{'answer': 'This can happen when the language has no more speakers at all, or only a few elderly speakers who no longer use the language for communication [1]. If no one can speak the language, it becomes an "extinct language" [2]. Although a dead language may still be studied through recordings or writings, it is still dead or extinct unless there are fluent speakers [2]. It is important to note that linguists distinguish between language "death" and the process where a language becomes a "dead language" through normal language change, leaving the old form with no native speakers [3].',
  'answer_type': 'short',
  'informative': 1,
  'attributable': None,
  'sentences': [{'text': 'This can happen when the language has no more speakers at all, or only a few elderly speakers who no longer use the language for communication [1].',
    'index': 1,
    'answer_type': 'short',
    'informative': 1,
    'attributable': None},
   {'text': 'If no one can speak the language, it becomes an "ext