In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import xml.etree.ElementTree as ET
from pypdf import PdfReader
import torch

In [2]:
pdf_folder = '../../pdfs'
pdf_path = Path(pdf_folder) / os.listdir(pdf_folder)[2]
pdf_path.absolute()

PosixPath('/home/jupyter-ikharitonov/code/ecosystem1/../../pdfs/Projects_to_candidates.pdf')

In [3]:
reader = PdfReader(pdf_path)

In [4]:
for page in reader.pages:
    # page_text = page.extract_text()
    # page_text = page_text.replace('\n', '')
    # print(page.extract_text())
    pass

In [5]:
min_chunk_char = 10
max_chunk_char = 200

def stupid_chunking(text):
    chunks = []
    lower_bound = 0
    for i in range(len(text)//max_chunk_char):
        higher_bound = lower_bound + max_chunk_char
        chunks.append(text[lower_bound:higher_bound])
        lower_bound += max_chunk_char
    return chunks

chunks = stupid_chunking(' '.join([x.extract_text() for x in reader.pages]))

In [6]:
chunks

['    \n2022 Rancz Lab Projects    CONFIDENTIAL 1 Research Project – Predictive processing in the mouse cortex  Introduction  To understand how the brain works, we need hypotheses about what it is doing.',
 ' Historically, the brain was treated as feedforward processing device, based initially on the connectivity diagram of cortical visual areas compiled in 1991 by Felleman and Van Essen1. One alternative',
 ' framework recently gaining prominence is the so-called predictive processing theory2. It postulates that brains constantly attempt to match bottom-up sensory input with top-down, internally generated',
 ' predictions3-5. Consequently, the most common neuronal computation across many brain areas is processing prediction error signals6. In terms of implementing such a coding strategy, several algorithms',
 ' have been proposed7, however, they lack solid biological support. Similarly, despite efforts to uncover neuronal elements and circuit motifs supporting the computations needed 

In [7]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
cache_folder = '../../disk2/hugghingface_models'

In [9]:
# embedding chunks
model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_folder)
embeddings = model.encode(chunks)

In [10]:
embeddings.shape

(110, 384)

In [11]:
# Save into a "database"

In [12]:
current_id = -1
# temp_list_chunks = []
# temp_list_embeddings = []
test_list = []

for i in range(len(chunks)):
    current_id+=1
    temp_dict = {'id': current_id, 'name': pdf_path.name, 'content': chunks[i]}
    for j in range(embeddings.shape[1]):
        temp_dict[f'dim_{j}'] =  embeddings[i, j]
    test_list.append(temp_dict)
    
pd.DataFrame(test_list).to_csv('../../disk1/vector_database.csv', index=False)

In [13]:
df = pd.read_csv('../../disk1/vector_database.csv')

In [14]:
def get_embedding_array(path):
    df = pd.read_csv(path)
    return [df.iloc[i].values[0] for i in range(df.shape[0])], [df.iloc[i].values[2] for i in range(df.shape[0])], np.vstack([df.iloc[i].values[3:] for i in range(df.shape[0])]).astype(np.float32)

loaded_ids, loaded_chunks, loaded_embeddings = get_embedding_array('../../disk1/vector_database.csv')
loaded_embeddings.shape

(110, 384)

In [15]:
# semantic search
def get_similar_results(query, embeddings, chunks, num_results=5):
    query_embedding = model.encode(query)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cos_scores, k=num_results)
    return list(np.array(chunks)[top_results.indices.numpy()])

In [16]:
print('\n\n'.join(get_similar_results('eye movements', loaded_embeddings, loaded_chunks)))

 Next, we will develop a learned behavioural task involving eye-movements. The innate eye-movements we will investigate in Aim 2 consist of a slow pursuit phase followed by a fast “reset” saccade (Fig

 focus on eye-movements stabilizing the retinal image during self- and external movement. In particular, the optokinetic reflex driven by external motion has been shown to be modulated by the visual c

ure 2C). However, these reflexive saccades are not the only fast gaze shifts mice can do. Increasing evidence supports the presence of stimulus driven, fast targeted eye-movements in mice26,27. Simila

will employ a step-wise development. 1. Directed gaze shifts towards a stationary visual target.  2. Navigation to a stationary visual target using the vestibular platform in closed-loop (i.e. the rot

rly, slow pursuit eye movements driven by the movement of the whole visual field can likely be modulated by top-down cortical activity to focus on target objects. As mice have been shown capab

In [17]:
# Llama local launch

# https://ai.meta.com/blog/5-steps-to-getting-started-with-llama-2/
# https://huggingface.co/docs/transformers/main/en/model_doc/llama2#transformers.LlamaConfig
# https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline

import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer


model_dir = "../../disk1/llama_weights/llama-2-7b-chat-hf"
model = LlamaForCausalLM.from_pretrained(model_dir)


# tokenizer_dir = '../../disk1/llama_weights/tokenizer.model'
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.27it/s]


In [19]:
# pipeline = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.float16, device_map="auto")
# pipeline = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype='auto', device='cuda:0')
pipeline = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype='auto', device_map='auto')

In [20]:
sequences = pipeline('What kind of eye movement relfexes exist in humans?\n', do_sample=True,
top_k=10, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id, max_length=400)

for seq in sequences:
    print(f"{seq['generated_text']}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


What kind of eye movement relfexes exist in humans?
There are several types of eye movement reflexes in humans, including:
1. Pursuit reflex: This reflex causes the eyes to move in a smooth, continuous manner to track a moving object.
2. Saccadic reflex: This reflex causes the eyes to make quick, jerky movements (saccades) to shift the gaze between different points in the visual field.
3. Fixational reflex: This reflex causes the eyes to make small adjustments to the point of gaze in order to maintain a steady fixation on a particular point or object.
4. Smooth pursuit reflex: This reflex causes the eyes to move in a smooth, continuous manner to track a moving object, but without the quick, jerky movements of the saccadic reflex.
5. Optokinetic reflex: This reflex causes the eyes to move in response to a moving visual stimulus, such as a spinning wheel.
6. Vestibulo-ocular reflex: This reflex causes the eyes to move in response to changes in the position and balance of the head, and he