In [None]:
'''
1. Get api key: secret(left side bar top 4) -> Gemini API keys (unavailable in HK, use vpn)
    as env: GOOGLE_API_KEY
2.
'''
# Print this runtime's public IP details as reported by ipinfo.io —
# presumably to confirm the VPN/region noted above (TODO confirm intent).
!curl https://ipinfo.io/

In [None]:
%%capture
%%bash
# Install the Python packages this notebook needs (no versions are pinned).

# Hugging Face `datasets`, upgraded to the newest available release.
pip install -U datasets


# ColBERT-related dependencies. The original note reads "tpu/gpu", but the
# faiss package installed on the line below is the CPU build.
pip install gitpython
pip install faiss-cpu

# Remaining miscellaneous packages.
pip install pandas gdown shortuuid

In [None]:
%%capture

'''
setup colbert / plaidrepro
'''

import os
original_cwd = os.getcwd()
os.chdir('/tmp')

!rm -rf ./ColBERT ./plaidrepro
!git -C ColBERT/ pull || git clone https://github.com/stanford-futuredata/ColBERT.git

#!mv ./plaidrepro ./ColBERT
import sys; sys.path.insert(0, '/tmp/ColBERT/')

try: # When on google Colab, let's install all dependencies with pip.
    import google.colab
    !pip install -U pip
    !pip install -e ColBERT/['faiss-gpu','torch']
except Exception:
  import sys; sys.path.insert(0, 'ColBERT/')
  try:
    from colbert import Indexer, Searcher
  except Exception:
    print("If you're running outside Colab, please make sure you install ColBERT in conda following the instructions in our README. You can also install (as above) with pip but it may install slower or less stable faiss or torch dependencies. Conda is recommended.")
    assert False

os.chdir(original_cwd)
print(f"Changed back to: {os.getcwd()}")

!pip install bitarray datasets gitpython ninja scipy spacy tqdm transformers ujson flask python-dotenv

In [None]:
'''
load benchmark data
'''
# NOTE: everything inside the triple-quoted string below is NOT executed. It
# is a disabled experiment kept for reference that loaded a Hugging Face
# dataset and displayed its first record. Because that string is the cell's
# last expression, the notebook echoes it as the cell output.
'''
from datasets import load_dataset
mrag_bench = load_dataset("brunokreiner/genius-lyrics", split='train')
display(mrag_bench)

display(mrag_bench[0]['id'])
display(mrag_bench[0]['lyrics'])
'''

In [None]:
%%bash
# Fetch the Chinese lyric corpus archive, unpack it into the current
# directory, then discard the archive.
ZIP_NAME=Chinese_Lyrics.zip
wget -qq "https://github.com/gaussic/Chinese-Lyric-Corpus/raw/refs/heads/master/${ZIP_NAME}"
unzip "${ZIP_NAME}"
rm -f "${ZIP_NAME}"

In [None]:
import os

def find_and_read_txt_files(folder_path):
    """
    Collect every ``.txt`` file below a folder and load its text.

    The folder is walked recursively. Matching files are ordered by their
    full joined path (directory components included, not just the bare file
    name) and each one is read as UTF-8.

    Args:
        folder_path: Directory to search.

    Returns:
        A ``(file_contents, filenames)`` tuple of two parallel lists:
        - file_contents: the full text of each file.
        - filenames: the base name (no directory part) of each file.
    """
    # Sorting happens on the joined path, so the traversal order of os.walk
    # has no influence on the result.
    discovered_paths = sorted(
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(folder_path)
        for name in names
        if name.endswith(".txt")
    )

    file_contents, filenames = [], []
    for path in discovered_paths:
        with open(path, 'r', encoding='utf-8') as handle:
            file_contents.append(handle.read())
        filenames.append(os.path.basename(path))

    return file_contents, filenames

# Load the whole lyric corpus that was extracted into ./Chinese_Lyrics.
folder_to_search = './Chinese_Lyrics'
file_contents, filenames = find_and_read_txt_files(folder_path=folder_to_search)

# The two lists are parallel: entry i of one describes entry i of the other.
# Printing them in full would flood the output, so only the first entry of
# each is shown as a sanity check.
print("First file content:", file_contents[0])
print("First filename:", filenames[0])

In [None]:
#----------------------------------------------------------------------------------

In [None]:
%%bash
# Fetch the pre-trained ColBERTv2 checkpoint and unpack it under /tmp/colbertv.
CHECKPOINT_URL=https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz

cd /tmp

# Start from an empty target directory on every run.
rm -rf colbertv
mkdir -p colbertv
cd colbertv

# Stream the tarball straight into tar rather than saving it first
# (saving variant: wget -qq "$CHECKPOINT_URL" -O colbertv2.0.tar.gz).
wget -qq -O- "$CHECKPOINT_URL" | tar xvz


In [None]:
# Delete ./experiments (the default `root_dir` of the RAG class defined in
# this notebook) — presumably so leftovers from an earlier run are gone
# before re-indexing.
!rm -rf ./experiments

In [None]:
'''
step-1b: RAG system: Plaid
'''

class RAG():
  '''
  Small retrieval helper built on top of a ColBERT index.

    root_dir: ColBERT working directory (passed to ColBERTConfig as `root`)
    storage_path: JSON file holding the indexed passages and their
      annotations. query() uses it to turn the passage ids returned by the
      searcher back into text. The file is written as JSON whatever its
      extension (the default name ends in `.pt`).
  '''

  # Experiment and index names shared by index() and query(). They must stay
  # identical in both places, otherwise the searcher cannot find the index
  # that the indexer wrote.
  _EXPERIMENT = "msmarco"
  _INDEX_NAME = "msmarco.nbits.2"

  def __init__(self, root_dir='./experiments', storage_path='./rag_data.pt'):
    '''Remember the paths and load the storage file if it already exists.'''
    import os

    self._root_dir = root_dir
    self._storage_path = storage_path

    if os.path.exists(self._storage_path):
        print(f"Loading data from {self._storage_path}")
        self._load_storage()
    else:
        print(f"Data file {self._storage_path} not found. Starting with empty data.")
        self._storage = {
            # passages handed to the indexer
            'data': [],
            # one label per passage, same order as 'data'
            'annotation': []
        }

  def _save_storage(self):
    '''Write the current passages/annotations to `storage_path` as JSON.'''
    import json
    with open(self._storage_path, 'w', encoding='utf-8') as f:
      json.dump(self._storage, f, indent=4)

  def _load_storage(self):
    '''Replace the in-memory storage with the JSON content of `storage_path`.'''
    import json
    with open(self._storage_path, 'r', encoding='utf-8') as f:
      self._storage = json.load(f)

  def index(self, data: list[str], annotation: list[str], kmeans_niters=4,
    checkpoint_dir="./colbertv/colbertv2.0"
  ):
    '''
    Build the ColBERT index for `data` (overwriting any existing one) and
    persist `data`/`annotation` to the storage file.

    Warning (from the original author):
      data list should be at least 100 to make it work, otherwise it stucks

    Args:
      data: passages to index.
      annotation: one label per passage, in the same order as `data`.
      kmeans_niters: forwarded to ColBERTConfig.
      checkpoint_dir: path of the ColBERT checkpoint handed to the Indexer.

    Raises:
      ValueError: if `data` and `annotation` differ in length.
    '''
    # query() looks both lists up with the same passage id, so a length
    # mismatch would only surface later as an IndexError. Reject it before
    # the expensive indexing starts.
    if len(data) != len(annotation):
      raise ValueError(
        f"data and annotation must have the same length "
        f"(got {len(data)} and {len(annotation)})"
      )

    # Imported lazily so the class can be created without ColBERT installed.
    from colbert.infra import Run, RunConfig, ColBERTConfig
    from colbert import Indexer

    with Run().context(RunConfig(nranks=1, experiment=self._EXPERIMENT)):
        config = ColBERTConfig(
            nbits=2,
            root=self._root_dir,
            kmeans_niters=kmeans_niters,
            avoid_fork_if_possible=True
        )
        indexer = Indexer(checkpoint=checkpoint_dir, config=config)
        indexer.index(name=self._INDEX_NAME, collection=data, overwrite=True)

    # Persist only after indexing succeeded, so the storage file always
    # describes the index that is on disk.
    self._storage['data'] = data
    self._storage['annotation'] = annotation
    self._save_storage()

    print('overwritten storage file!')

  def query(self, query: str, top_k=5):
    '''
    Search the index and return the `top_k` best matches.

    Returns a dict with three parallel sequences:
      'index': the first element of the searcher result, used here as the
        passage ids
      'data': the stored passage for each id
      'annotation': the stored annotation for each id
    '''
    from colbert.infra import Run, RunConfig, ColBERTConfig
    from colbert import Searcher

    with Run().context(RunConfig(nranks=1, experiment=self._EXPERIMENT)):
      config = ColBERTConfig(
          root=self._root_dir,
      )
      searcher = Searcher(index=self._INDEX_NAME, config=config)
      ranking = searcher.search(query, k=top_k)

    passage_ids = ranking[0]
    return {
      'index': passage_ids,
      'data': [ self._storage['data'][passage_id] for passage_id in passage_ids ],
      'annotation': [ self._storage['annotation'][passage_id] for passage_id in passage_ids ]
    }

#---------------------------[Example usage]----------------------------------
songs_name = file_contents
songs_text = filenames

print('--------------------------use new rag------------------------------------')

!rm -f ./rag_data.json

rag = RAG(
  storage_path='./rag_data.json'
)

query='Who has 11 cats?'

rag.index(data=songs_text, annotation=songs_name,
  checkpoint_dir='/tmp/colbertv/colbertv2.0'
)

answer = rag.query(query=query)
print(f'query: {query}')
print(f'top-k answer: {answer}')
print(f'best answer: {answer["data"][0]} @{answer["annotation"][0]}')

In [None]:
# Re-open the persisted storage and run a second query against the index
# that was built earlier.
query='Lost of sleep. Secret crush'
rag = RAG(storage_path='./rag_data.json')
answer = rag.query(query=query)

print(f'query: {query}')
# One hit per iteration: the annotation serves as the header line, the
# passage text follows it.
for annotation, data in zip(answer["annotation"], answer["data"]):
  print(f'<{annotation}>\n{data}\n\n')

In [None]:
%%bash
# Stop at the first failing command. Without this, a missing rag_data.json
# or a failed/missing `7z` would still reach the final `rm -rf` and delete
# the index even though no archive was produced.
set -euo pipefail

# Drop any archive left by a previous run.
rm -f experiments.7z
# Bundle the passage storage with the index so the archive is self-contained.
mv ./rag_data.json ./experiments/
7z a experiments.7z ./experiments
# Only reached when the archive was created successfully.
rm -rf ./experiments