# Imports

In [None]:
from datasets import load_dataset
from infrastructure.db.mongo import connection as mongo_connection
from infrastructure.db.qdrant import connection as qdrant_connection
from settings import settings
from transformers import AutoModel, AutoTokenizer
from app import gradio_app

# ETL Milestone

In [None]:
# Open the MongoDB database whose name is configured in the project settings.
database_name = settings.DATABASE_NAME
mongo_db = mongo_connection.get_database(database_name)

## Fetch users

In [None]:
# Print every document returned by an unfiltered query on the 'users' collection.
collection_name = 'users'
user_collection = mongo_db[collection_name]

users = user_collection.find()
for user_document in users:
    print(user_document)

## Fetch repos

In [None]:
# Show the link of up to five documents from the 'repositories' collection.
collection_name = 'repositories'
repo_collection = mongo_db[collection_name]

# Materialise the query result: a database cursor can be consumed only once,
# and the following cell iterates over `repos` again.
repos = list(repo_collection.find().limit(5))

print("Repositories Links:")
for repo in repos:
    # pymongo yields documents as dicts (assuming `mongo_connection` is a
    # pymongo client - confirm), so fields are read by key, not by attribute.
    # `.get` prints None rather than failing on a document without the field.
    print(repo.get("link"))

In [None]:
print("Repositories Content:")
# Query again rather than iterating `repos`: that object may be a cursor that
# the previous cell has already exhausted, in which case nothing would print.
for repo in repo_collection.find().limit(5):
    # Documents are read by key (pymongo returns dicts - confirm the client
    # type); `.get` prints None when a document has no 'content' field.
    print(repo.get("content"))

## Fetch YouTube video sources

In [None]:
# Show the link of up to five documents from the 'videos' collection.
collection_name = 'videos'
youtube_collection = mongo_db[collection_name]

# Materialise the query result: a database cursor can be consumed only once,
# and the following cell iterates over `youtube_sources` again.
youtube_sources = list(youtube_collection.find().limit(5))

print("Youtube sources Links:")
for youtube_source in youtube_sources:
    # pymongo yields documents as dicts (assuming `mongo_connection` is a
    # pymongo client - confirm), so fields are read by key, not by attribute.
    # `.get` prints None rather than failing on a document without the field.
    print(youtube_source.get("link"))

In [None]:
print("Youtube sources Content:")
# Query again rather than iterating `youtube_sources`: that object may be a
# cursor the previous cell has already exhausted, so nothing would print.
for youtube_source in youtube_collection.find().limit(5):
    # Documents are read by key (pymongo returns dicts - confirm the client
    # type); `.get` prints None when a document has no 'content' field.
    print(youtube_source.get("content"))

# Feature Pipeline Milestone

## Fetch cleaned repositories data

In [None]:
# Scroll up to five records from 'cleaned_repositories', asking for payloads
# but not vectors, and print each record.
collection_name = 'cleaned_repositories'

scroll_options = {
    "collection_name": collection_name,
    "limit": 5,
    "with_payload": True,
    "with_vectors": False,
}
# The second element of the returned pair is not used here.
records, _ = qdrant_connection.scroll(**scroll_options)

print("Cleaned repositories data:")
for cleaned_record in records:
    print(cleaned_record)

## Fetch cleaned YouTube sources data

In [None]:
# Scroll up to five records from 'cleaned_videos', asking for payloads but
# not vectors, and print each record.
collection_name = 'cleaned_videos'

scroll_options = {
    "collection_name": collection_name,
    "limit": 5,
    "with_payload": True,
    "with_vectors": False,
}
# The second element of the returned pair is not used here.
records, _ = qdrant_connection.scroll(**scroll_options)

print("Cleaned youtube sources data:")
for cleaned_record in records:
    print(cleaned_record)

## Fetch embedded and chunked repositories data

In [None]:
# Scroll up to five records from 'embedded_repositories', asking for payloads
# but not vectors, and print each record.
collection_name = 'embedded_repositories'

scroll_options = {
    "collection_name": collection_name,
    "limit": 5,
    "with_payload": True,
    "with_vectors": False,
}
# The second element of the returned pair is not used here.
records, _ = qdrant_connection.scroll(**scroll_options)

print("Embedded repositories data:")
for embedded_record in records:
    print(embedded_record)

## Fetch embedded YouTube sources data

In [None]:
# Scroll up to five records from 'embedded_videos', asking for payloads but
# not vectors, and print each record.
collection_name = 'embedded_videos'

scroll_options = {
    "collection_name": collection_name,
    "limit": 5,
    "with_payload": True,
    "with_vectors": False,
}
# The second element of the returned pair is not used here.
records, _ = qdrant_connection.scroll(**scroll_options)

print("Embedded youtube sources data:")
for embedded_record in records:
    print(embedded_record)

# Finetuning Milestone

## Fetch instruct dataset data

In [None]:
# Load the 'train' split of the dataset named in settings and show a few rows.
instruct_dataset = load_dataset(path=settings.DATASET_ID, split='train')

print("Example samples:")
# Printing a Dataset object only shows its feature/row-count summary, so the
# rows are iterated and printed individually. The count is capped at the
# dataset length so a split with fewer than five rows does not raise.
sample_count = min(5, len(instruct_dataset))
for sample in instruct_dataset.select(range(sample_count)):
    print(sample)

## Fetch finetuned model

In [None]:
# Load the model and tokenizer identified by settings.HF_MODEL_ID and print a
# short summary of each.
model_name = settings.HF_MODEL_ID

model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_report = [
    "Model Properties:",
    f"Model Name: {model_name}",
    f"Model Architecture: {model.config.architectures}",
    f"Number of Parameters: {model.num_parameters()}",
]
tokenizer_report = [
    # The leading newline separates the two sections in the output.
    "\nTokenizer Properties:",
    f"Vocabulary Size: {tokenizer.vocab_size}",
    f"Tokenizer Type: {type(tokenizer).__name__}",
]

for report_line in model_report + tokenizer_report:
    print(report_line)

# Deploy Milestone

In [None]:
# Launch the `gradio_app` object imported from the `app` module.
# NOTE(review): presumably this starts the Gradio web UI and may block the
# cell until the server is stopped - confirm against the `app` module.
gradio_app.launch()