<a href="https://colab.research.google.com/github/kairamilanifitria/PurpleBox-Intern/blob/main/07Jan_Scraping_Example_Data_Feature_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Scraping

Collect the product name, price, and description from an example PrestaShop e-commerce website, "Magimix".

In [None]:
!pip install requests beautifulsoup4 pandas



page 1 : https://www.magimix.co.uk/6-products

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Listing page to scrape (page 1 of the product catalogue)
url = "https://www.magimix.co.uk/6-products"

# Fetch the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() aborts on an HTTP error instead of
# silently parsing an error page into an empty CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# One dict per product found on the page
products = []

# Every product is looked up inside a <div class="product-description">
for product in soup.find_all('div', class_='product-description'):
    # Product name; 'N/A' when the tag is missing
    name_tag = product.find('h3', class_='product-title')
    name = name_tag.text.strip() if name_tag else 'N/A'

    # Short description; 'N/A' when the tag is missing
    description_tag = product.find('div', class_='description')
    description = description_tag.text.strip() if description_tag else 'N/A'

    # Displayed price text (kept as a string, currency symbol included)
    price_tag = product.find('span', class_='price')
    price = price_tag.text.strip() if price_tag else 'N/A'

    products.append({
        'name': name,
        'price': price,
        'description': description,
    })

# Tabulate the scraped rows
df = pd.DataFrame(products)

# Persist page 1 so it can be combined with the other pages later
df.to_csv('magimix_products_1.csv', index=False)

print("Data scraped and saved to magimix_products_1.csv")


Data scraped and saved to magimix_products_1.csv


page 2: https://www.magimix.co.uk/6-products?page=2

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Listing page to scrape (page 2 of the product catalogue)
url = "https://www.magimix.co.uk/6-products?page=2"

# Fetch the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() aborts on an HTTP error instead of
# silently parsing an error page into an empty CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# One dict per product found on the page
products = []

# Every product is looked up inside a <div class="product-description">
for product in soup.find_all('div', class_='product-description'):
    # Product name; 'N/A' when the tag is missing
    name_tag = product.find('h3', class_='product-title')
    name = name_tag.text.strip() if name_tag else 'N/A'

    # Short description; 'N/A' when the tag is missing
    description_tag = product.find('div', class_='description')
    description = description_tag.text.strip() if description_tag else 'N/A'

    # Displayed price text (kept as a string, currency symbol included)
    price_tag = product.find('span', class_='price')
    price = price_tag.text.strip() if price_tag else 'N/A'

    products.append({
        'name': name,
        'price': price,
        'description': description,
    })

# Tabulate the scraped rows
df = pd.DataFrame(products)

# Persist page 2 so it can be combined with the other pages later
df.to_csv('magimix_products_2.csv', index=False)

print("Data scraped and saved to magimix_products_2.csv")


Data scraped and saved to magimix_products_2.csv


In [None]:
import pandas as pd

# Read both per-page CSVs back from disk, in page order
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('magimix_products_1.csv', 'magimix_products_2.csv')
)

# Stack page 1 on top of page 2 and renumber the rows from zero
combined_df = pd.concat([df1, df2], ignore_index=True)

# Write the merged table out as a single file
combined_df.to_csv('magimix_products.csv', index=False)

print("Combined data saved to magimix_products.csv")

Combined data saved to magimix_products.csv


In [None]:
# Reload the combined CSV from disk and show it as the cell output
df_magimix = pd.read_csv('magimix_products.csv')
df_magimix

Unnamed: 0,name,price,description
0,MINI PLUS FOOD PROCESSOR - SATIN,£200.00,"Small but mighty Food Processor, designed for ..."
1,MINI PLUS FOOD PROCESSOR - BLACK,£180.00,"Small but mighty Food Processor, designed for ..."
2,VERTUO NEXT & MILK - CHROME,£250.00,"The next big cup is here with Vertuo Next, off..."
3,VERTUO NEXT & MILK - DARK GREY,£200.00,"The next big cup is here with Vertuo Next, off..."
4,VERTUO NEXT & MILK - MATT BLACK,£200.00,"The next big cup is here with Vertuo Next, off..."
5,VERTUO NEXT & MILK - WHITE,£200.00,"The next big cup is here with Vertuo Next, off..."
6,5200XL FOOD PROCESSOR - SATIN,£390.00,Iconic multifunctional Food Processor for up t...
7,5200XL FOOD PROCESSOR - BLACK,£370.00,Iconic multifunctional Food Processor for up t...
8,5200XL FOOD PROCESSOR - RED,£370.00,Iconic multifunctional Food Processor for up t...
9,5200XL FOOD PROCESSOR - WHITE,£299.98,Iconic multifunctional Food Processor for up t...


## trial with additional links

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Listing page to scrape (page 1 of the product catalogue)
url = "https://www.magimix.co.uk/6-products"

# Fetch the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() aborts on an HTTP error instead of
# silently parsing an error page into an empty CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# One dict per product found on the page
products = []

# Every product is looked up inside a <div class="product-description">
for product in soup.find_all('div', class_='product-description'):
    # Product name; 'N/A' when the tag is missing
    name_tag = product.find('h3', class_='product-title')
    name = name_tag.text.strip() if name_tag else 'N/A'

    # First anchor carrying an href inside the block is taken as the product URL
    link_tag = product.find('a', href=True)
    product_link = link_tag['href'].strip() if link_tag else 'N/A'

    # Short description; 'N/A' when the tag is missing
    description_tag = product.find('div', class_='description')
    description = description_tag.text.strip() if description_tag else 'N/A'

    # Displayed price text (kept as a string, currency symbol included)
    price_tag = product.find('span', class_='price')
    price = price_tag.text.strip() if price_tag else 'N/A'

    products.append({
        'name': name,
        'link': product_link,
        'price': price,
        'description': description,
    })

# Tabulate the scraped rows
df = pd.DataFrame(products)

# Overwrites the earlier page-1 CSV, now including the 'link' column
df.to_csv('magimix_products_1.csv', index=False)

print("Data scraped and saved to magimix_products_1.csv")

Data scraped and saved to magimix_products_1.csv


In [None]:
# Read the page-1 CSV back from disk and show it as the cell output
df = pd.read_csv('magimix_products_1.csv')
df

Unnamed: 0,name,link,price,description
0,MINI PLUS FOOD PROCESSOR - SATIN,https://www.magimix.co.uk/food-processor/24-mi...,£200.00,"Small but mighty Food Processor, designed for ..."
1,MINI PLUS FOOD PROCESSOR - BLACK,https://www.magimix.co.uk/food-processor/24-mi...,£180.00,"Small but mighty Food Processor, designed for ..."
2,VERTUO NEXT & MILK - CHROME,https://www.magimix.co.uk/nespresso/178-vertuo...,£250.00,"The next big cup is here with Vertuo Next, off..."
3,VERTUO NEXT & MILK - DARK GREY,https://www.magimix.co.uk/nespresso/178-vertuo...,£200.00,"The next big cup is here with Vertuo Next, off..."
4,VERTUO NEXT & MILK - MATT BLACK,https://www.magimix.co.uk/nespresso/178-vertuo...,£200.00,"The next big cup is here with Vertuo Next, off..."
5,VERTUO NEXT & MILK - WHITE,https://www.magimix.co.uk/nespresso/178-vertuo...,£200.00,"The next big cup is here with Vertuo Next, off..."
6,5200XL FOOD PROCESSOR - SATIN,https://www.magimix.co.uk/food-processor/27-52...,£390.00,Iconic multifunctional Food Processor for up t...
7,5200XL FOOD PROCESSOR - BLACK,https://www.magimix.co.uk/food-processor/27-52...,£370.00,Iconic multifunctional Food Processor for up t...
8,5200XL FOOD PROCESSOR - RED,https://www.magimix.co.uk/food-processor/27-52...,£370.00,Iconic multifunctional Food Processor for up t...
9,5200XL FOOD PROCESSOR - WHITE,https://www.magimix.co.uk/food-processor/27-52...,£299.98,Iconic multifunctional Food Processor for up t...


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Listing page to scrape (page 2 of the product catalogue)
url = "https://www.magimix.co.uk/6-products?page=2"

# Fetch the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() aborts on an HTTP error instead of
# silently parsing an error page into an empty CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# One dict per product found on the page
products = []

# Every product is looked up inside a <div class="product-description">
for product in soup.find_all('div', class_='product-description'):
    # Product name; 'N/A' when the tag is missing
    name_tag = product.find('h3', class_='product-title')
    name = name_tag.text.strip() if name_tag else 'N/A'

    # First anchor carrying an href inside the block is taken as the product URL
    link_tag = product.find('a', href=True)
    product_link = link_tag['href'].strip() if link_tag else 'N/A'

    # Short description; 'N/A' when the tag is missing
    description_tag = product.find('div', class_='description')
    description = description_tag.text.strip() if description_tag else 'N/A'

    # Displayed price text (kept as a string, currency symbol included)
    price_tag = product.find('span', class_='price')
    price = price_tag.text.strip() if price_tag else 'N/A'

    products.append({
        'name': name,
        'link': product_link,
        'price': price,
        'description': description,
    })

# Tabulate the scraped rows
df = pd.DataFrame(products)

# Overwrites the earlier page-2 CSV, now including the 'link' column
df.to_csv('magimix_products_2.csv', index=False)

print("Data scraped and saved to magimix_products_2.csv")

Data scraped and saved to magimix_products_2.csv


In [None]:
# Read the page-2 CSV back from disk and show it as the cell output
df = pd.read_csv('magimix_products_2.csv')
df

Unnamed: 0,name,link,price,description
0,VERTUO POP - PACIFIC BLUE,https://www.magimix.co.uk/nespresso/206-vertuo...,£55.00,Add a touch of colour to your life with the Ne...
1,VERTUO POP - LIQUORICE BLACK,https://www.magimix.co.uk/nespresso/206-vertuo...,£59.00,Add a touch of colour to your life with the Ne...
2,BLENDER POWER 3,https://www.magimix.co.uk/blender/209-blender-...,£149.98,Compact & efficient blender. Ideal for up to 3...
3,BLENDER POWER 5XL,https://www.magimix.co.uk/blender/210-blender-...,£239.98,Ultra powerful & extra large blender. Ideal fo...
4,BLENDER POWER 3 PREMIUM,https://www.magimix.co.uk/blender/228-blender-...,£159.98,"Compact & efficient blender, ideal for up to 3..."
5,BLENDER POWER 4 PREMIUM - SATIN,https://www.magimix.co.uk/blender/229-blender-...,£199.98,"Powerful & large blender, ideal for families o..."
6,BLENDER POWER 4 PREMIUM - BLACK,https://www.magimix.co.uk/blender/229-blender-...,£179.98,"Powerful & large blender, ideal for families o..."
7,BLENDER POWER 4 PREMIUM - RED,https://www.magimix.co.uk/blender/229-blender-...,£290.00,"Powerful & large blender, ideal for families o..."
8,BLENDER POWER 4 PREMIUM - CREAM,https://www.magimix.co.uk/blender/229-blender-...,£179.98,"Powerful & large blender, ideal for families o..."
9,BLENDER POWER 5XL PREMIUM,https://www.magimix.co.uk/blender/230-blender-...,£269.98,"Ultra powerful & extra large blender, ideal fo..."


In [None]:
import pandas as pd

# Read both per-page CSVs back from disk, in page order
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('magimix_products_1.csv', 'magimix_products_2.csv')
)

# Stack page 1 on top of page 2 and renumber the rows from zero
combined_df = pd.concat([df1, df2], ignore_index=True)

# Write the merged table out as a single file
combined_df.to_csv('magimix_products.csv', index=False)

print("Combined data saved to magimix_products.csv")

Combined data saved to magimix_products.csv


In [None]:
# Reload the combined listing CSV; `df` is the frame the next cell takes links from
df = pd.read_csv('magimix_products.csv')
df

Unnamed: 0,name,link,price,description
0,MINI PLUS FOOD PROCESSOR - SATIN,https://www.magimix.co.uk/food-processor/24-mi...,£200.00,"Small but mighty Food Processor, designed for ..."
1,MINI PLUS FOOD PROCESSOR - BLACK,https://www.magimix.co.uk/food-processor/24-mi...,£180.00,"Small but mighty Food Processor, designed for ..."
2,VERTUO NEXT & MILK - CHROME,https://www.magimix.co.uk/nespresso/178-vertuo...,£250.00,"The next big cup is here with Vertuo Next, off..."
3,VERTUO NEXT & MILK - DARK GREY,https://www.magimix.co.uk/nespresso/178-vertuo...,£200.00,"The next big cup is here with Vertuo Next, off..."
4,VERTUO NEXT & MILK - MATT BLACK,https://www.magimix.co.uk/nespresso/178-vertuo...,£200.00,"The next big cup is here with Vertuo Next, off..."
5,VERTUO NEXT & MILK - WHITE,https://www.magimix.co.uk/nespresso/178-vertuo...,£200.00,"The next big cup is here with Vertuo Next, off..."
6,5200XL FOOD PROCESSOR - SATIN,https://www.magimix.co.uk/food-processor/27-52...,£390.00,Iconic multifunctional Food Processor for up t...
7,5200XL FOOD PROCESSOR - BLACK,https://www.magimix.co.uk/food-processor/27-52...,£370.00,Iconic multifunctional Food Processor for up t...
8,5200XL FOOD PROCESSOR - RED,https://www.magimix.co.uk/food-processor/27-52...,£370.00,Iconic multifunctional Food Processor for up t...
9,5200XL FOOD PROCESSOR - WHITE,https://www.magimix.co.uk/food-processor/27-52...,£299.98,Iconic multifunctional Food Processor for up t...


In [None]:
# Pull the 'link' column into a plain Python list (one entry per row of df,
# so a URL appears once for every row that carries it)
link_list = df['link'].tolist()
link_list

['https://www.magimix.co.uk/food-processor/24-mini-plus.html',
 'https://www.magimix.co.uk/food-processor/24-mini-plus.html',
 'https://www.magimix.co.uk/nespresso/178-vertuo-next-milk.html',
 'https://www.magimix.co.uk/nespresso/178-vertuo-next-milk.html',
 'https://www.magimix.co.uk/nespresso/178-vertuo-next-milk.html',
 'https://www.magimix.co.uk/nespresso/178-vertuo-next-milk.html',
 'https://www.magimix.co.uk/food-processor/27-5200xl.html',
 'https://www.magimix.co.uk/food-processor/27-5200xl.html',
 'https://www.magimix.co.uk/food-processor/27-5200xl.html',
 'https://www.magimix.co.uk/food-processor/27-5200xl.html',
 'https://www.magimix.co.uk/accessories/36-citrus-press-blender.html',
 'https://www.magimix.co.uk/juicer/79-juice-expert-3-5018399180826.html',
 'https://www.magimix.co.uk/other-products/110-le-glacier-11l-5018399110472.html',
 'https://www.magimix.co.uk/other-products/112-gelato-expert-5018399116801.html',
 'https://www.magimix.co.uk/other-products/115-vision-toaste

In [None]:
# Number of links collected, repeated URLs included
len(link_list)

60

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Rows of detail-page data, one per entry in link_list
detailed_products = []

# Parsed fields keyed by URL. link_list can contain the same URL several
# times, so each page is downloaded and parsed only once and then reused.
page_cache = {}

for link in link_list:
    if link not in page_cache:
        try:
            # Timeout avoids hanging on a stalled connection; raise_for_status()
            # keeps an HTTP error page from being parsed into 'N/A' fields.
            product_response = requests.get(link, timeout=30)
            product_response.raise_for_status()
        except requests.RequestException as exc:
            # Report the bad link and carry on, so one failure does not
            # discard everything scraped so far.
            print(f"Skipping {link}: {exc}")
            continue

        product_soup = BeautifulSoup(product_response.text, 'html.parser')

        # Product name from the page heading
        name_tag = product_soup.find('h1', class_='product-title')
        name = name_tag.text.strip() if name_tag else 'N/A'

        # Displayed price text
        price_tag = product_soup.find('span', itemprop='price')
        price = price_tag.text.strip() if price_tag else 'N/A'

        # Long description (from the tab section)
        description_tab = product_soup.find('div', id='description_longue')
        description = description_tab.text.strip() if description_tab else 'N/A'

        # Specifications (from the tab section), kept as raw text
        specifications_tab = product_soup.find('div', id='product_detail')
        specifications = specifications_tab.text.strip() if specifications_tab else 'N/A'

        page_cache[link] = {
            'name': name,
            'price': price,
            'description': description,
            'specifications': specifications,
        }

    # Same column order as before: parsed fields first, then the source link
    detailed_products.append({**page_cache[link], 'link': link})

# Tabulate the detail rows
df = pd.DataFrame(detailed_products)

# Persist the detail scrape
df.to_csv('magimix_detailed_products.csv', index=False)

print("Detailed data scraped and saved to magimix_detailed_products.csv")

Detailed data scraped and saved to magimix_detailed_products.csv


In [None]:
# Read the detail scrape from the working directory, the same relative
# location it was written to, instead of a hard-coded /content path that
# only resolves when the working directory happens to be /content.
df = pd.read_csv('magimix_detailed_products.csv')
df

Unnamed: 0,name,price,description,specifications,link
0,MINI PLUS FOOD PROCESSOR,£200.00,Revolutionise your kitchen skills and discover...,Reference\n18260\n\n\nTotal capacity\n1.7L\n\n...,https://www.magimix.co.uk/food-processor/24-mi...
1,MINI PLUS FOOD PROCESSOR,£200.00,Revolutionise your kitchen skills and discover...,Reference\n18260\n\n\nTotal capacity\n1.7L\n\n...,https://www.magimix.co.uk/food-processor/24-mi...
2,VERTUO NEXT & MILK,£200.00,Bring the convenience and choice of the coffee...,Reference\n11710\n\n\nTotal capacity\n1.1L\n\n...,https://www.magimix.co.uk/nespresso/178-vertuo...
3,VERTUO NEXT & MILK,£200.00,Bring the convenience and choice of the coffee...,Reference\n11710\n\n\nTotal capacity\n1.1L\n\n...,https://www.magimix.co.uk/nespresso/178-vertuo...
4,VERTUO NEXT & MILK,£200.00,Bring the convenience and choice of the coffee...,Reference\n11710\n\n\nTotal capacity\n1.1L\n\n...,https://www.magimix.co.uk/nespresso/178-vertuo...
5,VERTUO NEXT & MILK,£200.00,Bring the convenience and choice of the coffee...,Reference\n11710\n\n\nTotal capacity\n1.1L\n\n...,https://www.magimix.co.uk/nespresso/178-vertuo...
6,5200XL FOOD PROCESSOR,£299.98,Revolutionise your kitchen skills and discover...,Reference\n18590\n\n\nTotal capacity\n3.6L\n\n...,https://www.magimix.co.uk/food-processor/27-52...
7,5200XL FOOD PROCESSOR,£299.98,Revolutionise your kitchen skills and discover...,Reference\n18590\n\n\nTotal capacity\n3.6L\n\n...,https://www.magimix.co.uk/food-processor/27-52...
8,5200XL FOOD PROCESSOR,£299.98,Revolutionise your kitchen skills and discover...,Reference\n18590\n\n\nTotal capacity\n3.6L\n\n...,https://www.magimix.co.uk/food-processor/27-52...
9,5200XL FOOD PROCESSOR,£299.98,Revolutionise your kitchen skills and discover...,Reference\n18590\n\n\nTotal capacity\n3.6L\n\n...,https://www.magimix.co.uk/food-processor/27-52...


In [None]:
import pandas as pd

# Read the detail scrape from the working directory, the same relative
# location it was written to, instead of a hard-coded /content path.
df = pd.read_csv('magimix_detailed_products.csv')

# Keep only the first row for each product name
df_no_duplicates = df.drop_duplicates(subset=['name'], keep='first')

# Save the de-duplicated table under a new file name
df_no_duplicates.to_csv('magimix_detailed_products_new.csv', index=False)

print("Data with duplicate product names removed and saved to magimix_detailed_products_new.csv")

Data with duplicate product names removed and saved to magimix_detailed_products_new.csv


In [None]:
# Read the de-duplicated file from the working directory, the same relative
# location it was written to, instead of a hard-coded /content path.
df = pd.read_csv('magimix_detailed_products_new.csv')
df

Unnamed: 0,name,price,description,specifications,link
0,MINI PLUS FOOD PROCESSOR,£200.00,Revolutionise your kitchen skills and discover...,Reference\n18260\n\n\nTotal capacity\n1.7L\n\n...,https://www.magimix.co.uk/food-processor/24-mi...
1,VERTUO NEXT & MILK,£200.00,Bring the convenience and choice of the coffee...,Reference\n11710\n\n\nTotal capacity\n1.1L\n\n...,https://www.magimix.co.uk/nespresso/178-vertuo...
2,5200XL FOOD PROCESSOR,£299.98,Revolutionise your kitchen skills and discover...,Reference\n18590\n\n\nTotal capacity\n3.6L\n\n...,https://www.magimix.co.uk/food-processor/27-52...
3,CITRUS PRESS (BLENDER),£20.00,A must-have for freshly-squeezed fruit juice l...,Reference\n17270,https://www.magimix.co.uk/accessories/36-citru...
4,JUICE EXPERT 3,£199.98,Kick start your healthy lifestyle with the Mag...,Reference\n18082\n\n\n0% BPA\nYes\n\n\nWattage...,https://www.magimix.co.uk/juicer/79-juice-expe...
5,LE GLACIER 1.1L,£49.98,"Make fresh, and flavourful frozen treats in mi...",Reference\n11047\n\n\nTotal capacity\n1.1L\n\n...,https://www.magimix.co.uk/other-products/110-l...
6,GELATO EXPERT,£500.00,Professional quality gelato and ice cream at h...,Reference\n11680\n\n\nTotal capacity\n2 x 2L\n...,https://www.magimix.co.uk/other-products/112-g...
7,VISION TOASTER,£140.00,"Your toast, just how you like it\nAs the world...",Reference\n11526\n\n\nWorking capacity\n2 slic...,https://www.magimix.co.uk/other-products/115-v...
8,MULTIFUNCTION STEAMER,£215.00,Steam your way to delicious food\nFor those lo...,Reference\n11581\n\n\nTotal capacity\n2 x 5.5L...,https://www.magimix.co.uk/other-products/116-m...
9,SLICER T190,£160.00,Precision slicing made simple\nEnsure you get ...,Reference\n11651\n\n\nMaterial 1\nStainless St...,https://www.magimix.co.uk/other-products/117-s...


# MiniLM embedding : https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

In [None]:
!pip install pypdf2 python-docx pytesseract pillow sentence-transformers cassandra-driver openai

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting cassandra-driver
  Downloading cassandra_driver-3.29.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting geomet<0.3,>=0.1 (from cassandra-driver)
  Downloading geomet-0.2.1.post1-py3-none-any.whl.metadata (1.0 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading cassandra_driver-3.29.2-cp310-cp310-manylinux_2_17

In [None]:
import os
import uuid
from typing import List, Dict, Any
from PyPDF2 import PdfReader
from docx import Document
from PIL import Image
import pytesseract
from sentence_transformers import SentenceTransformer
import pandas as pd

In [None]:
class DocumentParser:
    """Extract plain text from PDF, DOCX and image files."""

    def parse_pdf(self, file_path: str) -> str:
        """Return the text of every page of the PDF, joined with single spaces.

        Args:
            file_path: Path of the PDF file to read.
        """
        reader = PdfReader(file_path)
        # `or ""` guards against a page yielding no text (None), which would
        # otherwise make str.join raise TypeError.
        return " ".join((page.extract_text() or "") for page in reader.pages)

    def parse_docx(self, file_path: str) -> str:
        """Return the text of every paragraph of the DOCX, one per line.

        Args:
            file_path: Path of the .docx file to read.
        """
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    def parse_image(self, file_path: str) -> str:
        """Return the text that pytesseract's OCR reads from the image.

        Args:
            file_path: Path of the image file to read.
        """
        # The context manager closes the underlying file handle once OCR is done.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

class TextPreprocessor:
    """Split long text into overlapping, whitespace-tokenised chunks."""

    def chunk_text(self, text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """Split `text` into chunks of at most `chunk_size` whitespace-separated tokens.

        Consecutive chunks share `overlap` tokens. Chunking stops as soon as a
        chunk reaches the end of the text, so no trailing chunk is emitted whose
        tokens are all already contained in the previous one.

        Args:
            text: Text to split; tokens are obtained with `str.split()`.
            chunk_size: Maximum number of tokens per chunk (must be positive).
            overlap: Tokens shared between neighbouring chunks
                (must satisfy 0 <= overlap < chunk_size).

        Returns:
            The chunks in order, each re-joined with single spaces. Empty or
            whitespace-only text gives an empty list.

        Raises:
            ValueError: If `chunk_size` / `overlap` would give a non-positive step.
        """
        if chunk_size <= 0 or not 0 <= overlap < chunk_size:
            raise ValueError(
                "chunk_size must be positive and overlap must satisfy "
                "0 <= overlap < chunk_size"
            )

        tokens = text.split()
        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(tokens), step):
            chunks.append(" ".join(tokens[start:start + chunk_size]))
            # This chunk already covers the tail; a further window would only
            # repeat tokens from the overlap region.
            if start + chunk_size >= len(tokens):
                break
        return chunks

class EmbeddingGenerator:
    """Turn lists of strings into embedding vectors with all-MiniLM-L6-v2."""

    def __init__(self):
        """Load the sentence-transformers model and keep it on the instance."""
        model_id = 'sentence-transformers/all-MiniLM-L6-v2'
        self.model = SentenceTransformer(model_id)

    def generate(self, texts: List[str]) -> List[List[float]]:
        """Encode `texts` with the loaded model and return the result as nested lists."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

In [None]:
# Example usage
def main(pdf_path: str = "/content/MODE D'EMPLOI Juice expert UK 460265_BD.pdf") -> List[Dict[str, Any]]:
    """Parse a PDF, chunk its text, embed each chunk and build storage-ready records.

    Args:
        pdf_path: PDF to process. Defaults to the sample manual used in this
            notebook, so calling `main()` with no arguments behaves as before.

    Returns:
        One dict per chunk with a random UUID `id`, the chunk `text`, its
        `embedding`, and `metadata` holding the source file name and chunk index.
    """
    # 1. Build the pipeline stages
    parser = DocumentParser()
    preprocessor = TextPreprocessor()
    embedding_gen = EmbeddingGenerator()

    # 2. Parse the document and split it into chunks
    text = parser.parse_pdf(pdf_path)
    chunks = preprocessor.chunk_text(text)
    if not chunks:
        # Nothing to embed (no extractable text in the PDF)
        return []

    # 3. Generate one embedding per chunk
    embeddings = embedding_gen.generate(chunks)

    # 4. Prepare records for AstraDB storage; the source name is derived from
    #    the path so the two can no longer drift apart.
    source_name = os.path.basename(pdf_path)
    return [
        {
            "id": str(uuid.uuid4()),
            "text": chunk,
            "embedding": embedding,
            "metadata": {
                "source": source_name,
                "chunk_index": i
            }
        }
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]

if __name__ == "__main__":
    main()

In [None]:
# Run the pipeline; as the cell's last expression, the returned records are displayed
main()

[{'id': '15c3cfe0-31fe-4f3a-b773-69d6cd0a6573',
  'embedding': [-0.017070520669221878,
   -0.02153882570564747,
   -0.04350646957755089,
   0.0007505653193220496,
   -0.07049557566642761,
   0.045050930231809616,
   0.03294045478105545,
   0.04098536819219589,
   -0.01811922714114189,
   -0.1148734763264656,
   0.012342450208961964,
   -0.05796150118112564,
   -0.013112171553075314,
   -0.02916647121310234,
   -0.07011472433805466,
   -0.018770847469568253,
   0.006233011372387409,
   -0.0020305232610553503,
   -0.1460181027650833,
   -0.06109420210123062,
   0.03499629721045494,
   -0.10499252378940582,
   -0.026349909603595734,
   -0.014746434986591339,
   -0.06028103083372116,
   0.03145352378487587,
   -0.007861766964197159,
   -0.03814692795276642,
   0.04293709248304367,
   -0.0345095694065094,
   0.04152316600084305,
   -0.0398036427795887,
   0.05465375632047653,
   0.054763760417699814,
   0.041898079216480255,
   -0.04535285383462906,
   0.0249637421220541,
   -0.011272846721

# jinaai embedding : https://huggingface.co/jinaai/jina-embeddings-v3

In [None]:
!pip install transformers torch einops
!pip install 'numpy<2'



In [None]:
class EmbeddingGenerator:
    """Turn lists of strings into embedding vectors with jina-embeddings-v3.

    Redefines the class of the same name from earlier in the notebook, so
    later calls to `EmbeddingGenerator()` use this model.
    """

    def __init__(self):
        """Load the model and keep it on the instance."""
        model_id = "jinaai/jina-embeddings-v3"
        # NOTE(review): trust_remote_code=True executes Python code shipped in
        # the model repository; consider pinning a revision.
        self.model = SentenceTransformer(model_id, trust_remote_code=True)

    def generate(self, texts: List[str]) -> List[List[float]]:
        """Encode `texts` with the loaded model and return the result as nested lists."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

In [None]:
# Example usage
def main(pdf_path: str = "/content/MODE D'EMPLOI Juice expert UK 460265_BD.pdf") -> List[Dict[str, Any]]:
    """Parse a PDF, chunk its text, embed each chunk and build storage-ready records.

    Args:
        pdf_path: PDF to process. Defaults to the sample manual used in this
            notebook, so calling `main()` with no arguments behaves as before.

    Returns:
        One dict per chunk with a random UUID `id`, the chunk `text`, its
        `embedding`, and `metadata` holding the source file name and chunk index.
    """
    # 1. Build the pipeline stages
    parser = DocumentParser()
    preprocessor = TextPreprocessor()
    embedding_gen = EmbeddingGenerator()

    # 2. Parse the document and split it into chunks
    text = parser.parse_pdf(pdf_path)
    chunks = preprocessor.chunk_text(text)
    if not chunks:
        # Nothing to embed (no extractable text in the PDF)
        return []

    # 3. Generate one embedding per chunk
    embeddings = embedding_gen.generate(chunks)

    # 4. Prepare records for AstraDB storage; the source name is derived from
    #    the path so the two can no longer drift apart.
    source_name = os.path.basename(pdf_path)
    return [
        {
            "id": str(uuid.uuid4()),
            "text": chunk,
            "embedding": embedding,
            "metadata": {
                "source": source_name,
                "chunk_index": i
            }
        }
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]

if __name__ == "__main__":
    main()

custom_st.py:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v3:
- custom_st.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

configuration_xlm_roberta.py:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

modeling_xlm_roberta.py:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

block.py:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

mha.py:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

rotary.py:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


stochastic_depth.py:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mlp.py:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- mha.py
- stochastic_depth.py
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- block.py
- embedding.py
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_lora.py
- modeling_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

In [None]:
# Run the pipeline again; as the cell's last expression, the returned records are displayed
main()

[{'id': '71bd253b-29da-425f-ab81-3a129dcdbe6c',
  'embedding': [0.1186404824256897,
   0.025304049253463745,
   0.08754716068506241,
   0.0678085908293724,
   0.09870454668998718,
   -0.008238193579018116,
   -0.053771816194057465,
   0.11028910428285599,
   -0.038415465503931046,
   0.029209960252046585,
   -0.059838831424713135,
   0.11474504321813583,
   -0.10546296834945679,
   -0.003101881593465805,
   -0.0699307844042778,
   -0.045714765787124634,
   -0.10512378811836243,
   0.15929430723190308,
   -0.02585604600608349,
   0.01916254498064518,
   -0.08540385961532593,
   0.0025653981138020754,
   0.004005782771855593,
   0.06774960458278656,
   -0.0652356743812561,
   0.017197364941239357,
   -0.027530550956726074,
   0.03854096680879593,
   -0.09157819300889969,
   -0.019712062552571297,
   -0.0054999240674078465,
   -0.012148457579314709,
   -0.05256868153810501,
   -0.059473007917404175,
   0.03156406432390213,
   0.09796945005655289,
   -0.05194612592458725,
   0.013761097565