# Data Collection and Processing

## Import libraries and global variables

In [1]:
import sqlite3
import warnings
from pathlib import Path
from typing import List

import chromadb
import pandas as pd
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    RecursiveJsonSplitter,
)
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import JSONLoader, PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch
from transformers import BertTokenizer

if torch.cuda.is_available():
    print("CUDA is available")
else:
    print("CUDA is not available")

CUDA is available


In [2]:
# Define the paths for the SQL and the vector store
path_sql_db = Path("path/sqlite_db.db")
path_chroma_db = Path("path/chroma_db/")

warnings.filterwarnings("ignore")

## Functions

In [3]:
def list_store_infos(vector_store):
    collections = vector_store._client.list_collections()
    print("Store settings:")

    for collection in collections:
        example = collection.peek(limit=1)
        print(f"Collection {collection.name} with ID {collection.id}")
        print(f"Total number of embeddings: {collection.count()}")
        print(f"Example {example['documents']}\n")

In [4]:
def list_tables_and_columns(db_path: str, display_columns: bool = True) -> None:
    """
    Retrieve a list of all tables and their columns from an SQLite database.

    Args:
        db_path (str): The path to the SQLite database file.
        display_columns (bool): Whether to display columns of the tables. Default is True.

    Returns:
        None
    """
    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # Get a list of all tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    # For each table, get a list of all columns
    for table in tables:
        print(f"Table: {table[0]}")
        if display_columns:
            cursor.execute(f"PRAGMA table_info({table[0]});")
            columns = cursor.fetchall()
            for column in columns:
                print(f"  Column: {column[1]}")
        print()

    # Close the connection
    conn.close()

In [5]:
def dataframe_to_sql(df: pd.DataFrame, table_name: str, db_path: str) -> None:
    """
    Converts a pandas DataFrame to a SQL table and stores it in an SQLite database.

    Args:
        df (pandas.DataFrame): The DataFrame to be converted.
        table_name (str): The name of the table to be created in the database.
        db_path (str): The path to the SQLite database file.

    Returns:
        None
    """
    conn = sqlite3.connect(db_path)
    df.to_sql(table_name, conn, if_exists="replace", index=False)
    conn.close()

In [6]:
def delete_table(db_path: str, table_name: str) -> None:
    """
    Delete a table from the SQLite database.

    Args:
        conn (sqlite3.Connection): Connection object to the SQLite database.
        table_name (str): The name of the table to be deleted.

    Returns:
        None
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
        print(f"Table {table_name} deleted successfully.")
    except sqlite3.Error as e:
        print(e)

In [7]:
def execute_sql_query(database_path: str, query: str) -> list:
    """
    Function to execute a SQL query on a specified SQLite database.

    Args:
        database_path (str): The path to the SQLite database.
        query (str): The SQL query to execute.

    Returns:
        list: A list of tuples representing the rows returned by the query.
    """
    conn = sqlite3.connect(database_path)
    c = conn.cursor()
    c.execute(query)
    rows = c.fetchall()
    conn.close()
    return rows

In [8]:
def convert_columns(df, columns_to_convert) -> pd.DataFrame:
    """Removes special characters and converts columns to float."""
    df.loc[:, columns_to_convert] = df.loc[:, columns_to_convert].replace("-", 0)

    for col in columns_to_convert:
        df.loc[:, col] = (
            df.loc[:, col].astype(str).str.replace(",", ".").str.replace("°", "")
        )
        df.loc[:, col] = df.loc[:, col].astype(float)

    return df

In [9]:
def load_pdfs(dir: Path) -> list:
    """
    Load PDF documents from a directory.

    Args:
        dir (Path): The directory containing the PDF files.

    Returns:
        list: A list of loaded PDF documents as langchain Document objects.
    """
    documents = []
    for pdf in dir.iterdir():
        print(f"Reading PDF: {pdf.name}")
        loader = PyMuPDFLoader(str(pdf))
        documents.extend(loader.load())

    return documents

In [10]:
def setup_vector_store(
    path: str,
    documents: List[str],
    collection_name: str,
    embedding_model: HuggingFaceEmbeddings,
) -> Chroma:
    """
    Sets up a vector store for storing document embeddings.

    Args:
        path (str): The path to the directory where the vector store will be persisted.
        documents (List[str]): The list of documents to be added to the vector store.
        collection_name (str): The name of the collection in the vector store.
        embedding_model (HuggingFaceEmbeddings): The embedding model used to generate document embeddings.

    Returns:
        Chroma: The initialized vector store.
    """
    chroma_db = chromadb.PersistentClient(path)

    # If the collection already exists, reset it; otherwise, create a new collection.
    if chroma_db.count_collections() > 0:
        chroma_db.reset()
    else:
        vector_store = Chroma.from_documents(
            documents=documents,
            embedding=embedding_model,
            collection_name=collection_name,
            persist_directory=path,
        )
    print(f"{vector_store._collection.count()} splits uploaded to the vector store.")
    print(f"\nExample split: {vector_store._collection.get()['documents'][0]}")

    return vector_store

In [11]:
def token_count(text) -> int:
    return len(tokenizer.tokenize(text))

## SQL Database

### Import data

In [12]:
df_lamps = pd.read_excel("../../data/01_raw/xlsx/lamps.xlsx")
df_lamps.head(1)

Unnamed: 0,PK I,PK II,PK III,LEDtube Länge in mm,EOC,Philips Bestell-Nr.,EAN1,GPC,Bezeichnung lang,Bezeichnung kurz,...,Rotierende\nEnd-\nkappen,Garantie\n[Jahre],EEL,VE,Menge Palette,Spannung\n[V],kWh/1000h,Link eCat,Link Produktdatenblatt gem. EU-Verordnung,Link EEL Label
0,LEDtube,T8 LEDtube InstantFit KVG/VVG,MASTER,1500.0,871951400000000.0,43168300.0,8719514431683,929003482302,"MASTER LEDtube 1500mm UE 17,6W 840 T8",MAS LEDtube 1500mm UE 17.6W 840 T8,...,ja,10,A,10,520.0,220-240,18,http://www.assets.lighting.philips.com/is/cont...,https://eprel.ec.europa.eu/api/products/lights...,https://eprel.ec.europa.eu/qr/1013081


### Processing

In [13]:
df_lamps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487 entries, 0 to 486
Data columns (total 34 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   PK I                                        487 non-null    object 
 1   PK II                                       487 non-null    object 
 2   PK III                                      487 non-null    object 
 3   LEDtube Länge in mm                         149 non-null    float64
 4   EOC                                         476 non-null    float64
 5   Philips Bestell-Nr.                         478 non-null    float64
 6   EAN1                                        487 non-null    int64  
 7   GPC                                         487 non-null    int64  
 8   Bezeichnung lang                            487 non-null    object 
 9   Bezeichnung kurz                            487 non-null    object 
 10  NEU           

#### Changing column names

In [14]:
df_lamps.columns

Index(['PK I', 'PK II', 'PK III', 'LEDtube Länge in mm', 'EOC',
       'Philips Bestell-Nr.', 'EAN1', 'GPC', 'Bezeichnung lang',
       'Bezeichnung kurz', 'NEU', 'TOPSELLER', 'Sockel', 'Betrieb an',
       'Ersatz für', 'Leistung\n[W]', 'Licht-strom (lm)', 'Lichtstärke (cd)',
       'Aus-\nführung', 'Nutz-lebens-dauer', 'Dimm-bar',
       'Aus-\nAbstrahl-\nwinkel\n[°]', 'Ra\nWert', 'Farb-temperatur',
       'Rotierende\nEnd-\nkappen', 'Garantie\n[Jahre]', 'EEL', 'VE',
       'Menge Palette', 'Spannung\n[V]', 'kWh/1000h', 'Link eCat ',
       'Link Produktdatenblatt gem. EU-Verordnung ', 'Link EEL Label'],
      dtype='object')

In [15]:
df_lamps.columns = (
    df_lamps.columns.str.replace("\n", "")
    .str.replace("-", "")
    .str.replace(" ", "_")
    .str.replace(".", "")
)
df_lamps.columns

Index(['PK_I', 'PK_II', 'PK_III', 'LEDtube_Länge_in_mm', 'EOC',
       'Philips_BestellNr', 'EAN1', 'GPC', 'Bezeichnung_lang',
       'Bezeichnung_kurz', 'NEU', 'TOPSELLER', 'Sockel', 'Betrieb_an',
       'Ersatz_für', 'Leistung[W]', 'Lichtstrom_(lm)', 'Lichtstärke_(cd)',
       'Ausführung', 'Nutzlebensdauer', 'Dimmbar', 'AusAbstrahlwinkel[°]',
       'RaWert', 'Farbtemperatur', 'RotierendeEndkappen', 'Garantie[Jahre]',
       'EEL', 'VE', 'Menge_Palette', 'Spannung[V]', 'kWh/1000h', 'Link_eCat_',
       'Link_Produktdatenblatt_gem_EUVerordnung_', 'Link_EEL_Label'],
      dtype='object')

In [16]:
df_lamps.rename(
    columns={
        "Philips_BestellNr": "Bestell_Nr",
        "Lichtstrom_(lm)": "Lichtstrom",
        "Leistung[W]": "Leistung",
        "Lichtstärke_(cd)": "Lichtstärke",
        "AusAbstrahlwinkel[°]": "Abstrahlwinkel",
        "RaWert": "Ra_Wert_Farbwiedergabe",
        "RotierendeEndkappen": "Rotierende_Endkappen",
        "Garantie[Jahre]": "Garantie",
        "Spannung[V]": "Spannung",
        "Link_eCat_": "eCat_Produktdatenblatt",
        "Link_Produktdatenblatt_gem_EUVerordnung_": "EU_Verordnung_Produktdatenblatt",
        "Link_EEL_Label": "EEL_Label",
        "VE": "Verpackungseinheit_VE",
        "TOPSELLER": "Topseller",
        "NEU": "Neu",
    },
    inplace=True,
)

df_lamps.columns = (
    df_lamps.columns.str.replace("ä", "ae")
    .str.replace("ö", "oe")
    .str.replace("ü", "ue")
)

df_lamps.rename(columns=lambda x: x.replace("PK", "Produktkategorie_PK"), inplace=True)
df_lamps.columns

Index(['Produktkategorie_PK_I', 'Produktkategorie_PK_II',
       'Produktkategorie_PK_III', 'LEDtube_Laenge_in_mm', 'EOC', 'Bestell_Nr',
       'EAN1', 'GPC', 'Bezeichnung_lang', 'Bezeichnung_kurz', 'Neu',
       'Topseller', 'Sockel', 'Betrieb_an', 'Ersatz_fuer', 'Leistung',
       'Lichtstrom', 'Lichtstaerke', 'Ausfuehrung', 'Nutzlebensdauer',
       'Dimmbar', 'Abstrahlwinkel', 'Ra_Wert_Farbwiedergabe', 'Farbtemperatur',
       'Rotierende_Endkappen', 'Garantie', 'EEL', 'Verpackungseinheit_VE',
       'Menge_Palette', 'Spannung', 'kWh/1000h', 'eCat_Produktdatenblatt',
       'EU_Verordnung_Produktdatenblatt', 'EEL_Label'],
      dtype='object')

In [17]:
df_lamps.columns

Index(['Produktkategorie_PK_I', 'Produktkategorie_PK_II',
       'Produktkategorie_PK_III', 'LEDtube_Laenge_in_mm', 'EOC', 'Bestell_Nr',
       'EAN1', 'GPC', 'Bezeichnung_lang', 'Bezeichnung_kurz', 'Neu',
       'Topseller', 'Sockel', 'Betrieb_an', 'Ersatz_fuer', 'Leistung',
       'Lichtstrom', 'Lichtstaerke', 'Ausfuehrung', 'Nutzlebensdauer',
       'Dimmbar', 'Abstrahlwinkel', 'Ra_Wert_Farbwiedergabe', 'Farbtemperatur',
       'Rotierende_Endkappen', 'Garantie', 'EEL', 'Verpackungseinheit_VE',
       'Menge_Palette', 'Spannung', 'kWh/1000h', 'eCat_Produktdatenblatt',
       'EU_Verordnung_Produktdatenblatt', 'EEL_Label'],
      dtype='object')

#### Handling missing values, binary encoding and value imputation

In [18]:
df_lamps.isna().sum()

Produktkategorie_PK_I                0
Produktkategorie_PK_II               0
Produktkategorie_PK_III              0
LEDtube_Laenge_in_mm               338
EOC                                 11
Bestell_Nr                           9
EAN1                                 0
GPC                                  0
Bezeichnung_lang                     0
Bezeichnung_kurz                     0
Neu                                440
Topseller                          437
Sockel                               0
Betrieb_an                           0
Ersatz_fuer                          0
Leistung                             0
Lichtstrom                           0
Lichtstaerke                         0
Ausfuehrung                          0
Nutzlebensdauer                      0
Dimmbar                              0
Abstrahlwinkel                       0
Ra_Wert_Farbwiedergabe               0
Farbtemperatur                       0
Rotierende_Endkappen                 0
Garantie                 

In [19]:
# Check the unique values in selected columns.
exclude_columns = [
    "Produktkategorie_PK_I",
    "Produktkategorie_PK_II",
    "Produktkategorie_PK_III",
    "EOC",
    "Bestell_Nr",
    "EAN1",
    "GPC",
    "Bezeichnung_lang",
    "Bezeichnung_kurz",
    "eCat_Produktdatenblatt",
    "EU_Verordnung_Produktdatenblatt",
    "EEL_Label",
]

for column in df_lamps.columns:
    if column not in exclude_columns:
        unique_values = df_lamps[column].unique()
        print(f"Unique values in '{column}' column:")
        print(unique_values)
        print()

Unique values in 'LEDtube_Laenge_in_mm' column:
[1500. 1200. 1050.  900.  600.    0. 1449. 1149.  849.  549.   nan]

Unique values in 'Neu' column:
[nan 'NEU ' 'NEU']

Unique values in 'Topseller' column:
[nan 'TOPSELLER']

Unique values in 'Sockel' column:
['G13' '-' 'G5' 'GU10' 'G24D-1' 'G24D-2' 'G24D-3' 'G24Q-1' 'G24Q-2'
 'G24Q-3' '2G11' 'GX24Q-2' 'GX24Q-3' 'GX24Q-4' 'G23' 'E27' 'E40' 'E14'
 'GU4' 'G4' 'GY6,35' 'G9' 'R7S' 'S14S' 'G53' 'GU5.3']

Unique values in 'Betrieb_an' column:
['KVG/VVG/230V' '-' 'EVG' 'Universal(KVG/EVG/230V)' 'KVG/EVG/\n230V'
 'EVG*' '220-240V' 'KVG/ VVG' 'KVG/VVG' '220-240' 'KVG/ VVG/230V']

Unique values in 'Ersatz_fuer' column:
[58 36 38 30 18 '-' '58' '36' '18' 80 49 35 54 28 39 21 24 14 50 13 26 55
 32 42 '9/11' 70 100 60 150 125 200 'HPL 80W/SON 70W' 'HPL 125W/SON 70W'
 'HPL 200W/SON 100W' 250 400 40 '60' '100' 75 120 25 20 15 68 '25' '40' 10
 101 102 103 104 105 43]

Unique values in 'Leistung' column:
[17.6 20 21.7 18.2 11.9 '13,5' 14.7 12.5 16 12 8 '

In [20]:
# Handling missing values
df_lamps.loc[df_lamps["EOC"].isna(), "EOC"] = df_lamps["EAN1"].astype(str) + "00"

df_lamps.loc[df_lamps["Bestell_Nr"].isna(), "Bestell_Nr"] = (
    df_lamps["EOC"].astype(str).str[-8:]
)
df_lamps["LEDtube_Laenge_in_mm"].fillna(0, inplace=True)
df_lamps["Menge_Palette"].fillna(500, inplace=True)

In [21]:
# Binary encoding for "Neu" and "Topseller" columns
df_lamps["Neu"].fillna(0, inplace=True)
df_lamps["Neu"].replace(["NEU ", "NEU"], 1, inplace=True)
df_lamps["Topseller"].fillna(0, inplace=True)
df_lamps["Topseller"].replace(["TOPSELLER ", "TOPSELLER"], 1, inplace=True)

print("Unique values in 'Neu' column:", df_lamps["Neu"].unique())
print("Unique values in 'Topseller' column:", df_lamps["Topseller"].unique())

Unique values in 'Neu' column: [0 1]
Unique values in 'Topseller' column: [0 1]


In [22]:
# Value imputation
df_lamps["Rotierende_Endkappen"].replace("-", "nein", inplace=True)
df_lamps["Ausfuehrung"].replace("-", "keine Angabe", inplace=True)
df_lamps["Dimmbar"] = df_lamps["Dimmbar"].str.lower().replace("-", "nein")
df_lamps["Betrieb_an"] = df_lamps["Betrieb_an"].str.replace(
    r".*KVG.*VVG.*", "KVG/VVG", regex=True
)
df_lamps["Betrieb_an"] = df_lamps["Betrieb_an"].str.replace("EVG\*", "EVG", regex=True)
df_lamps["Betrieb_an"] = df_lamps["Betrieb_an"].str.replace(
    r".*Universal.*", "Universal", regex=True
)
df_lamps["Betrieb_an"] = df_lamps["Betrieb_an"].str.replace(
    r".*220-240.*", "230V", regex=True
)

df_lamps.loc[df_lamps["Betrieb_an"] == "KVG/EVG/\n230V", "Betrieb_an"] = "Universal"

print("Unique values in 'Rotierende_Endkappen' column:")
print(df_lamps["Rotierende_Endkappen"].unique())

print("Unique values in 'Ausfuehrung' column:")
print(df_lamps["Ausfuehrung"].unique())

print("Unique values in 'Dimmbar' column:")
print(df_lamps["Dimmbar"].unique())

print("Unique values in 'Betrieb_an' column:")
print(df_lamps["Betrieb_an"].unique())

Unique values in 'Rotierende_Endkappen' column:
['ja' 'nein']
Unique values in 'Ausfuehrung' column:
['keine Angabe' 'matt' 'klar' 'gold' 'smoky']
Unique values in 'Dimmbar' column:
['nein' 'ja']
Unique values in 'Betrieb_an' column:
['KVG/VVG' '-' 'EVG' 'Universal' '230V']


In [23]:
# Drop other product code columns, because we only need the Bestell_Nr (order number)
df_lamps.drop(["EAN1", "GPC", "EOC"], axis=1, inplace=True)

#### Change data types

In [24]:
# Define the columns to convert to integer.
columns_to_int = [
    "LEDtube_Laenge_in_mm",
    "Bestell_Nr",
    "Lichtstrom",
    "Lichtstaerke",
    "Nutzlebensdauer",
    "Abstrahlwinkel",
    "Ra_Wert_Farbwiedergabe",
    "Farbtemperatur",
    "Garantie",
    "Menge_Palette",
]

df_lamps = convert_columns(df_lamps, columns_to_int)
df_lamps.loc[:, columns_to_int] = df_lamps.loc[:, columns_to_int].astype(int)

In [25]:
# Define the columns to convert to float.
columns_to_float = ["kWh/1000h", "Leistung"]
df_lamps = convert_columns(df_lamps, columns_to_float)

In [26]:
# Check again unique values in selected columns.
for column in df_lamps.columns:
    if column not in exclude_columns:
        unique_values = df_lamps[column].unique()
        print(f"Unique values in '{column}' column:")
        print(unique_values)
        print()

Unique values in 'LEDtube_Laenge_in_mm' column:
[1500 1200 1050 900 600 0 1449 1149 849 549]

Unique values in 'Neu' column:
[0 1]

Unique values in 'Topseller' column:
[0 1]

Unique values in 'Sockel' column:
['G13' '-' 'G5' 'GU10' 'G24D-1' 'G24D-2' 'G24D-3' 'G24Q-1' 'G24Q-2'
 'G24Q-3' '2G11' 'GX24Q-2' 'GX24Q-3' 'GX24Q-4' 'G23' 'E27' 'E40' 'E14'
 'GU4' 'G4' 'GY6,35' 'G9' 'R7S' 'S14S' 'G53' 'GU5.3']

Unique values in 'Betrieb_an' column:
['KVG/VVG' '-' 'EVG' 'Universal' '230V']

Unique values in 'Ersatz_fuer' column:
[58 36 38 30 18 '-' '58' '36' '18' 80 49 35 54 28 39 21 24 14 50 13 26 55
 32 42 '9/11' 70 100 60 150 125 200 'HPL 80W/SON 70W' 'HPL 125W/SON 70W'
 'HPL 200W/SON 100W' 250 400 40 '60' '100' 75 120 25 20 15 68 '25' '40' 10
 101 102 103 104 105 43]

Unique values in 'Leistung' column:
[17.6 20.0 21.7 18.2 11.9 13.5 14.7 12.5 16.0 12.0 8.0 0.0 23.0 20.5 15.5
 14.0 31.5 21.5 24.0 18.0 36.0 26.0 16.5 18.5 11.5 10.5 17.0 25.0 7.0 4.7
 4.5 6.5 8.5 9.0 15.0 5.0 19.0 28.5 42.8 34.0

In [27]:
# Save the processed data to an Excel file.
output_path = "../../data/02_processed/lamps_processed.xlsx"
df_lamps.to_excel(output_path, index=False)

### Setup

In [28]:
# Save the processed data to SQL database.
dataframe_to_sql(df_lamps, "lamps", path_sql_db)

In [29]:
list_tables_and_columns(str(path_sql_db), True)

Table: lamps
  Column: Produktkategorie_PK_I
  Column: Produktkategorie_PK_II
  Column: Produktkategorie_PK_III
  Column: LEDtube_Laenge_in_mm
  Column: Bestell_Nr
  Column: Bezeichnung_lang
  Column: Bezeichnung_kurz
  Column: Neu
  Column: Topseller
  Column: Sockel
  Column: Betrieb_an
  Column: Ersatz_fuer
  Column: Leistung
  Column: Lichtstrom
  Column: Lichtstaerke
  Column: Ausfuehrung
  Column: Nutzlebensdauer
  Column: Dimmbar
  Column: Abstrahlwinkel
  Column: Ra_Wert_Farbwiedergabe
  Column: Farbtemperatur
  Column: Rotierende_Endkappen
  Column: Garantie
  Column: EEL
  Column: Verpackungseinheit_VE
  Column: Menge_Palette
  Column: Spannung
  Column: kWh/1000h
  Column: eCat_Produktdatenblatt
  Column: EU_Verordnung_Produktdatenblatt
  Column: EEL_Label



In [30]:
# Test the SQL query execution.
sql_query = "SELECT Bezeichnung_lang FROM lamps LIMIT 1"
execute_sql_query(str(path_sql_db), sql_query)

[('MASTER LEDtube 1500mm UE 17,6W 840 T8 ',)]

## Vector Database

### Document loading

In [31]:
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

In [32]:
path_pdfs = Path("path/")
documents = load_pdfs(path_pdfs)
print(f"\nDoc example: {documents[0]}")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
    length_function=token_count,
    is_separator_regex=False,
)

pdf_splits = text_splitter.split_documents(documents)

Reading PDF: Interact_Pro_Anwendungshandbuch.pdf
Reading PDF: Philips_UV-C_Desinfektion_Whitepaper.pdf
Reading PDF: Wechselhilfe_fuer_Leuchtstoffroehren_2023.pdf
Reading PDF: Philips_Flyer_LED_Umstellung.pdf
Reading PDF: 20230829-sf-2-pager-sustainability-de.pdf
Reading PDF: 20220525-pager-lighting-trends-de.pdf
Reading PDF: 20240123-interact-pro-planungsbroschuere-2023.pdf

Doc example: page_content=' \n1 \n \n \n \n \n \n \nUser Guide \n  \nInteract Pro \nFoundation (ohne Gateway) \n&  \nAdvanced (mit Gateway) \nVersion August 2021 \n \n \n \n \nwww.interact-lighting.com  \n \n' metadata={'source': '../../data/01_raw/pdfs/Interact_Pro_Anwendungshandbuch.pdf', 'file_path': '../../data/01_raw/pdfs/Interact_Pro_Anwendungshandbuch.pdf', 'page': 0, 'total_pages': 55, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20210827102213+02'00'", 'modDa

In [33]:
# path_json = "..//data/02_processed/lamps_raw.json"

# json_loader = JSONLoader(
#     file_path=path_json, jq_schema=".[]", text_content=False, json_lines=True
# )

# json_data = json_loader.load()

### Setup

In [34]:
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

vector_store = setup_vector_store(
    path=str(path_chroma_db),
    documents=pdf_splits,
    collection_name="technical_documents",
    embedding_model=embedding_model,
)

170 splits uploaded to the vector store.

Example split: 1 
 
 
 
 
 
 
User Guide 
  
Interact Pro 
Foundation (ohne Gateway) 
&  
Advanced (mit Gateway) 
Version August 2021 
 
 
 
 
www.interact-lighting.com


In [36]:
list_store_infos(vector_store)

Store settings:
Collection technical_documents with ID 3b06e76a-81f2-4ac0-90ae-ec4fa33bb081
Total number of embeddings: 170
Example ['1 \n \n \n \n \n \n \nUser Guide \n  \nInteract Pro \nFoundation (ohne Gateway) \n&  \nAdvanced (mit Gateway) \nVersion August 2021 \n \n \n \n \nwww.interact-lighting.com']



In [37]:
vector_store.persist()