In [3]:
class Document:
    """A piece of text content with attached metadata and an identifier."""

    def __init__(self, content, metadata=None, doc_id=None, num_chunks=None):
        """
        Create a document.

        :param content: The content of the document.
        :param metadata: Optional metadata dictionary; a new empty dict is used when None.
        :param doc_id: Optional identifier; a random UUID4 string is generated when None.
        :param num_chunks: Optional chunk count, stored as given.
        """
        # Resolve the defaults first; a fresh dict per instance avoids shared state.
        if metadata is None:
            metadata = {}
        if doc_id is None:
            doc_id = str(uuid.uuid4())

        # Assignment order defines the key order produced by to_json().
        self.content = content
        self.metadata = metadata
        self.id = doc_id
        self.num_chunks = num_chunks

    def add_metadata(self, key, value):
        """
        Store a single metadata entry, replacing any existing entry for the key.

        :param key: The key for the metadata item.
        :param value: The value for the metadata item.
        """
        self.metadata[key] = value

    def __str__(self):
        """
        Render the document as two lines: its content, then its metadata.

        :return: A string containing the content and metadata of the document.
        """
        content_line = f"Content: {self.content}"
        metadata_line = f"Metadata: {self.metadata}"
        return "\n".join([content_line, metadata_line])

    def to_json(self):
        """ Serialize the instance attributes to a JSON string """
        payload = {}
        for attr_name, attr_value in vars(self).items():
            # The attribute is called 'id' but the constructor expects 'doc_id'.
            json_key = 'doc_id' if attr_name == 'id' else attr_name
            payload[json_key] = attr_value
        return json.dumps(payload)

    @classmethod
    def from_json(cls, json_str):
        """ Rebuild a Document from a JSON string as produced by to_json() """
        fields = json.loads(json_str)
        return cls(**fields)


In [5]:
import json
# Load one sample event file to explore its structure.
# The encoding is given explicitly: the event data contains non-ASCII characters
# (e.g. "ü"), and without it open() falls back to the platform's locale encoding.
with open('../../data/2024/json/event_1dM9M_fuhrungsmodelle-der-zukunft.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


In [10]:
# Keys that are left out when collecting the remaining keys of the event.
excluded_keys = ['Title', 'Description', 'Date', 'Location', 'Type']
# Every other key of the loaded event, in the order it appears in the data.
difference = [name for name in data if name not in excluded_keys]

In [11]:
# Show which keys of the sample event are not in excluded_keys.
difference

['Speakers']

In [12]:
# Render the descriptive fields as "Label: value" lines, one field per line.
content = "\n".join(
    f"{label}: {data[label]}"
    for label in ('Title', 'Description', 'Date', 'Location', 'Type')
)

In [13]:
# Copy every key listed in `difference` from the event into the metadata dict.
metadata = {name: data[name] for name in difference}

In [14]:
# Inspect the metadata collected for the sample event.
metadata

{'Speakers': ['Marit Heidrich',
  'Mia Konew',
  'Robert Kaletsch',
  'Julia Seeliger']}

In [15]:
# print() is used so the "\n" separators render as real line breaks.
print(content)

Title: Führungsmodelle der Zukunft
Description: No description found
Date: Friday June 14, 2024 12:00 - 12:45 CEST
Location: Innovation Stage
Type: Zukunft der Arbeit, panel


In [17]:
import os
import json
import uuid

# Collects one Document per JSON file that is loaded successfully below.
documents = []

def process_json_data(data, excluded_keys):
    """
    Build a Document from one event dictionary.

    :param data: Event dictionary; the keys 'Title', 'Description', 'Date',
        'Location' and 'Type' are read directly, so a missing one raises KeyError.
    :param excluded_keys: Keys that are left out of the document's metadata.
    :return: A Document whose content is the five labelled fields, one per line,
        and whose metadata holds every key of ``data`` not in ``excluded_keys``.
    """
    text_fields = ('Title', 'Description', 'Date', 'Location', 'Type')
    content = "\n".join(f"{label}: {data[label]}" for label in text_fields)

    # Everything that is not excluded travels along as metadata.
    metadata = {
        name: value
        for name, value in data.items()
        if name not in excluded_keys
    }
    return Document(content, metadata=metadata)
    

# Path to the directory containing JSON files
folder_path = '../../data/2024/json'

# Build one Document per JSON file found in the directory
for filename in os.listdir(folder_path):
    if not filename.endswith('.json'):
        continue  # Skip anything that is not a JSON file

    file_path = os.path.join(folder_path, filename)  # Get the full path of the file

    # The encoding is given explicitly: the event data contains non-ASCII
    # characters, and without it open() uses the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)  # Load the JSON data from the file
        except json.JSONDecodeError:
            # Report the broken file and carry on with the remaining ones
            print(f"Error decoding JSON from the file {filename}")
            continue

    # Kept outside the try block so only decoding problems are reported as such
    documents.append(process_json_data(data, excluded_keys))



In [18]:
# Number of documents built from the JSON files.
len(documents)

106

In [23]:
def parse_event_details(event_string):
    """
    Extract the weekday and the start/end time from an event description.

    The text is scanned line by line for lines that begin with ``Date:``, e.g.
    ``Date: Friday June 14, 2024 10:00 - 10:45 CEST``. If several such lines
    exist, the last one wins.

    :param event_string: Multi-line event text with one ``Label: value`` pair per line.
    :return: A dict with the keys ``'day'``, ``'start_time'`` and ``'end_time'``.
        A missing start or end time is reported as the string ``"None"``. The
        dict is empty when the text contains no ``Date:`` line.
    """
    # Initialize dictionary to hold parsed data
    event_details = {}

    for line in event_string.split('\n'):
        stripped_line = line.strip()

        # Only a line that *starts* with the label is the date field. Matching
        # 'Date:' anywhere in the line would also pick up other fields whose
        # value happens to contain it (e.g. a title such as "Save the Date: ...").
        if not stripped_line.startswith('Date:'):
            continue

        # Remove the label 'Date:' and extra spaces
        date_time_info = stripped_line[len('Date:'):].strip()

        # The first word is the day of the week
        day = date_time_info.split(' ')[0]

        # Placeholders used when no time is given
        start_time = "None"
        end_time = "None"

        colon_index = date_time_info.find(':')
        if colon_index != -1:
            # The time range begins with the (up to two) hour digits that
            # precede the first colon; clamp so the slice never wraps around.
            time_range = date_time_info[max(colon_index - 2, 0):]

            # partition() tolerates a missing "- end" part, unlike indexing
            # the result of split('-').
            start_part, _, end_part = time_range.partition('-')

            # Keep only the first token of each side to drop a trailing timezone
            start_tokens = start_part.split()
            end_tokens = end_part.split()
            if start_tokens:
                start_time = start_tokens[0]
            if end_tokens:
                end_time = end_tokens[0]

        # Populate the event details dictionary
        event_details['day'] = day
        event_details['start_time'] = start_time
        event_details['end_time'] = end_time

    return event_details

# Example usage: the 'Date:' line holds a weekday and a date but no time range,
# so both times are expected to come back as the placeholder string "None".
event_string_only_day = 'Titel: Breakfast for Camp Sleepers\nBeschreibung: No description found\nDate: Friday June 16, 2023\nOrt: Courtyard'

parsed_event_only_day = parse_event_details(event_string_only_day)
print(parsed_event_only_day)


{'day': 'Friday', 'start_time': 'None', 'end_time': 'None'}


In [24]:
# Parse each document's content and store the result under the "date" metadata key.
# The documents are modified in place; re-running overwrites the same key.
for doc in documents:
    doc.add_metadata("date", parse_event_details(doc.content))

In [25]:
# Spot-check the first document, including the newly added "date" metadata.
print(documents[0])

Content: Title: How to Businessplan
Description: No description found
Date: Friday June 14, 2024 10:00 - 10:45 CEST
Location: Beachfront Tent
Type: 
Metadata: {'Speakers': ['Mandy Semkow', 'Fabian Haushahn'], 'date': {'day': 'Friday', 'start_time': '10:00', 'end_time': '10:45'}}


In [26]:
# Print every document, each followed by a separator line.
# NOTE(review): this writes all documents to the cell output — consider
# printing only a slice (e.g. documents[:5]) to keep the notebook readable.
for doc in documents:
    print(doc)
    print("###########")

Content: Title: How to Businessplan
Description: No description found
Date: Friday June 14, 2024 10:00 - 10:45 CEST
Location: Beachfront Tent
Type: 
Metadata: {'Speakers': ['Mandy Semkow', 'Fabian Haushahn'], 'date': {'day': 'Friday', 'start_time': '10:00', 'end_time': '10:45'}}
###########
Content: Title: Neue Herausforderungen im Bereich von Open Source AI & LLM Security
Description: No description found
Date: Friday June 14, 2024 17:00 - 17:45 CEST
Location: Garden Tent
Type: 
Metadata: {'Speakers': ['Jake Petersen', 'Jan Monica'], 'date': {'day': 'Friday', 'start_time': '17:00', 'end_time': '17:45'}}
###########
Content: Title: Verwaltung der Zukunft
Description: No description found
Date: Friday June 14, 2024 14:00 - 14:45 CEST
Location: Tiny Rathaus
Type: Zukunft der Verwaltung, talk
Metadata: {'Speakers': ['Thilak Mahendran'], 'date': {'day': 'Friday', 'start_time': '14:00', 'end_time': '14:45'}}
###########
Content: Title: Start A Startup Pitch Sessions
Description: No descript

In [27]:
# Serialize each document; every list entry is a JSON *string*, not a dict.
json_documents = [doc.to_json() for doc in documents]

In [28]:
# NOTE: json_documents holds JSON strings, so the file contains a JSON array of
# strings; each element has to be decoded again (e.g. with Document.from_json).
# The file name is a plain literal (the f-string prefix had no placeholders) and
# the encoding is explicit so the output does not depend on the platform locale.
with open('documents.json', 'w', encoding='utf-8') as file:
    json.dump(json_documents, file)