In [1]:
class Document:
    """
    A piece of text content together with free-form metadata and an identifier.

    Attributes set by the constructor, in this order: ``content``, ``metadata``,
    ``id`` and ``num_chunks``.
    """

    def __init__(self, content, metadata=None, doc_id=None, num_chunks=None):
        """
        Create a document.

        :param content: The content of the document.
        :param metadata: Optional metadata dict; a new empty dict is used when omitted.
        :param doc_id: Optional identifier; a random UUID4 string is generated when omitted.
        :param num_chunks: Optional value stored unchanged on the instance.
        """
        self.content = content
        # A fresh dict per instance avoids sharing one mutable default.
        self.metadata = {} if metadata is None else metadata
        self.id = str(uuid.uuid4()) if doc_id is None else doc_id
        self.num_chunks = num_chunks

    def add_metadata(self, key, value):
        """
        Store ``value`` under ``key`` in the metadata, replacing any previous entry.

        :param key: The key for the metadata item.
        :param value: The value for the metadata item.
        """
        self.metadata[key] = value

    def __str__(self):
        """
        Render the document as two labelled parts: content, then metadata.

        :return: A string containing the content and metadata of the document.
        """
        return f"Content: {self.content}\nMetadata: {self.metadata}"

    def to_json(self):
        """
        Serialize every instance attribute to a JSON string.

        The attribute ``id`` is written under the key ``doc_id`` so that the
        result can be passed straight back into the constructor.

        :return: The JSON string.
        """
        payload = {}
        for attr_name, attr_value in vars(self).items():
            json_key = 'doc_id' if attr_name == 'id' else attr_name
            payload[json_key] = attr_value
        return json.dumps(payload)

    @classmethod
    def from_json(cls, json_str):
        """
        Build a Document from a JSON string such as the one produced by ``to_json``.

        :param json_str: JSON object whose keys match the constructor's parameters.
        :return: A new instance of the class.
        """
        constructor_kwargs = json.loads(json_str)
        return cls(**constructor_kwargs)


In [2]:
import json
# The event files contain non-ASCII text (e.g. names with umlauts, en dashes),
# so the encoding is pinned instead of relying on the platform default.
with open('../data/2023/json/event_1MkgJ_the-story-of-europe-panel-talk.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


In [20]:
# Keys listed here are filtered out; `difference` keeps all remaining keys
# of `data` in their original order.
excluded_keys = ['title', 'description', 'date', 'location']
difference = [field for field in data if field not in excluded_keys]

In [21]:
# Display the keys of `data` that are not in `excluded_keys`.
difference

['speakers', 'moderators', 'type']

In [22]:
# One labelled line per field, joined with newlines.
content = "\n".join([
    f"Titel: {data['title']}",
    f"Beschreibung: {data['description']}",
    f"Datum: {data['date']}",
    f"Ort: {data['location']}",
])

In [23]:
# Copy every key listed in `difference` from `data` into a new dict.
metadata = {field: data[field] for field in difference}

In [24]:
# Display the metadata dict built from the non-excluded keys.
metadata

{'speakers': [{'name': 'Fabian Böck',
   'role': 'Founder',
   'organization': 'Europe Think Factory'},
  {'name': 'Erdem Ovacik',
   'role': 'Founder and Executive Board Member',
   'organization': 'Donkey Republic'},
  {'name': 'Elina Åkerlind',
   'role': 'CEO and founder',
   'organization': 'Nordic Node AB'},
  {'name': 'Sakina Turabali',
   'role': 'Co-founder/Chief Growth Officer',
   'organization': 'CodeEasy'}],
 'moderators': [{'name': 'Linda-Maraike Plath',
   'role': 'Projektkoordination SEEd Schule (Social Entrepreneurship Education)',
   'organization': 'opencampus.sh'}],
 'type': 'Startup, Panel'}

In [10]:
# print() renders the embedded newlines, giving one labelled field per line.
print(content)

Titel: The Story of Europe – Panel Talk
Beschreibung: Europe Think Factory – Panel Talk:Join us as we're highlighting four speakers from four different ecosystems in the North to discuss the "Story of Europe", to open up about their experience in international collaboration, and to share their overall vision for a better together here in Europe.Europe Think Factory is a think tank working on a new narrative of collaboration for innovation, technology, and startups in Europe. Non-profit, independent, a founders collective.
Datum: Friday June 16, 2023 11:00 - 11:45 CEST
Ort: Stage 1


In [30]:
import os
import json
import uuid

# Keys that process_json_data() does not copy into the metadata.
excluded_keys = ['title', 'description', 'date', 'location']
# Collects one Document per successfully parsed JSON file.
documents = []

def process_json_data(data, excluded_keys):
    """
    Turn one parsed event record into a Document.

    The content is four labelled lines (Titel, Beschreibung, Datum, Ort).
    'title', 'description' and 'date' must be present in ``data``; a missing
    'location' is replaced by the text 'No location given'.

    :param data: Dict holding the event fields.
    :param excluded_keys: Keys of ``data`` that are not copied into the metadata.
    :return: A Document whose metadata holds every key of ``data`` not in ``excluded_keys``.
    """
    location = data.get('location', 'No location given')
    text_lines = [
        f"Titel: {data['title']}",
        f"Beschreibung: {data['description']}",
        f"Datum: {data['date']}",
        f"Ort: {location}",
    ]
    extra_fields = {
        name: value for name, value in data.items() if name not in excluded_keys
    }
    return Document("\n".join(text_lines), metadata=extra_fields)
    

# Path to the directory containing JSON files
folder_path = '../data/2023/json'

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `documents` (and therefore `documents[0]`) stable across runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):  # Skip everything that is not a JSON file
        continue
    file_path = os.path.join(folder_path, filename)
    # Pin the encoding so non-ASCII text is read the same way on every platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)
        except json.JSONDecodeError:
            # A malformed file is reported and skipped; the rest are still processed.
            print(f"Error decoding JSON from the file {filename}")
            continue
    documents.append(process_json_data(data, excluded_keys))



In [31]:
# Number of Document objects collected by the loading loop.
len(documents)

112

In [1]:
def parse_event_details(event_string):
    """
    Extract the day and the start/end time from an event text.

    Every line containing the label 'Datum:' is parsed; if there are several,
    the last one wins. The first space-separated word after the label is taken
    as the day. A time range such as '11:00 - 11:45 CEST' yields the start and
    end time without the trailing timezone. A hyphen or an en dash is accepted
    as the range separator.

    :param event_string: Event text with one labelled field per line.
    :return: Dict with the keys 'day', 'start_time' and 'end_time'. A missing
        start or end time is the string "None". The dict is empty when no line
        contains 'Datum:'.
    """
    event_details = {}

    for line in event_string.split('\n'):
        if 'Datum:' not in line:
            continue

        # Remove the label 'Datum:' and extra spaces
        date_time_info = line.replace('Datum:', '').strip()

        start_time = "None"
        end_time = "None"

        # With the label gone, the first colon is taken to be the hour/minute
        # separator of the start time.
        colon_index = date_time_info.find(':')
        if colon_index != -1:
            # Step back over up to two hour digits; clamp at 0 so a time at the
            # very start of the string is not mistaken for "no time".
            time_part = date_time_info[max(colon_index - 2, 0):]
            # Treat an en dash like a hyphen, then split start from end once.
            start_part, _, end_part = time_part.replace('\u2013', '-').partition('-')
            # Keep only the first token of each side; this drops the timezone.
            start_tokens = start_part.split()
            end_tokens = end_part.split()
            if start_tokens:
                start_time = start_tokens[0]
            if end_tokens:
                end_time = end_tokens[0]

        event_details['day'] = date_time_info.split(' ')[0]
        event_details['start_time'] = start_time
        event_details['end_time'] = end_time

    return event_details

# Example: the 'Datum:' line carries a date but no time range
event_string_only_day = (
    'Titel: Breakfast for Camp Sleepers\n'
    'Beschreibung: No description found\n'
    'Datum: Friday June 16, 2023\n'
    'Ort: Courtyard'
)

parsed_event_only_day = parse_event_details(event_string_only_day)
print(parsed_event_only_day)


{'day': 'Friday', 'start_time': 'None', 'end_time': 'None'}


In [None]:
# Attach the parsed day/start/end dict to every document under the key "date".
for doc in documents:
    parsed_date = parse_event_details(doc.content)
    doc.add_metadata("date", parsed_date)

In [33]:
# Document.__str__ prints the content followed by the metadata dict.
print(documents[0])

Content: Titel: The Story of Europe – Panel Talk
Beschreibung: Europe Think Factory – Panel Talk:Join us as we're highlighting four speakers from four different ecosystems in the North to discuss the "Story of Europe", to open up about their experience in international collaboration, and to share their overall vision for a better together here in Europe.Europe Think Factory is a think tank working on a new narrative of collaboration for innovation, technology, and startups in Europe. Non-profit, independent, a founders collective.
Datum: Friday June 16, 2023 11:00 - 11:45 CEST
Ort: Stage 1
Metadata: {'speakers': [{'name': 'Fabian Böck', 'role': 'Founder', 'organization': 'Europe Think Factory'}, {'name': 'Erdem Ovacik', 'role': 'Founder and Executive Board Member', 'organization': 'Donkey Republic'}, {'name': 'Elina Åkerlind', 'role': 'CEO and founder', 'organization': 'Nordic Node AB'}, {'name': 'Sakina Turabali', 'role': 'Co-founder/Chief Growth Officer', 'organization': 'CodeEasy'}]

In [36]:
# Each entry is the JSON string of one document (see Document.to_json).
json_documents = [document.to_json() for document in documents]

In [37]:
# `json_documents` is a list of JSON strings, so the file holds a JSON array of
# strings; each element must be decoded again (e.g. Document.from_json) on load.
with open('documents.json', 'w', encoding='utf-8') as file:
    json.dump(json_documents, file)