In [47]:
import weaviate
import json
import io
import os
import re



In [48]:
client = weaviate.Client(
    url = "http://localhost:8080", 
)

In [69]:
# flush the schema and data
client.schema.delete_all()

In [50]:
# helper function
def prettify(json_dict): 
    print(json.dumps(json_dict, indent=2))

In [30]:
document_class_obj = {
    "class": "Document",
        "properties": [
             {"name":"dataset", "dataType": ["string"], "description": "Which dataset the file belongs to"},
             {"name":"path", "dataType": ["string"], "description": "The path to the file"},
        {"name": "title",  "dataType": ["string"],  "description": "The title of the document"},
        {"name": "date", "dataType": ["string"], "description": "The date of the document"},
        { "name": "content",
            "dataType": ["text"],
            "description": "The content of the document"},
    ],
    "vectorizer": "text2vec-transformers",  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    "moduleConfig": {
        "text2vec-transformers": {
            "model": "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # Can be any public or private Hugging Face model.
            "options": {
                "waitForModel": True
            }
        }
    }
}

In [31]:
client.schema.create_class(document_class_obj)

UnexpectedStatusCodeException: Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Document" already exists'}]}.

In [70]:
import uuid

def create_weaviate_schema(client):
    """
    ...
    """

    # flush the schema and data
    client.schema.delete_all()
    # create schema
    schema = {
        "classes": [
            {
                "class": "Article",
                "description": "An IETF RFC article with a title and date",
                "vectorizer": "none",
                "vectorIndexConfig": {
                    "skip": True
                },
                "properties": [
                    {
                        "dataType": [
                            "string"
                        ],
                        "description": "Title of the article",
                        "name": "title",
                        "indexInverted": True
                    },
                    {
                        "dataType": [
                            "string"
                        ],
                        "description": "Canonical URL for the article",
                        "name": "url",
                        "indexInverted": True
                        "moduleConfig": {
                            "text2vec-transformers": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    # {
                    #     "dataType": [
                    #         "date"
                    #     ],
                    #     "description": "Title of the article",
                    #     "name": "date",
                    #     "indexInverted": True
                    # }, 
                    {
                        "dataType": [
                            "Paragraph"
                        ],
                        "description": "List of paragraphs this article has",
                        "name": "hasParagraphs",
                        "indexInverted": True
                    },
                ]
            },
            {
                "class": "Paragraph",
                "description": "An article paragraph",
                "vectorIndexConfig": {
                    "vectorCacheMaxObjects": 150000000000,
                    "ef": 256,
                    "efConstruction": 512,
                    "maxConnections": 128
                },
                "properties": [
                    {
                        "dataType": [
                            "string"
                        ],
                        "description": "Title of the paragraph",
                        "name": "title",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-transformers": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": [
                            "text"
                        ],
                        "description": "The content of the paragraph",
                        "name": "content",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-transformers": {
                                "skip": False,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": [
                            "int"
                        ],
                        "description": "Order of the paragraph",
                        "name": "order",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-transformers": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": [
                            "Article"
                        ],
                        "description": "Article this paragraph is in",
                        "name": "inArticle",
                        "moduleConfig": {
                            "text2vec-transformers": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    }
                ]
            }
        ]
    }
    #
    # add schema
    #
    client.schema.create(schema)


create_weaviate_schema(client)

In [53]:

def add_article_to_batch(parsed_line):
    return [
        {
            "title": parsed_line["title"]
        },
        "Article",
        str(uuid.uuid3(uuid.NAMESPACE_DNS, parsed_line["title"].replace(" ", "_")))
    ]


def add_paragraph_to_batch(parsed_line):
    return_array = []
    for paragraph in parsed_line["paragraphs"]:
        add_object = {    
            "content": paragraph["content"],
            "order": paragraph["count"],
            "inArticle": [{
                "beacon": "weaviate://localhost/" + str(uuid.uuid3(uuid.NAMESPACE_DNS, parsed_line["title"].replace(" ", "_")))
            }]
        }
        if "title" in paragraph:
            # Skip if wiki paragraph
            if ":" in paragraph["title"]:
                continue
            add_object["title"] = paragraph["title"]
        # add to batch
        return_array.append([
            add_object,
            "Paragraph",
            str(uuid.uuid3(uuid.NAMESPACE_DNS, parsed_line["title"].replace(" ", "_") + str(paragraph["count"])))
        ])
    return return_array


def handle_results(results):
    if results is not None:
        for result in results:
            if 'result' in result and 'errors' in result['result'] and  'error' in result['result']['errors']:
                for message in result['result']['errors']['error']:
                    print('DEBUG: ', message['message'])

In [67]:
prettify(client.schema.get())

{
  "classes": [
    {
      "class": "Article",
      "description": "An IETF RFC article with a title and date",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "description": "Title of the article",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "title",
          "tokenization": "whitespace"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "Canonical path to the article",
          "indexFilterable": false,
          "indexSearchable": false,
          "moduleConfig": {
            "text2vec-transformers": {
              "skip": true,
              "vectorizePropertyName": false
  

In [55]:
def get_content(plain_text_rfc_file):
    # Initialize variables
    sw = 1
    b_count = 0
    b_sw = False
    last_indent = 0

    output = io.StringIO()
    for line in plain_text_rfc_file:
        stripped_line = line.strip()

        # Count blank lines but don't write them
        if stripped_line == '':
            if sw > 0: 
                sw = 0
            if sw == -1:
                sw = -2  # The line following the footer has a FF char which in Python3 is a line break.
            else:
                b_count += 1
        else: # Handle a non-blank line
            if sw == 0: 
                sw = 1
            if sw < 0: 
                sw -= 1 # If we're between pages, count footer/dashes/header
            if sw <= -4: # If we're at the header, resume printing
                this_indent = len(line) - len(line.lstrip())
                if b_sw or this_indent < last_indent: 
                    print('', file=output) # Print a blank line, if needed
                    pass
                sw = 1
                b_count = 0
                b_sw = False

            if re.search(r'\[Page [0-9]+\] *$', stripped_line): # Found the footer:
                sw = -1 # Stop output
                b_sw = b_count > 3 # true = print blank line when resuming output
            elif sw > 0:
                # Print a blank line if the previous line(s) was/were blank
                if b_count: 
                    print('', file=output)
                    pass
                b_count = 0
                print(line, file=output, end='')
                last_indent = len(line) - len(line.lstrip())
    contents = output.getvalue()
    output.close()
    return contents

In [56]:
def get_lines(plain_text_rfc_file):
    # Initialize variables
    sw = 1
    b_count = 0
    b_sw = False
    last_indent = 0

    lines = []
    for line in plain_text_rfc_file:
        stripped_line = line.strip()

        # Count blank lines but don't write them
        if stripped_line == '':
            if sw > 0: 
                sw = 0
            if sw == -1:
                sw = -2  # The line following the footer has a FF char which in Python3 is a line break.
            else:
                b_count += 1
        else: # Handle a non-blank line
            if sw == 0: 
                sw = 1
            if sw < 0: 
                sw -= 1 # If we're between pages, count footer/dashes/header
            if sw <= -4: # If we're at the header, resume printing
                this_indent = len(line) - len(line.lstrip())
                if b_sw or this_indent < last_indent: 
                    lines.append('\n')
                sw = 1
                b_count = 0
                b_sw = False

            if re.search(r'\[Page [0-9]+\] *$', stripped_line): # Found the footer:
                sw = -1 # Stop output
                b_sw = b_count > 3 # true = print blank line when resuming output
            elif sw > 0:
                # Print a blank line if the previous line(s) was/were blank
                if b_count: 
                    lines.append('\n')
                b_count = 0
                lines.append(line)
                last_indent = len(line) - len(line.lstrip())
    return lines

In [58]:
import os
directory_path = '/Users/jim/Projects/wiki3/ipfs/QmNvTjdqEPjZVWCvRWsFJA1vK7TTw1g9JP6we1WBJTRADM/rfc-data'
filename = 'rfc5925.txt'
with open(os.path.join(directory_path, filename), 'r') as file:
    content = get_content(file)
    print(content)


Internet Engineering Task Force (IETF)                          J. Touch
Request for Comments: 5925                                       USC/ISI
Obsoletes: 2385                                                A. Mankin
Category: Standards Track                            Johns Hopkins Univ.
ISSN: 2070-1721                                                R. Bonica
                                                        Juniper Networks
                                                               June 2010

                     The TCP Authentication Option

Abstract

   This document specifies the TCP Authentication Option (TCP-AO), which
   obsoletes the TCP MD5 Signature option of RFC 2385 (TCP MD5).  TCP-AO
   specifies the use of stronger Message Authentication Codes (MACs),
   protects against replays even for long-lived TCP connections, and
   provides more details on the association of security with TCP
   connections than TCP MD5.  TCP-AO is compatible with either a static
   

In [60]:
def parse_header(header_line):
    pattern = re.compile(r'^\x0c?(RFC \d+[a-z]?)(?: -\s+)?\s{2,}(.*)\s{2,}(.*)$')
    match = pattern.match(header_line)

    if match:
        return {'label':match.group(1), 'title':match.group(2).rstrip(), 'date':match.group(3)}

    return None

def extract_header(file):
    pattern_never_issued = re.compile(r'^rfc .* never issued', re.IGNORECASE)
    header = None
    left_margin = 0
    right_margin = 0
    for line in file:
        if line.startswith('\x0c'):  # End of page, break the loop
            break
                    # Find the widest common leading whitespace (left margin)
        left_margin = min(left_margin, len(line) - len(line.lstrip()))
                    # Find the widest common trailing whitespace (right margin)
        right_margin = min(right_margin, len(line) - len(line.rstrip()))
                
                # Reset the file pointer
    file.seek(0)
    is_first_line = True
    for line in file:
        # Trim margins only if they contain only whitespace
        line = line[left_margin:] if line[:left_margin].isspace() else line
        line = line[:-right_margin] if right_margin > 0 and line[-right_margin:].isspace() else line

        # If the line matches "RFC .... never issued ..."
        if is_first_line:
            is_first_line = False
            if pattern_never_issued.match(line):
                print(f"{filename} contains a never issued RFC: {line.strip()}")
                break
                    
        header = parse_header(line)
        if header:
            # If the title is missing, use the next line as the title
            if not header['title']:
                header = (header[0], next(file, '').strip(), header[2])
            # print(f"File: {filename}, Header: {header}")
            break

    return header

In [61]:
with open(os.path.join(directory_path, filename), 'r') as file:
    print(extract_header(file))
    lines = get_lines(file)
    # print(os.linesep.join(lines[:10]))
    print(f'lines: {len(lines)}')
    print(''.join(lines[:15]))

{'label': 'RFC 5925', 'title': 'The TCP Authentication Option', 'date': 'June 2010'}
lines: 2103

Copyright Notice

   Copyright (c) 2010 IETF Trust and the persons identified as the
   document authors.  All rights reserved.

   This document is subject to BCP 78 and the IETF Trust's Legal
   Provisions Relating to IETF Documents
   (http://trustee.ietf.org/license-info) in effect on the date of
   publication of this document.  Please review these documents
   carefully, as they describe your rights and restrictions with respect
   to this document.  Code Components extracted from this document must
   include Simplified BSD License text as described in Section 4.e of
   the Trust Legal Provisions and are provided without warranty as
   described in the Simplified BSD License.



In [74]:
import uuid

def put_article(document):
    doc_id = str(uuid.uuid3(uuid.NAMESPACE_URL, document['url']))
    # validate the object
    result = client.data_object.validate(
        data_object=document,
        class_name='Article',
        uuid=doc_id
    )
    prettify(result)
    if not result['valid']:
        print(f"Document not valid error: {result['error']}")
        return
    id = client.data_object.create(document, "Article")
    print(f"Document created with id: {id}")

def put_paragraphs(document, content):
    doc_id = str(uuid.uuid3(uuid.NAMESPACE_URL, document['url']))
    id = client.data_object.create(document, "Paragraph")


def ingest_rfc_file(directory_path, path):
    with open(os.path.join(directory_path, path), 'r') as f:
        lines = get_lines(f)
        stripped_lines = [line.rstrip() for line in lines]
        content = '\n'.join(stripped_lines[0:100])
        header = extract_header(f)
        if header:
            document = {
                # "dataset": "rfc",
                "url":  f"https://www.rfc-editor.org/rfc/{path}",
                "title": header['label'] + ': ' + header['title'],
                # "date": header['date'],
                # "content": content
            }
            prettify(document)
            put_article(document)
            put_paragraphs(document, content)


In [16]:
filename_list = [filename for filename in os.listdir(directory_path) if filename.endswith('.txt')]
filename_list[0:10]

['rfc7698.txt',
 'rfc6586.txt',
 'rfc7840.txt',
 'rfc2938.txt',
 'rfc4391.txt',
 'rfc3398.txt',
 'rfc5931.txt',
 'rfc2086.txt',
 'rfc5925.txt',
 'rfc2092.txt']

In [80]:
result = (
  client.query
    .get("Article", ['url', 'title'])
    .do()
)

prettify(result)

{
  "data": {
    "Get": {
      "Article": [
        {
          "title": "RFC 5931: EAP Password",
          "url": "https://www.rfc-editor.org/rfc/rfc5931.txt"
        },
        {
          "title": "EAP Password",
          "url": "https://www.rfc-editor.org/rfc/rfc5931.txt"
        },
        {
          "title": "EAP Password",
          "url": "https://www.rfc-editor.org/rfc/rfc5931.txt"
        }
      ]
    }
  }
}


In [89]:
where_filter = {
  "path": ["url"],
  "operator": "Equal",
  "valueText": 'https://www.rfc-editor.org/rfc/rfc5931.txt'
}

result = (
  client.query
    .get("Article", ['url', 'title'])
    .with_where(where_filter)
    .do()
)

prettify(result)

{
  "data": {
    "Get": {
      "Article": null
    }
  },
  "errors": [
    {
      "locations": [
        {
          "column": 6,
          "line": 1
        }
      ],
      "message": "Filtering by property 'url' requires inverted index. Is `indexFilterable` option of property 'url' enabled? Set it to `true` or leave empty",
      "path": [
        "Get",
        "Article"
      ]
    }
  ]
}


In [75]:
ingest_rfc_file(directory_path, 'rfc5931.txt')

{
  "url": "https://www.rfc-editor.org/rfc/rfc5931.txt",
  "title": "RFC 5931: EAP Password"
}
{
  "error": null,
  "valid": true
}
Document created with id: 69c8219f-0522-40bd-822f-2a31911881b6


In [None]:
for filename in filename_list:
    try:
        ingest_rfc_file(directory_path, filename)
    except Exception as e:
        print(f"Error reading {filename}: {str(e)}")
