In [30]:
# download libraries and packages
!pip install gensim
!pip install python-docx



In [31]:
# import libraries and packages
import pandas as pd
import numpy as np
import gensim
from docx import Document

In [32]:
# import data (text file)
file_name = "Sample Data for Evaluation.docx"
doc = Document(file_name)

# extract text from doc object
text = "\n".join([paragraph.text for paragraph in doc.paragraphs])

In [33]:
# remove unwanted text #
import re

# remove emails
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

# remove URLs
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

# normalize white space
text = re.sub(r'\s+', ' ', text).strip()

In [34]:
# text tokenization #
from gensim.utils import simple_preprocess

# tokenize the text
tokens = simple_preprocess(text)

In [35]:
# stop words removal #
from nltk.corpus import stopwords
import nltk

# load built-in stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# remove stop words
tokens = [token for token in tokens if token not in stop_words]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# lemmatization #
!python -m spacy download en_core_web_md
import spacy

# load spacey model
nlp = spacy.load('en_core_web_md')

# lemmatize using spacey
text = ' '.join(tokens)
doc = nlp(text)
tokens = [token.lemma_ for token in doc]

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [37]:
print(text)

unacceptable risk scenario social scoring system description government implements ai system assigns scores citizens based social behavior financial status online activities risk could lead discrimination loss privacy social exclusion making unacceptable risk eu ai act high risk scenario ai healthcare description hospital uses ai system diagnose diseases recommend treatments based patient data risk incorrect diagnoses treatment recommendations could serious health consequences therefore ai system subject strict regulations oversight scenario autonomous vehicles description company deploys self driving cars public roads risk malfunctions errors ai system could lead accidents posing significant safety risks systems must comply rigorous safety standards limited risk scenario customer service chatbots description commerce website uses ai powered chatbots handle customer inquiries support risk risk lower transparency required inform users interacting ai ensuring ethical use scenario recomme

In [38]:
text_tokens = text.split()

print(tokens)

['unacceptable', 'risk', 'scenario', 'social', 'scoring', 'system', 'description', 'government', 'implement', 'ai', 'system', 'assign', 'score', 'citizen', 'base', 'social', 'behavior', 'financial', 'status', 'online', 'activity', 'risk', 'could', 'lead', 'discrimination', 'loss', 'privacy', 'social', 'exclusion', 'make', 'unacceptable', 'risk', 'eu', 'ai', 'act', 'high', 'risk', 'scenario', 'ai', 'healthcare', 'description', 'hospital', 'use', 'ai', 'system', 'diagnose', 'disease', 'recommend', 'treatment', 'base', 'patient', 'datum', 'risk', 'incorrect', 'diagnosis', 'treatment', 'recommendation', 'could', 'serious', 'health', 'consequence', 'therefore', 'ai', 'system', 'subject', 'strict', 'regulation', 'oversight', 'scenario', 'autonomous', 'vehicle', 'description', 'company', 'deploy', 'self', 'drive', 'car', 'public', 'road', 'risk', 'malfunction', 'error', 'ai', 'system', 'could', 'lead', 'accident', 'pose', 'significant', 'safety', 'risk', 'system', 'must', 'comply', 'rigorous'

In [39]:
def parse_tokens(text_tokens):
    structured_data = []
    current_dict = {}
    i = 0
    risk_levels = ["unacceptable", "high", "limited", "minimal"]

    while i < len(text_tokens):
        if text_tokens[i] in risk_levels and text_tokens[i+1] == "risk":
            # If a dictionary has been started, finalize it and start a new one
            if current_dict and current_dict.get("scenario"):
                structured_data.append(current_dict)
            # Initialize a new dictionary
            current_dict = {
                "risk_level": f"{text_tokens[i]} risk".capitalize(),
                "scenario": "",
                "description": [],
                "risk": []
            }
            i += 2
        elif text_tokens[i] == "scenario":
            i += 1
            scenario = []
            while i < len(text_tokens) and text_tokens[i] != "description":
                scenario.append(text_tokens[i])
                i += 1
            current_dict["scenario"] = " ".join(scenario).capitalize()
        elif text_tokens[i] == "description":
            i += 1
            description = []
            while i < len(text_tokens) and text_tokens[i] != "risk" and text_tokens[i] != "scenario":
                description.append(text_tokens[i])
                i += 1
            # Assign description words to a list
            current_dict["description"] = description
        elif text_tokens[i] == "risk":
            i += 1
            risk = []
            while i < len(text_tokens) and text_tokens[i] not in risk_levels and text_tokens[i] != "scenario":
                risk.append(text_tokens[i])
                i += 1
            # Append risk words to the list
            if current_dict.get("risk"):
                current_dict["risk"].extend(risk)
            else:
                current_dict["risk"] = risk
        else:
            i += 1

    # Final check to append the last dictionary if it's fully populated
    if current_dict and current_dict.get("scenario"):
        structured_data.append(current_dict)

    return structured_data

# Parse tokens into structured data
structured_data = parse_tokens(text_tokens)

# Display the structured data
for item in structured_data:
    print(item)

{'risk_level': 'Unacceptable risk', 'scenario': 'Social scoring system', 'description': ['government', 'implements', 'ai', 'system', 'assigns', 'scores', 'citizens', 'based', 'social', 'behavior', 'financial', 'status', 'online', 'activities'], 'risk': ['could', 'lead', 'discrimination', 'loss', 'privacy', 'social', 'exclusion', 'making']}
{'risk_level': 'High risk', 'scenario': 'Autonomous vehicles', 'description': ['company', 'deploys', 'self', 'driving', 'cars', 'public', 'roads'], 'risk': ['incorrect', 'diagnoses', 'treatment', 'recommendations', 'could', 'serious', 'health', 'consequences', 'therefore', 'ai', 'system', 'subject', 'strict', 'regulations', 'oversight', 'malfunctions', 'errors', 'ai', 'system', 'could', 'lead', 'accidents', 'posing', 'significant', 'safety', 'risks', 'systems', 'must', 'comply', 'rigorous', 'safety', 'standards']}
{'risk_level': 'Limited risk', 'scenario': 'Recommendation systems', 'description': ['streaming', 'service', 'uses', 'ai', 'suggest', 'mov

In [40]:
# update this once we get key

# import os
# import getpass
# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [41]:
# install libraries
!pip install -U llama-index
!pip install elasticsearch
from llama_index.core import Document, GPTVectorStoreIndex, StorageContext
from elasticsearch import Elasticsearch, helpers



In [52]:
# connects to Elasticsearch instance
# I made my own instance for testing purposes - Cindy
es = Elasticsearch("https://e05734aec42640f4b38e62a3286e0fb8.centralus.azure.elastic-cloud.com:443",
                   api_key=("jzMVoZEBTnwe5alvuY6b", "cIfUCPp_RcurKI_x_Wghmw"))
print(es.info())
documents = [{
                "_op_type": "index",
                "_index": "risk_levels",
                "_source": doc} for doc in structured_data
]

# indexing all documents
success, errors = helpers.bulk(es, documents)

print(f"Successfully indexed {success} documents.")
if errors:
    print(f"Errors occurred during indexing: {errors}")

{'name': 'instance-0000000000', 'cluster_name': 'e05734aec42640f4b38e62a3286e0fb8', 'cluster_uuid': 'ZrYCZbUHRheAs71RbRbOow', 'version': {'number': '8.15.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179', 'build_date': '2024-08-05T10:05:34.233336849Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
Successfully indexed 4 documents.


In [59]:
# creates a Document object and a list of Document objects
lamma_documents = [Document(text = str(doc)) for doc in structured_data]

# making vector store
storage_context = StorageContext.from_defaults(vector_store = es)

# creates the index from documents
# throws an error since we don't have access to openai
index = GPTVectorStoreIndex.from_documents(lamma_documents, storage_context = storage_context)


# query documents
query_engine = index.as_query_engine()
response = query_engine.query("What are the risks associated with AI?")
print(response)


ValueError: 
******
Could not load OpenAI embedding model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

Consider using embed_model='local'.
Visit our documentation for more embedding options: https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings.html#modules
******