## Getting data from Wikipedia

In [2]:
# !pip install langchain
# !pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py): started
  Building wheel for wikipedia (setup.py): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11707 sha256=a5f55c1a42866948b32ed018e157249e9f9a1791a21ec228df2e8a3aa5288f46
  Stored in directory: c:\users\jesus\appdata\local\pip\cache\wheels\8f\ab\cb\45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [2]:
# Fetch the Wikipedia pages returned for the 'OpenAI' search query
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
wiki_loader = WikipediaLoader(query='OpenAI')
raw_documents = wiki_loader.load()

In [3]:
# Chunking strategy: chunk length is measured with the tiktoken encoder,
# up to 1000 per chunk with an overlap of 20 between consecutive chunks.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 1000, chunk_overlap = 20
)

# Split the raw Wikipedia documents into chunks
documents = text_splitter.split_documents(raw_documents)

# Drop the 'summary' metadata entry from every chunk. pop() with a default is
# used instead of `del` so a chunk without a 'summary' key does not raise
# KeyError.
for d in documents:
    d.metadata.pop('summary', None)

In [4]:
# Print the 'source' metadata value of every chunk, one per line
for chunk in documents:
    source_url = chunk.metadata['source']
    print(source_url)

https://en.wikipedia.org/wiki/OpenAI
https://en.wikipedia.org/wiki/OpenAI_Codex
https://en.wikipedia.org/wiki/ChatGPT
https://en.wikipedia.org/wiki/OpenAI_Five
https://en.wikipedia.org/wiki/Removal_of_Sam_Altman_from_OpenAI
https://en.wikipedia.org/wiki/Gemini_(chatbot)
https://en.wikipedia.org/wiki/Generative_artificial_intelligence
https://en.wikipedia.org/wiki/Sam_Altman
https://en.wikipedia.org/wiki/Artificial_general_intelligence
https://en.wikipedia.org/wiki/Whisper_(speech_recognition_system)
https://en.wikipedia.org/wiki/Generative_pre-trained_transformer
https://en.wikipedia.org/wiki/GPT-4
https://en.wikipedia.org/wiki/Gemini_(language_model)
https://en.wikipedia.org/wiki/Perplexity.ai
https://en.wikipedia.org/wiki/Greg_Brockman
https://en.wikipedia.org/wiki/GPT-3
https://en.wikipedia.org/wiki/Large_language_model
https://en.wikipedia.org/wiki/AI_boom
https://en.wikipedia.org/wiki/Mira_Murati
https://en.wikipedia.org/wiki/AI_alignment
https://en.wikipedia.org/wiki/Emmett_Shear

## Load news articles

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
import os

directory_path = 'news'

# Initialize one PyPDFLoader per PDF in the directory. os.listdir() returns
# entries in arbitrary order, so the listing is sorted to keep the document
# order stable from one run to the next.
loaders = [
    PyPDFLoader(os.path.join(directory_path, f))
    for f in sorted(os.listdir(directory_path))
    if f.endswith('.pdf')
]

# Load documents from the PDFs
news_docs = []
for loader in loaders:
    news_docs.extend(loader.load())

# Re-wrap each loaded document in a fresh Document whose only metadata is the
# file name. os.path.basename() strips the directory part using the separator
# of the current OS; the previous removeprefix('news/') left the 'news\'
# prefix in place on Windows, where os.path.join() uses a backslash.
news_articles_data = [
    Document(
        page_content=doc.page_content,
        metadata={
            'source': os.path.basename(doc.metadata['source']),
        }
    )
    for doc in news_docs
]


In [22]:
!pip install spacy-llm
!pip install --upgrade jupyter ipywidgets

Collecting spacy-llm
  Obtaining dependency information for spacy-llm from https://files.pythonhosted.org/packages/23/dc/19b155fd5cdc4b2d2306b79043491a11f42df8e71277ac3cad1a5110b94e/spacy_llm-0.7.1-py2.py3-none-any.whl.metadata
  Downloading spacy_llm-0.7.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting spacy<4.0,>=3.5 (from spacy-llm)
  Obtaining dependency information for spacy<4.0,>=3.5 from https://files.pythonhosted.org/packages/92/fb/d1f0605e1e8627226c6c96053fe1632e9a04a3fbcd8b5d715528cb95eb97/spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting confection<1.0.0,>=0.1.3 (from spacy-llm)
  Obtaining dependency information for confection<1.0.0,>=0.1.3 from https://files.pythonhosted.org/packages/39/78/f9d18da7b979a2e6007bfcea2f3c8cc02ed210538ae1ce7e69092aed7b18/confection-0.1.4-py3-none-any.whl.metadata
  Downloading confection-0.1.4-py3-none-any.whl.metadata (19 kB)
Collecting srsly<3.0.0,>=2.4.0 (from con

In [6]:
# Combined corpus: Wikipedia chunks first, then the news article documents
all_data = [*documents, *news_articles_data]

## Summarize the articles to build a relationship-extraction database

In [7]:
from langchain.chains.combine_documents.stuff  import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
load_dotenv()


# Initialize the recursive text splitter (not used further down in this cell)
rtext_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 150)

# Initialize the LLM; temperature 0 for the most repeatable output
llm = ChatOpenAI(temperature = 0, model_name = 'gpt-4')

# MAP
# Prompt applied to every chunk individually. Typos in the instructions were
# corrected so the model receives clean wording.
map_template = """The following is a set of documents:
{all_data}
Based on this list of docs, please write concise summaries while extracting the essential relationships for relationship analysis later.
Please do include the dates of actions or events, which are very important for timeline analysis later.
Example: 'Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, friday)', which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful answer: """
map_promp = PromptTemplate.from_template(map_template)

map_chain = LLMChain(llm=llm,prompt=map_promp)

# Corpus to summarize: news articles first, then the Wikipedia chunks
all_data = news_articles_data + documents

# REDUCE
# Prompt applied to the per-chunk summaries to merge them into one result.
reduce_template = """The following is a set of summaries:
{all_data}
Take these and distill them into concise summaries of the articles while retaining the important relationships and events (including the timeline).
Example: 'Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, friday)', which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful Answer:"""

reduce_promp = PromptTemplate.from_template(reduce_template)

# Chain that runs the reduce prompt
reduce_chain = LLMChain(llm=llm, prompt=reduce_promp)

# Stuffs a list of documents into the '{all_data}' slot of the reduce prompt
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name='all_data'
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # this is the final chain that is called
    combine_documents_chain=combine_documents_chain,
    # used if the documents exceed the context for the StuffDocumentsChain
    collapse_documents_chain=combine_documents_chain,
    # maximum number of tokens to group documents into
    token_max=4000
)

# Combines documents by mapping a chain over them, then combining the results
map_reduce_chain = MapReduceDocumentsChain(
    # map chain
    llm_chain=map_chain,
    # reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # the variable name in the llm_chain to put the documents in
    document_variable_name='all_data',
    # do not return the results of the map steps in the output
    return_intermediate_steps=False
)

# Splitter for the summarization input: length measured with the tiktoken
# encoder, no overlap between chunks.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 1000, chunk_overlap = 0
)

split_docs = text_splitter.split_documents(all_data)

# Run the MapReduce chain
summarization_results = map_reduce_chain.run(split_docs)

  warn_deprecated(


In [8]:
# Display the output of the map-reduce summarization run
summarization_results


"1. On February 17, 2024, Rod Stewart sold his musical rights for an undisclosed amount. \n\n2. Elon Musk reignited his dispute with OpenAI CEO, Sam Altman, on February 16, 2024, over OpenAI's decision to abandon its non-profit model.\n\n3. OpenAI introduced Sora, an AI generative model, on February 17, 2024, acknowledging that the model still faces significant challenges.\n\n4. In August 2024, wildfires in Maui led to Jeff Bezos and his fiancée Lauren Sánchez launching a $100 million fund to help the victims.\n\n5. Jeff Bezos owns a house in Maui and lent his helicopter to the Maui Fire Department for worker transportation in August 2024.\n\n6. Elon Musk launched an independent AI company, xAI, in November 2024, with its first product being an AI chatbot named Grok.\n\n7. As of an unspecified date, Forbes valued Elon Musk at $205.5 billion, making him the second richest person in the world.\n\n8. OpenAI introduced Sora, an AI capable of creating hyper-realistic videos, on February 16,

In [9]:
# Store the summarization results in a text file for future use.
# The encoding is set explicitly: without it open() falls back to the
# platform's locale codec, which may be unable to encode non-ASCII characters
# in the summary and makes the file differ between machines.
with open('summary.txt', 'w', encoding='utf-8') as file:
    file.write(str(summarization_results))

## spaCy

In [11]:
#!pip install spacy-llm



In [12]:
import os 
import json
import spacy
from collections import Counter
from pathlib import Path
from wasabi import msg
from spacy_llm.util import assemble

In [None]:
# Sentence splitting with a traditional (non-LLM) spaCy pipeline.
# Loaded pipelines are kept here so the model is read from disk only once.
_SENTENCE_NLP_CACHE = {}


def split_document_sent(text, nlp=None):
    """Split ``text`` into sentences and return them as stripped strings.

    Args:
        text: The text to segment.
        nlp: Optional callable pipeline that returns an object exposing
            ``.sents``. When omitted, the ``en_core_web_sm`` spaCy model is
            loaded on first use and reused on later calls instead of being
            reloaded every time.

    Returns:
        A list with the whitespace-stripped text of each sentence.
    """
    if nlp is None:
        if 'en_core_web_sm' not in _SENTENCE_NLP_CACHE:
            _SENTENCE_NLP_CACHE['en_core_web_sm'] = spacy.load('en_core_web_sm')
        nlp = _SENTENCE_NLP_CACHE['en_core_web_sm']
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

# Relationship extraction through a spacy-llm pipeline
def process_text(nlp, text, verbose = False):
    """Run ``nlp`` on ``text`` and return the resulting doc.

    With ``verbose`` set, the doc text, its entities (text and label) and
    every relation in ``doc._.rel`` are also reported through ``msg.text``.
    """
    parsed = nlp(text)
    if not verbose:
        return parsed

    msg.text(f'Text:{parsed.text}')
    entity_pairs = [(ent.text, ent.label_) for ent in parsed.ents]
    msg.text(f'Entities: {entity_pairs}')
    msg.text('Relations')
    for rel in parsed._.rel:
        # rel.dep and rel.dest are used as indices into the doc's entities
        head = parsed.ents[rel.dep]
        tail = parsed.ents[rel.dest]
        msg.text(f' - {head} [{rel.relation}] {tail}')
    return parsed

def run_pipeline(config_path, examples_path = None , verbose = False):
    if not os.getenv('OPENAI_API_KEY'):
        msg.fail("OPENAI_API_KEY env variable was not found. Set it and try again", exist = 1)
            
            
            