## Chat Models - Tagging Documents

In [1]:
%pip install langchain langchain_openai nest_asyncio lxml beautifulsoup4 langchain-community --upgrade

Collecting langchain
  Downloading langchain-0.3.1-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.1-py3-none-any.whl.metadata (2.6 kB)
Collecting lxml
  Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-core<0.4.0,>=0.3.6 (from langchain)
  Downloading langchain_core-0.3.6-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.129-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting openai<2.0.0,>=1.40.0 (from langchain_openai)
  Downloading openai-1.50.2-py3-none-a

In [2]:
import getpass
import os

# Only prompt when the key is not already set, so re-running this cell (or
# running the notebook with the variable exported) does not ask again.
# getpass keeps the typed key out of the saved cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

··········


In [3]:
# fixes a bug with asyncio and jupyter
# NOTE(review): presumably needed because the sitemap loader used later fetches
# pages asynchronously while Jupyter is already running an event loop -- confirm.
import nest_asyncio

nest_asyncio.apply()

In [4]:
import pandas as pd

from langchain.chains import create_tagging_chain, create_tagging_chain_pydantic
# SitemapLoader is provided by langchain-community (installed in the first cell);
# the `langchain.document_loaders` path is only a deprecated re-export of it.
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_openai.chat_models import ChatOpenAI



In [5]:
# Load every page listed in the site's sitemap into `docs`.
SITEMAP_URL = "https://understandingdata.com/sitemap.xml"

sitemap_loader = SitemapLoader(web_path=SITEMAP_URL)
# Request-rate setting on the loader (5 per second, going by the attribute name).
sitemap_loader.requests_per_second = 5

docs = sitemap_loader.load()

Fetching pages: 100%|##########| 101/101 [00:03<00:00, 25.71it/s]


In [8]:
# Schema: the three tags to extract from each document, all of them required.
tag_properties = {
    "sentiment": {"type": "string"},
    "aggressiveness": {"type": "integer"},
    "primary_topic": {
        "type": "string",
        "description": "The main topic of the document.",
    },
}
schema = {
    "properties": tag_properties,
    "required": ["primary_topic", "sentiment", "aggressiveness"],
}

# LLM and tagging chain. The chain's result is read back under the 'output' key.
# NOTE(review): create_tagging_chain is marked deprecated in recent LangChain
# releases in favour of `llm.with_structured_output(...)` -- confirm before upgrading.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
chain = create_tagging_chain(schema=schema, llm=llm, output_key="output")

In [9]:
# Run the tagging chain over the documents and collect one tag dict per document.
results = []

# Remove the [:10] slice to run on all documents:
for doc_number, doc in enumerate(docs[:10], start=1):
    print(f"Processing doc {doc_number}")
    tagged = chain.invoke({"input": doc.page_content})
    results.append(tagged["output"])

Processing doc 1
Processing doc 2
Processing doc 3
Processing doc 4
Processing doc 5
Processing doc 6
Processing doc 7
Processing doc 8
Processing doc 9
Processing doc 10


In [10]:
# Inspect the raw tag dictionaries collected by the tagging loop.
results

[{'sentiment': 'positive',
  'aggressiveness': 3,
  'primary_topic': 'Software & Data Engineering'},
 {'sentiment': 'positive', 'aggressiveness': 0, 'primary_topic': 'technology'},
 {'sentiment': 'positive', 'aggressiveness': 0, 'primary_topic': 'Contact'},
 {'sentiment': 'positive',
  'aggressiveness': 2,
  'primary_topic': 'Software & Data Engineering Services'},
 {'sentiment': 'positive',
  'aggressiveness': 0,
  'primary_topic': 'Software & Data Engineering'},
 {'sentiment': 'neutral',
  'aggressiveness': 0,
  'primary_topic': 'Data Engineering'},
 {'sentiment': 'positive',
  'aggressiveness': 0,
  'primary_topic': 'Data Engineering Services'},
 {'sentiment': 'positive',
  'aggressiveness': 3,
  'primary_topic': 'React Software Development'},
 {'sentiment': 'positive',
  'aggressiveness': 0,
  'primary_topic': 'Python software development'},
 {'sentiment': 'positive',
  'aggressiveness': 2,
  'primary_topic': 'Python software development'}]

In [11]:
# One row per tagged document, one column per tag.
df = pd.DataFrame(data=results)

In [12]:
# Display the tagging results as a table.
df

Unnamed: 0,sentiment,aggressiveness,primary_topic
0,positive,3,Software & Data Engineering
1,positive,0,technology
2,positive,0,Contact
3,positive,2,Software & Data Engineering Services
4,positive,0,Software & Data Engineering
5,neutral,0,Data Engineering
6,positive,0,Data Engineering Services
7,positive,3,React Software Development
8,positive,0,Python software development
9,positive,2,Python software development


In [13]:
# Peek at the first document's metadata to see which key holds the page URL.
docs[0].metadata

{'source': 'https://understandingdata.com/',
 'loc': 'https://understandingdata.com/',
 'lastmod': '2024-09-29T20:35:28.914Z',
 'changefreq': 'monthly',
 'priority': '1.0'}

In [14]:
# Combine the URLs with the results.
# Rows in `df` follow the order of `docs`, so take exactly as many documents as
# there are rows instead of a hard-coded 10; this keeps the lengths in step
# when the number of tagged documents changes.
df['url'] = [doc.metadata['source'] for doc in docs[:len(df)]]


In [None]:
# Display the tagged frame with the URL column attached.
# NOTE(review): the saved output for this cell has no execution count and its
# values differ from the table displayed earlier -- re-run to refresh it.
df

Unnamed: 0,sentiment,aggressiveness,primary_topic,url
0,positive,0.5,software engineering,https://understandingdata.com/
1,passionate,0.0,innovative solutions,https://understandingdata.com/about/
2,positive,0.5,contact,https://understandingdata.com/contact/
3,positive,0.5,software & data engineering services,https://understandingdata.com/services/
4,positive,0.5,software and data engineering,https://understandingdata.com/projects/
5,positive,0.5,data engineering,https://understandingdata.com/posts/
6,positive,0.5,data engineering services,https://understandingdata.com/services/data-en...
7,positive,0.5,React Software Development,https://understandingdata.com/services/react-s...
8,positive,0.5,Python software development,https://understandingdata.com/services/python-...
9,positive,0.5,Python software development,https://understandingdata.com/services/python-...
