In [None]:
#Run this if you are running the program for the first time
!pip install nomic
!pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain
!pip install -U langchain-anthropic

In [1]:
from langchain_nomic.embeddings import NomicEmbeddings
import os
from dotenv import load_dotenv
_ = load_dotenv()

In [2]:
# Convert data into text functions
import anthropic
from langchain_anthropic import ChatAnthropic
import os

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_nomic import NomicEmbeddings
from langchain_nomic.embeddings import NomicEmbeddings

from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# PDF files passed to load_pdf().
pdf_list = [
    "files/Style.pdf",
    "files/DEI.pdf",
    "files/34th.pdf",
    "files/sports.pdf",
]

# Web pages passed to load_url(): articles on URL slugs, SEO writing,
# SEO + email marketing, and email subject lines.
urls = [
    "https://yoast.com/slug/",
    "https://www.semrush.com/blog/what-is-a-url-slug/?kw=&cmp=US_SRCH_DSA_Blog_EN&label=dsa_pagefeed&Network=g&Device=c&kwid=dsa-2185834088336&cmpid=18348486859&agpid=156019556762&BU=Core&extid=97592280163&adpos=",
    "https://www.upwork.com/resources/how-to-write-seo-content",
    "https://authorservices.wiley.com/author-resources/Journal-Authors/Prepare/writing-for-seo.html",
    "https://www.semrush.com/blog/seo-writing/",
    "https://www.semrush.com/kb/839-how-to-write-seo-articles-four-steps",
    "https://www.flowmatters.com/blog/a-practical-guide-on-how-to-write-seo-articles/",
    "https://www.maropost.com/how-to-combine-seo-and-email-marketing-for-better-rankings/",
    "https://www.webfx.com/seo/learn/email-marketing-tips-to-improve-seo/",
    "https://sendgrid.com/en-us/blog/seo-and-email-marketing",
    "https://www.emailonacid.com/blog/article/email-marketing/seo-connections/",
    "https://coalitiontechnologies.com/blog/strategic-seo-tips-for-email-marketing",
    "https://optinmonster.com/101-email-subject-lines-your-subscribers-cant-resist/",
    "https://www.wordstream.com/blog/ws/2014/03/31/email-subject-lines",
    "https://www.constantcontact.com/blog/good-email-subject-lines/",
    "https://blog.hubspot.com/marketing/best-email-subject-lines-list",
]

def load_csv(csv):
    """Load the CSV file at path ``csv`` with ``CSVLoader`` and return what it loads."""
    return CSVLoader(file_path=csv).load()

def load_url(url_list):
    """Fetch every URL in ``url_list`` with ``WebBaseLoader``.

    Each URL is loaded on its own; the per-URL results are concatenated,
    in input order, into one flat list that is returned.
    """
    return [page for url in url_list for page in WebBaseLoader(url).load()]

def load_pdf(pdf_list):
    """Load every PDF path in ``pdf_list`` with ``PyPDFLoader``.

    Each file is loaded on its own; the per-file results are concatenated,
    in input order, into one flat list that is returned.
    """
    return [page for pdf in pdf_list for page in PyPDFLoader(pdf).load()]

# Load the three raw corpora: the stats CSV, the web pages and the PDFs.
data = load_csv(csv="files/merged_stats.csv")
docs_list = load_url(url_list=urls)
pdfs_list = load_pdf(pdf_list=pdf_list)

#Splitting
def splitter(data, docs_list, pdfs_list, chunk_size=7500, chunk_overlap=100):
    """Split the three document collections into chunks.

    A single ``CharacterTextSplitter`` built with ``from_tiktoken_encoder`` is
    applied to each collection in turn (the three splitters used previously
    were configured identically).

    Args:
        data: Documents loaded from the CSV file.
        docs_list: Documents loaded from the web pages.
        pdfs_list: Documents loaded from the PDF files.
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 7500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        A tuple ``(doc_splits, url_splits, pdf_splits)`` with the chunks for
        ``data``, ``docs_list`` and ``pdfs_list`` respectively.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    doc_splits = text_splitter.split_documents(data)
    url_splits = text_splitter.split_documents(docs_list)
    pdf_splits = text_splitter.split_documents(pdfs_list)

    return doc_splits, url_splits, pdf_splits

doc_splits, url_splits, pdf_splits = splitter(data, docs_list, pdfs_list)

# One embedding client, reused by all three stores (same model as before).
embedding_model = NomicEmbeddings(model="nomic-embed-text-v1")

# Each corpus gets its OWN collection name. Previously all three stores used
# "rag-chroma"; with Chroma's default in-process client that name resolves to a
# single shared collection, so every retriever searched the CSV, web and PDF
# chunks mixed together instead of only its own corpus.
# NOTE(review): re-running this cell in the same kernel presumably appends the
# same chunks to the existing collections again — restart the kernel to rebuild.

# Vector DB for the article CSV
csv_vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma-csv",
    embedding=embedding_model,
)
csv_retriever = csv_vectorstore.as_retriever()

# Vector DB for the SEO web pages
url_vectorstore = Chroma.from_documents(
    documents=url_splits,
    collection_name="rag-chroma-seo",
    embedding=embedding_model,
)
url_retriever = url_vectorstore.as_retriever()

# Vector DB for the writing-style PDFs
pdf_vectorstore = Chroma.from_documents(
    documents=pdf_splits,
    collection_name="rag-chroma-style",
    embedding=embedding_model,
)
pdf_retriever = pdf_vectorstore.as_retriever()

# Prompt text for the chain. It has four placeholders: {context}, {context1},
# {context2} and {question}. The chain built later in this cell fills them from
# csv_retriever, url_retriever, pdf_retriever and the raw input respectively.
# The string is not raw, so the "\n\n" before "Assistant:" is two real newlines.
template = """ Pretend you are an SEO expert for a company called "The Daily Pennsylvanian" that does journalism for the University of Pennsylvania.  
Answer the question based only on the following context: 

Information about previous articles as well as their performance metrics can be found through: {context}

Information about SEO Optimization can be found through: {context1}

The Daily Pennsylvanian writing style guide and tips can be found through: {context2}

Question: Output 3 potential URL Slugs and SEO titles based on the provided Drafted Title and Content 
make sure that the URL Slug is in the correct format that a URL Slug should be and that 
the SEO title is search engine optimized and concise. DO NOT ASSUME ANY INFORMATION, make the title based ONLY on the information told in the question {question}
\n\nAssistant:
"""
# Wrap the text in a chat prompt template so it can be piped into the model.
prompt = ChatPromptTemplate.from_template(template)

# Remote LLM (Anthropic API), not a local model.
# Use the dated model id: the bare "claude-3-opus" used previously is not a
# published model name, while "claude-3-opus-20240229" is the id the later
# cells of this notebook call successfully.
# NOTE(review): no api_key is passed here, so the key is presumably read from
# the ANTHROPIC_API_KEY environment variable (loaded by load_dotenv()) — confirm.
llm_name = "claude-3-opus-20240229"
model_remote = ChatAnthropic(model=llm_name)

# Chain:
# the input string goes to each of the three retrievers (separate objects, one
# per corpus) and, unchanged, into "question"; the results fill the prompt's
# {context}, {context1}, {context2} and {question} placeholders; the prompt is
# sent to the model and the reply is parsed to a plain string.
chain = (
    {
        "context": csv_retriever,
        "context1": url_retriever,
        "context2": pdf_retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_remote
    | StrOutputParser()
)

In [97]:
# Convert data into text functions
import anthropic
from langchain_anthropic import ChatAnthropic
import os

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_nomic import NomicEmbeddings
from langchain_nomic.embeddings import NomicEmbeddings

from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# PDF files passed to load_pdf() (paths relative to the working directory).
pdf_list = [
    "Style.pdf",
    "DEI.pdf",
    "34th.pdf",
    "sports.pdf",
]

# Web pages passed to load_url(): articles on URL slugs, SEO writing,
# SEO + email marketing, and email subject lines.
urls = [
    "https://yoast.com/slug/",
    "https://www.semrush.com/blog/what-is-a-url-slug/?kw=&cmp=US_SRCH_DSA_Blog_EN&label=dsa_pagefeed&Network=g&Device=c&kwid=dsa-2185834088336&cmpid=18348486859&agpid=156019556762&BU=Core&extid=97592280163&adpos=",
    "https://www.upwork.com/resources/how-to-write-seo-content",
    "https://authorservices.wiley.com/author-resources/Journal-Authors/Prepare/writing-for-seo.html",
    "https://www.semrush.com/blog/seo-writing/",
    "https://www.semrush.com/kb/839-how-to-write-seo-articles-four-steps",
    "https://www.flowmatters.com/blog/a-practical-guide-on-how-to-write-seo-articles/",
    "https://www.maropost.com/how-to-combine-seo-and-email-marketing-for-better-rankings/",
    "https://www.webfx.com/seo/learn/email-marketing-tips-to-improve-seo/",
    "https://sendgrid.com/en-us/blog/seo-and-email-marketing",
    "https://www.emailonacid.com/blog/article/email-marketing/seo-connections/",
    "https://coalitiontechnologies.com/blog/strategic-seo-tips-for-email-marketing",
    "https://optinmonster.com/101-email-subject-lines-your-subscribers-cant-resist/",
    "https://www.wordstream.com/blog/ws/2014/03/31/email-subject-lines",
    "https://www.constantcontact.com/blog/good-email-subject-lines/",
    "https://blog.hubspot.com/marketing/best-email-subject-lines-list",
]

def load_csv(csv):
    """Return the result of loading the CSV file at path ``csv`` via ``CSVLoader``."""
    csv_loader = CSVLoader(file_path=csv)
    return csv_loader.load()

def load_url(url_list):
    """Load each URL in ``url_list`` with ``WebBaseLoader`` and return one flat list.

    Results keep the order of ``url_list``; everything a single URL yields
    stays contiguous in the returned list.
    """
    docs_list = []
    for url in url_list:
        docs_list.extend(WebBaseLoader(url).load())
    return docs_list

def load_pdf(pdf_list):
    """Load each path in ``pdf_list`` with ``PyPDFLoader`` and return one flat list.

    Results keep the order of ``pdf_list``; everything a single file yields
    stays contiguous in the returned list.
    """
    pdfs_list = []
    for pdf in pdf_list:
        pdfs_list.extend(PyPDFLoader(pdf).load())
    return pdfs_list

# Load the three raw corpora: the stats CSV, the web pages and the PDFs.
data = load_csv(csv="merged_stats.csv")
docs_list = load_url(url_list=urls)
pdfs_list = load_pdf(pdf_list=pdf_list)

#Splitting
def splitter(data, docs_list, pdfs_list, chunk_size=7500, chunk_overlap=100):
    """Split the three document collections into chunks.

    One ``CharacterTextSplitter`` (created via ``from_tiktoken_encoder``) is
    reused for all three inputs; the three separate splitters used before had
    identical settings.

    Args:
        data: Documents loaded from the CSV file.
        docs_list: Documents loaded from the web pages.
        pdfs_list: Documents loaded from the PDF files.
        chunk_size: ``chunk_size`` for the splitter (default 7500, the value
            that was hard-coded before).
        chunk_overlap: ``chunk_overlap`` for the splitter (default 100, the
            value that was hard-coded before).

    Returns:
        ``(doc_splits, url_splits, pdf_splits)`` — the chunks produced from
        ``data``, ``docs_list`` and ``pdfs_list``, in that order.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    doc_splits = text_splitter.split_documents(data)
    url_splits = text_splitter.split_documents(docs_list)
    pdf_splits = text_splitter.split_documents(pdfs_list)

    return doc_splits, url_splits, pdf_splits

doc_splits, url_splits, pdf_splits = splitter(data, docs_list, pdfs_list)

# One embedding client, reused by all three stores (same model as before).
embedding_model = NomicEmbeddings(model="nomic-embed-text-v1")

# Each corpus gets its OWN collection name. Previously all three stores used
# "rag-chroma"; with Chroma's default in-process client that name resolves to a
# single shared collection, so every retriever searched the CSV, web and PDF
# chunks mixed together instead of only its own corpus.
# NOTE(review): re-running this cell in the same kernel presumably appends the
# same chunks to the existing collections again — restart the kernel to rebuild.

# Vector DB for the article CSV
csv_vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma-csv",
    embedding=embedding_model,
)
csv_retriever = csv_vectorstore.as_retriever()

# Vector DB for the SEO web pages
url_vectorstore = Chroma.from_documents(
    documents=url_splits,
    collection_name="rag-chroma-seo",
    embedding=embedding_model,
)
url_retriever = url_vectorstore.as_retriever()

# Vector DB for the writing-style PDFs
pdf_vectorstore = Chroma.from_documents(
    documents=pdf_splits,
    collection_name="rag-chroma-style",
    embedding=embedding_model,
)
pdf_retriever = pdf_vectorstore.as_retriever()

# Prompt text for the chain. It has four placeholders: {context}, {context1},
# {context2} and {question}. The chain built later in this cell fills them from
# csv_retriever, url_retriever, pdf_retriever and the raw input respectively.
# The string is not raw, so the "\n\n" before "Assistant:" is two real newlines.
template = """ Pretend you are an SEO expert for a company called "The Daily Pennsylvanian" that does journalism for the University of Pennsylvania.  
Answer the question based only on the following context: 

Information about previous articles as well as their performance metrics can be found through: {context}

Information about SEO Optimization can be found through: {context1}

The Daily Pennsylvanian writing style guide and tips can be found through: {context2}

Question: Output 3 potential URL Slugs and SEO titles based on the provided Drafted Title and Content 
make sure that the URL Slug is in the correct format that a URL Slug should be and that 
the SEO title is search engine optimized and concise. DO NOT ASSUME ANY INFORMATION, make the title based ONLY on the information told in the question {question}
\n\nAssistant:
"""
# Wrap the text in a chat prompt template so it can be piped into the model.
prompt = ChatPromptTemplate.from_template(template)

# Remote LLM (Anthropic API), not a local model.
# llm_name now holds the dated model id that is actually sent to the API;
# before, it was set to "claude-3-opus" and then never used.
llm_name = "claude-3-opus-20240229"
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    # Fail here with a clear message instead of deep inside the client.
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it or add it to the .env file."
    )

model_remote = ChatAnthropic(api_key=api_key, model_name=llm_name)

# Chain:
# the input string goes to each of the three retrievers (separate objects, one
# per corpus) and, unchanged, into "question"; the results fill the prompt's
# {context}, {context1}, {context2} and {question} placeholders; the prompt is
# sent to the model and the reply is parsed to a plain string.
chain = (
    {
        "context": csv_retriever,
        "context1": url_retriever,
        "context2": pdf_retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_remote
    | StrOutputParser()
)

In [98]:
def activate_chain(t, c):
    """Stream the chain's answer for title ``t`` and content ``c`` to stdout.

    Chunks are printed as they arrive, without separators. Nothing is
    returned (the function's value is ``None``).
    """
    query = f'Title: {t} and Content: {c}'
    for piece in chain.stream(query):
        print(piece, end="")
    
# input() already returns a str, so no str() wrapper is needed.
title = input("Please give me your suggested title, I will optimize it! ")
print()
content = input("Please tell me what you are writing about: ")

print("response")
# activate_chain() prints the streamed chunks itself and returns None; wrapping
# the call in print() appended a stray "None" to the end of the answer.
activate_chain(title, content)
print()  # finish the streamed line with a newline

Please give me your suggested title, I will optimize it!  Testing AI





Please tell me what you are writing about:  The use of AI in education


response
Here are 3 potential URL slugs and SEO titles based on the provided drafted title and content:

URL Slug 1: testing-ai-in-education
SEO Title 1: Testing AI in Education: Implications and Challenges

URL Slug 2: ai-impact-on-education
SEO Title 2: The Impact of AI on Education: Testing and Beyond

URL Slug 3: ai-education-testing
SEO Title 3: AI and Education: Testing the Limits and Potential

The URL slugs follow proper formatting by using lowercase letters, separating words with hyphens, and avoiding special characters.

The SEO titles are concise (under 60 characters), incorporate the main keyword from the drafted title ("testing AI" or "AI in education"), and provide a bit more context about the content's focus on implications, impact, limitations and potential of AI in educational testing and beyond. The titles aim to be descriptive and compelling to entice users to click through from search results.None


In [77]:
def activate_chain(t, c):
    """Run the chain once for title ``t`` and content ``c`` and return its output."""
    return chain.invoke(f'Title: {t} and Content: {c}')
    
# Ask for the draft title and the article content, then print the chain's answer.
title = input("Please give me your suggested title, I will optimize it! ")
print()
content = input("Please tell me what you are writing about: ")

print("response")
print(activate_chain(title, content))

Please give me your suggested title, I will optimize it!  How I cope with mental health issues as a student-athlete





Please tell me what you are writing about:  They say that when you get worked up about something to count down from ten or repeat a mantra to try to calm down. But what happens when your chest gets so tight that you can’t think about what comes after seven? What about when you stomach is lurching up your throat at what feels like 1,000 miles an hour? How are you supposed to remember a mantra when you are trying to stop your eyes from darting from right to left, constantly in search of something but you’re not sure what, all while you are trying to figure out how to escape your shirt that is inexplicably and suddenly trying to pull you inside out? I had my first panic attack when I was 17. I was sitting in my AP Stat class, third row back, fourth seat from the right. It was a crisp early October day and I remember the vivid fall colors adorning the trees outside the windows to my right: burnt orange leaves with the veins dyed deeper orange hues, blood red, perfectly symmetrical foliage 

response
Here are 3 potential URL slugs and SEO titles based on the provided draft title and content:

URL Slug 1: coping-with-mental-health-issues-as-student-athlete
SEO Title 1: Coping with Mental Health Issues as a Student-Athlete

URL Slug 2: my-experience-with-anxiety-and-depression-in-college-sports 
SEO Title 2: My Experience with Anxiety and Depression as a College Athlete

URL Slug 3: managing-mental-health-challenges-while-playing-college-sports
SEO Title 3: How I Manage Mental Health Challenges While Playing College Sports


In [None]:
import gradio as gr
def chat(input_text, dept, title, content, chat_history):
    """Answer one question with the chain and append the exchange to the history.

    Args:
        input_text: The user's question.
        dept: Department the user writes for.
        title: Article title.
        content: Article content.
        chat_history: List of ``(question, answer)`` pairs so far; ``None`` or
            an empty value starts a new list. A non-empty list is appended to
            in place.

    Returns:
        A 6-tuple ``(chat_history, chat_history, "", "", "", "")``: the updated
        history twice, followed by four empty strings (used to clear input
        fields when this function is wired to six outputs).
    """
    chat_history = chat_history or []

    # Assemble the single string that is sent through the chain.
    prompt_text = f"""Given that I work for this department {dept} and have the article 
    title of this: {title}, here is what the 
    article is about {content} answer this question: {input_text}"""

    answer = chain.invoke(prompt_text)
    chat_history.append((input_text, answer))

    # Clear input fields and maintain the chat history
    return chat_history, chat_history, "", "", "", ""


def chat_stream(input_text, dept, title, content, chat_history):
    """Stream the chain's answer to one question as a growing chat history.

    This is a generator: after every chunk received from ``chain.stream`` it
    yields the complete history (previous exchanges plus the current question
    paired with the answer text received so far). It yields a single value
    per step, so it fits an event handler with one chatbot output.

    Args:
        input_text: The user's question.
        dept: Department the user writes for.
        title: Article title.
        content: Article content.
        chat_history: Previous ``(question, answer)`` pairs, or ``None``.
            The passed-in list is not modified.

    Yields:
        A new list of ``(question, answer)`` pairs on every step.
    """
    history = list(chat_history or [])

    # Assemble the single string that is sent through the chain.
    prompt_text = f"""Given that I work for this department {dept} and have the article 
    title of this: {title}, here is what the 
    article is about {content} answer this question: {input_text}"""

    partial_message = ""
    # The chain ends in StrOutputParser, so each streamed chunk is a plain str.
    for chunk in chain.stream(prompt_text):
        partial_message += chunk
        yield history + [(input_text, partial_message)]



# Adjust the chain setup
# llm_name now holds the dated model id that is actually sent to the API;
# before, it was set to "claude-3-opus" and then never used.
llm_name = "claude-3-opus-20240229"
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    # Fail here with a clear message instead of deep inside the client.
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it or add it to the .env file."
    )

model_remote = ChatAnthropic(api_key=api_key, model_name=llm_name)

# The input string goes to the three retrievers and, unchanged, into
# "question"; the filled prompt is sent to the model and parsed to a string.
chain = (
    {
        "context": csv_retriever,
        "context1": url_retriever,
        "context2": pdf_retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_remote
    | StrOutputParser()
)

# Setup Gradio UI
theme = gr.themes.Base(
    primary_hue="red",
    secondary_hue="red",
    neutral_hue="slate",
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown("<h1><center>Daily Pennsylvanian SEO Optimizer</center></h1>")
    chatbot = gr.Chatbot()
    title = gr.Textbox(placeholder="Title here", label="Article Title")
    content = gr.Textbox(placeholder="Article content here", label="Article Content")
    input_box = gr.Textbox(placeholder="Chat with the GPT", label="Question")
    dept = gr.Dropdown(["Under the Button", "34th Street", "Quaker Nation", "DP General"], label="Department", info="Please tell me what department you are writing for!", allow_custom_value = True)
    state = gr.State()

    submit = gr.Button("SEND")
    clear = gr.Button("CLEAR")
    reset_chat = gr.Button("RESET CHAT HISTORY")

    # The chatbot's own current value is passed in as the history. The State
    # component was the history input before, but no handler ever wrote to it
    # on submit, so earlier exchanges could never reach the handler.
    submit.click(chat_stream, inputs=[input_box, dept, title, content, chatbot], outputs=[chatbot])
    # One value per output, in order: chatbot, input_box, dept, title, content, state.
    # `content` is a Textbox and now receives None; it used to receive a list.
    clear.click(lambda: ([], None, None, None, None, []), inputs=None, outputs=[chatbot, input_box, dept, title, content, state], queue=False)
    reset_chat.click(lambda: [], inputs=None, outputs=[chatbot], queue=False)

# NOTE(review): share=True publishes the app on a public gradio.live URL, and
# every request made through it is billed to the configured Anthropic API key.
# Confirm this is intended before sharing the link; use share=False for local use.
demo.launch(debug=True, share=True)

Running on local URL:  http://127.0.0.1:7865
IMPORTANT: You are using gradio version 4.16.0, however version 4.29.0 is available, please upgrade.
--------
Running on public URL: https://806923159ef9359193.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


### Streaming with Anthropic

In [81]:
# Minimal streaming example, independent of the RAG pipeline above.
# It uses its own names: the earlier version rebound `chat`, `prompt` and
# `chain`, which overwrote the Gradio `chat` handler and the RAG prompt/chain,
# so re-running earlier cells afterwards silently used the wrong objects.
stream_llm = ChatAnthropic(temperature=0.3, model_name="claude-3-opus-20240229")
stream_prompt = ChatPromptTemplate.from_messages(
    [("human", "Give me a list of famous tourist attractions in Japan")]
)
stream_chain = stream_prompt | stream_llm
# There is no output parser here, so each chunk is a message chunk whose text
# is in `.content`; flush so the text appears as it arrives.
for chunk in stream_chain.stream({}):
    print(chunk.content, end="", flush=True)

Here is a list of famous tourist attractions in Japan:

1. Mount Fuji - Japan's highest mountain and a national symbol
2. Tokyo Skytree - The tallest tower in Japan, offering panoramic views of Tokyo
3. Fushimi Inari Shrine - A Shinto shrine famous for its thousands of bright orange torii gates
4. Hiroshima Peace Memorial Park - A memorial park dedicated to the victims of the atomic bombing
5. Kinkaku-ji (Golden Pavilion) - A Zen temple in Kyoto with a gold-leaf-covered pavilion
6. Tokyo DisneySea - A unique Disney theme park with a nautical and adventure theme
7. Osaka Castle - A historic castle surrounded by a park, museum, and shops

KeyboardInterrupt: 