# Welcome to our App! 
In general, we recommend that you carefully read the comments we have provided in order to understand our setup. 
Please run this code in JupyterLab or a Jupyter Notebook for optimized functionality!

# First Time Setup

In [None]:
#Run this if you are running the program for the first time
!pip install nomic
!pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain
!pip install -U langchain-anthropic

In [None]:
# Note: you must be logged in to Nomic before the embedding steps in this notebook can run.
# Run this command to log in and get an API key:
! nomic login

In [1]:
from langchain_nomic.embeddings import NomicEmbeddings
import os
from dotenv import load_dotenv
# Confirm the .env file: load its variables into the process environment
# (ANTHROPIC_API_KEY is read from the environment later in this notebook).
# The result is stored under a descriptive name because assigning to `_`
# shadows IPython's built-in "last output" variable.
dotenv_loaded = load_dotenv()
dotenv_loaded  # displayed as the cell output; True means load_dotenv reported success

True

# Creates the Vector Database for RAG Based on Articles and Media

In [2]:
# Convert data into text functions
import anthropic
from langchain_anthropic import ChatAnthropic
import os

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_nomic import NomicEmbeddings
from langchain_nomic.embeddings import NomicEmbeddings

from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# PDF files (relative to the notebook) that are loaded and indexed.
pdf_list = [
    "files/Style.pdf",
    "files/DEI.pdf",
    "files/34th.pdf",
    "files/sports.pdf",
]

# Web pages that are loaded and indexed, one URL per line.
urls = [
    # URL slugs
    "https://yoast.com/slug/",
    "https://www.semrush.com/blog/what-is-a-url-slug/?kw=&cmp=US_SRCH_DSA_Blog_EN&label=dsa_pagefeed&Network=g&Device=c&kwid=dsa-2185834088336&cmpid=18348486859&agpid=156019556762&BU=Core&extid=97592280163&adpos=",
    # SEO writing
    "https://www.upwork.com/resources/how-to-write-seo-content",
    "https://authorservices.wiley.com/author-resources/Journal-Authors/Prepare/writing-for-seo.html",
    "https://www.semrush.com/blog/seo-writing/",
    "https://www.semrush.com/kb/839-how-to-write-seo-articles-four-steps",
    "https://www.flowmatters.com/blog/a-practical-guide-on-how-to-write-seo-articles/",
    # SEO and email marketing
    "https://www.maropost.com/how-to-combine-seo-and-email-marketing-for-better-rankings/",
    "https://www.webfx.com/seo/learn/email-marketing-tips-to-improve-seo/",
    "https://sendgrid.com/en-us/blog/seo-and-email-marketing",
    "https://www.emailonacid.com/blog/article/email-marketing/seo-connections/",
    "https://coalitiontechnologies.com/blog/strategic-seo-tips-for-email-marketing",
    # Email subject lines
    "https://optinmonster.com/101-email-subject-lines-your-subscribers-cant-resist/",
    "https://www.wordstream.com/blog/ws/2014/03/31/email-subject-lines",
    "https://www.constantcontact.com/blog/good-email-subject-lines/",
    "https://blog.hubspot.com/marketing/best-email-subject-lines-list",
]

def load_csv(csv):
    """Load a CSV file through CSVLoader.

    Args:
        csv: Path of the CSV file; passed to CSVLoader as ``file_path``.

    Returns:
        The result of ``CSVLoader.load()`` for that file.
    """
    return CSVLoader(file_path=csv).load()

def load_url(url_list):
    """Load every URL with its own WebBaseLoader and flatten the results.

    Args:
        url_list: Iterable of URL strings.

    Returns:
        A single flat list containing the items returned by each loader,
        in the same order as ``url_list``.
    """
    return [
        page
        for address in url_list
        for page in WebBaseLoader(address).load()
    ]

def load_pdf(pdf_list):
    """Load every PDF with its own PyPDFLoader and flatten the results.

    Args:
        pdf_list: Iterable of PDF file paths.

    Returns:
        A single flat list containing the items returned by each loader,
        in the same order as ``pdf_list``.
    """
    return [
        page
        for path in pdf_list
        for page in PyPDFLoader(path).load()
    ]

# Load the three sources once, when this cell runs:
#   data      - items loaded from the CSV at files/organic_stats.csv
#   docs_list - items loaded from every address in `urls`
#   pdfs_list - items loaded from every file in `pdf_list`
# These names are consumed by splitter() further down.
data = load_csv("files/organic_stats.csv")
docs_list = load_url(urls)
pdfs_list = load_pdf(pdf_list)

#Splitting
def splitter(data, docs_list, pdfs_list, chunk_size=7500, chunk_overlap=100):
    """Split the CSV, URL and PDF documents into chunks.

    One CharacterTextSplitter is built with ``from_tiktoken_encoder`` and
    reused for all three inputs; the original built three identical
    splitters from copy-pasted code.

    Args:
        data: Documents loaded from the CSV file.
        docs_list: Documents loaded from the web pages.
        pdfs_list: Documents loaded from the PDF files.
        chunk_size: ``chunk_size`` passed to the splitter (default 7500,
            the value previously hard-coded).
        chunk_overlap: ``chunk_overlap`` passed to the splitter (default 100,
            the value previously hard-coded).

    Returns:
        A tuple ``(doc_splits, url_splits, pdf_splits)`` with the split
        documents for each input, in that order.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    doc_splits = text_splitter.split_documents(data)
    url_splits = text_splitter.split_documents(docs_list)
    pdf_splits = text_splitter.split_documents(pdfs_list)

    return doc_splits, url_splits, pdf_splits

doc_splits, url_splits, pdf_splits = splitter(data, docs_list, pdfs_list)

# One embedding client shared by the three stores (same model for all of them).
embedding_model = NomicEmbeddings(model="nomic-embed-text-v1")

# IMPORTANT: every store needs its own collection name. Chroma looks collections
# up by name, so three stores created with the same name on the default in-memory
# client end up reading and writing one shared collection, and the three
# retrievers below would no longer return source-specific context.

# Vector DB for Articles.csv
csv_vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma-csv",
    embedding=embedding_model,
)
csv_retriever = csv_vectorstore.as_retriever()

# Vector DB for SEO
url_vectorstore = Chroma.from_documents(
    documents=url_splits,
    collection_name="rag-chroma-urls",
    embedding=embedding_model,
)
url_retriever = url_vectorstore.as_retriever()

# Vector DB for Writing Style Documents
pdf_vectorstore = Chroma.from_documents(
    documents=pdf_splits,
    collection_name="rag-chroma-pdfs",
    embedding=embedding_model,
)
pdf_retriever = pdf_vectorstore.as_retriever()


# Prompt Template
This is the prompt template that is provided to the model. Feel free to change it and customize it for your needs!

In [None]:

# Prompt template sent to the model. The chain fills in these placeholders:
#   {context}  - results from csv_retriever
#   {context1} - results from url_retriever
#   {context2} - results from pdf_retriever
#   {question} - the text passed to chain.invoke()
template = """
**Information about previous articles as well as their performance metrics can be found through: {context}** 
**Information about SEO Optimization can be found through: {context1}** 
**The Daily Pennsylvanian writing style guide and tips can be found through {context2} Ensure that all of the titles you are writing follow these guides** 
** Question: Answer the user's question. Typically they will ask for potential URL Slugs and SEO titles based on the provided Drafted Title and Content. 
Make sure that the URL Slug is in the correct format that a URL Slug should be and that the SEO title is search engine optimized and concise. However, be prepared to answer other questions

DO NOT ASSUME ANY INFORMATION, make the title based ONLY on the information told in the question here: {question}. 
This question contains the department that the user writes for, 
the article title they have drafted, the content the article is about, and what they would like you to do with that information. 
It is extremely important that you only use the information 
stated in the question. If not, the writer will be fired and it will be all of your fault. do not do it. 

Also make sure to never contain profanities, slurs, or hateful speech no matter what. 
** 

**Answer: Answer to the user's question**


"""

# NOTE(review): the string literal below is a bare expression. It is never assigned
# or appended to `template`, so it has no effect on the prompt. It looks like a
# draft answer format - confirm whether it was meant to be part of the template.
"""*Potential URL Slugs:** * 
**Option 1:** Insert a slug here *  MAKE SURE THIS ONLY CONTAINS CONTENT FROM THE QUESTION
**Option 2:** Insert a slug here *  MAKE SURE THIS ONLY CONTAINS CONTENT FROM THE QUESTION
**Option 3:** Insert a slug here *  MAKE SURE THIS ONLY CONTAINS CONTENT FROM THE QUESTION

**Potential SEO Titles:** * 
**Option 1:** Insert a title here * MAKE SURE THIS ONLY CONTAINS CONTENT FROM THE QUESTION
**Option 2:** Insert a title here * MAKE SURE THIS ONLY CONTAINS CONTENT FROM THE QUESTION
**Option 3:** Insert a title here * MAKE SURE THIS ONLY CONTAINS CONTENT FROM THE QUESTION
"""

# Build the chat prompt object from the template string above.
prompt = ChatPromptTemplate.from_template(template)


# Remote LLM (Anthropic API), not a local model.
# Use the dated model ID, the same one the interface cell passes to ChatAnthropic;
# the bare name "claude-3-opus" is not that identifier.
llm_name = "claude-3-opus-20240229"
model_remote = ChatAnthropic(model=llm_name)

# Chain
# The question passed to chain.invoke() is sent to each of the three retrievers
# (separate objects for context, context1 and context2) and is also passed through
# unchanged as "question". The resulting dict fills the prompt, the prompt goes to
# the model, and the model's reply is parsed to a plain string.
chain = (
    {
        "context": csv_retriever,
        "context1": url_retriever,
        "context2": pdf_retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_remote
    | StrOutputParser()
)

# Run the Interface!

In [None]:
# Working without streaming
import gradio as gr
def chat(input_text, dept, title, content, chat_history):
    """Answer one question and append the exchange to the chat history.

    Args:
        input_text: The user's question.
        dept: Department the user writes for.
        title: Drafted article title.
        content: Article content.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns, or ``None``/empty when there is no history yet.

    Returns:
        A 6-tuple matching the outputs wired to the SEND button
        (chatbot, state, input_box, dept, title, content): the updated
        history twice, followed by four empty strings that clear the inputs.
    """
    chat_history = chat_history or []

    # Assemble the prompt text that is handed to the chain as the question.
    prompt_text = f""" I am a student who writes for this department: {dept} so use the writing guide that is meant for: {dept} 
    The title is: {title}, the content is: {content} complete my question: {input_text}"""

    # The removed `global context` declared a name that is never defined or used.
    answer = chain.invoke(prompt_text)
    chat_history.append((input_text, answer))

    # Clear input fields and maintain the chat history
    return chat_history, chat_history, "", "", "", ""

# Adjust the chain setup
# `llm_name` now holds the dated model ID that is actually sent to the API;
# before, the variable was assigned a different value and then left unused.
llm_name = "claude-3-opus-20240229"
api_key = os.environ.get("ANTHROPIC_API_KEY")  # None when the variable is not set

model_remote = ChatAnthropic(api_key=api_key, model_name=llm_name)

# Rebuild the chain around the model created above: the three retrievers supply
# the context values, and the question is passed through unchanged.
chain = (
    {
        "context": csv_retriever,
        "context1": url_retriever,
        "context2": pdf_retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_remote
    | StrOutputParser()
)

# Setup Gradio UI
theme = gr.themes.Base(
    primary_hue="red",
    secondary_hue="red",
    neutral_hue="slate",
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown("<h1><center>Daily Pennsylvanian SEO Optimizer</center></h1>")
    gr.Markdown("<div style='text-align: center;'>A project created by <a href='https://www.linkedin.com/in/jason-saito/'>Jason Saito</a> and Sean McKeown</div>")

    chatbot = gr.Chatbot()
    title = gr.Textbox(placeholder="Title here", label="Article Title")
    content = gr.Textbox(placeholder="Article content here", label="Article Content")
    input_box = gr.Textbox(placeholder="Chat with the GPT", label="Question")
    dept = gr.Dropdown(["Under the Button", "34th Street", "Quaker Nation", "DP General"], label="Department", info="Please tell me what department you are writing for!", allow_custom_value = True)
    state = gr.State()  # holds the list of (question, answer) tuples between clicks

    submit = gr.Button("SEND")
    clear = gr.Button("CLEAR")
    reset_chat = gr.Button("RESET CHAT HISTORY")
    # The <center> tag is now closed and wraps the link (it was left unclosed inside the <a>).
    gr.Markdown("<center><a href='https://forms.gle/GWXTSeykKMPHm6DY9'>Submit Bugs or Feedback Here!</a></center>")

    # chat() returns six values in exactly this output order.
    submit.click(chat, inputs=[input_box, dept, title, content, state], outputs=[chatbot, state, input_box, dept, title, content])
    # Returned values line up with the outputs list: empty chat, four cleared inputs,
    # empty history. The `content` textbox used to receive a list ([]) instead of None.
    clear.click(lambda: ([], None, None, None, None, []), inputs=None, outputs=[chatbot, input_box, dept, title, content, state], queue=False)
    # Reset the stored history as well as the visible chat; clearing only the chatbot
    # left the old history in `state`, so it reappeared on the next SEND.
    reset_chat.click(lambda: ([], []), inputs=None, outputs=[chatbot, state], queue=False)

# NOTE: share=True publishes a temporary public URL for the app (see the
# "Running on public URL" line in the output). Anyone with that link can send
# requests that use your API key; set share=False to keep the app local.
demo.launch(debug=True, share=True)

Running on local URL:  http://127.0.0.1:7860
IMPORTANT: You are using gradio version 4.16.0, however version 4.29.0 is available, please upgrade.
--------
Running on public URL: https://16c9843b88b7426534.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
