In [1]:
import os
from langchain.document_loaders import PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import cohere
import pandas as pd
import numpy as np
from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import datetime
import sys
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [13]:
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
co = cohere.Client(COHERE_API_KEY)

In [4]:
loader = PyPDFLoader("../data/Beliefs-About-Linear-Social-Progress.pdf")
pages = loader.load()
file_path = "../data/BetterUp-Classified.csv"
loader = CSVLoader(file_path=file_path)
documents = loader.load()

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [6]:
splits = text_splitter.split_documents(pages)

In [7]:
df = pd.read_csv("../data/BetterUp-Classified.csv")
df["Conversation Part 1"] = df["Conversation Part 1"].fillna("")
df["Conversation Part 2"] = df["Conversation Part 2"].fillna("")
df["Conversation Part 3"] = df["Conversation Part 3"].fillna("")
df["formatted"] = df["Conversation Part 1"] + " " + df["Conversation Part 2"] + " " + df["Conversation Part 3"]
df = df.drop_duplicates(subset=["formatted"])

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150
)

In [9]:
all_chunks = []
for idx, row in df.iterrows():
    conversation_id = row['convo_id']
    text = row['formatted']
    chunks = text_splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        all_chunks.append({
            "id": f"{conversation_id}_{i}",
            "source": "csv",
            "text": chunk
        })

In [17]:
embedding_model = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=COHERE_API_KEY, user_agent="langchain-embeddings")

In [20]:
persist_directory = '../docs/chroma/'

In [21]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    persist_directory=persist_directory
)

In [28]:
question = "who is the author of the paper?"
docs = vectordb.similarity_search(question,k=3)

[Document(metadata={'author': 'Julia D. Hur and Rachel L. Ruttan', 'creator': 'Adobe InDesign CS5.5 (7.5)', 'rgid': 'PB:369478403_AS:11431281204625714@1699893051228', 'total_pages': 18, 'subject': 'Pers Soc Psychol Bull 0.0:01461672231158843', 'creationdate': '2023-03-21T16:12:55+05:30', 'keywords': 'lay beliefs,diversity,equality,social justice,social issues', 'title': 'Beliefs About Linear Social Progress', 'trapped': '/False', 'page': 0, 'page_label': '1', 'moddate': '2023-11-13T08:28:22-08:00', 'source': '../data/Beliefs-About-Linear-Social-Progress.pdf', 'producer': 'Adobe PDF Library 9.9; modified using iText 4.2.0 by 1T3XT'}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/369478403\nBeliefs About Linear Social Progress\nArticle\xa0\xa0in \xa0\xa0Personality and Social Psychology Bulletin · March 2023\nDOI: 10.1177/01461672231158843\nCITATIONS\n6\nREADS\n184\n2 authors:\nJulia Hur\nUniversity Canada West

In [30]:
from langchain.schema import Document

In [31]:
documents = [
    Document(page_content=chunk["text"], metadata=chunk.get("metadata", {}))
    for chunk in all_chunks
]

In [34]:
documents[0]

Document(metadata={}, page_content="Speaker A: fine. Perfect. Yeah. Mhm. Mhm. Yeah. Yeah. Speaker B: No, I'm good. Speaker A: No. Hi, how are you? Speaker B: How are you? Speaker A: Good thanks. Speaker B: Hm My name is played to me and the next time you Amanda. Speaker A: Mhm. Hi, my name's Amanda. All right. Speaker B: Mm. Speaker A: Does that time start? Yeah, three o'clock okay, starts now. Speaker B: Oh right. At three. Speaker A: Yeah. Speaker B: Right, right. Speaker A: What's that? Don't go early so I didn't want to. All right. Okay. Speaker B: Yeah, I don't know anything whatever. Speaker A: So what do you want to talk about? Where are you located? Speaker B: Um, right now my wife and I are living in texas. In Austin texas. Speaker A: Really? Speaker B: Yeah. So uh, I'm sorry my kid just walked in there, Jack, can you have boston put a clean diaper on you? Speaker A: So strange. Right. Mhm. Speaker B: Okay. It's like I put a diaper on my kitties around naked one second. Speake

In [36]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

  embedding_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [39]:
documents_shortened = [
    Document(page_content=doc.page_content[:100], metadata=doc.metadata)
    for doc in documents
]


In [40]:
persist_directory_conversations = '../docs/chroma_conversations/'
vectordb = Chroma.from_documents(
    documents=documents_shortened,
    embedding=embedding_model,
    persist_directory=persist_directory_conversations
)

In [73]:
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.environ.get("LANGCHAIN_API_KEY")

In [90]:
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Be as detailed as possible.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [44]:
embedding_model_cohere = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=COHERE_API_KEY, user_agent="langchain-embeddings")

In [46]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_model_cohere)
vectordb_convos = Chroma(persist_directory=persist_directory_conversations, embedding_function=embedding_model)

In [80]:
from typing import List, Optional
from pydantic import BaseModel, Field, root_validator
from langchain.chat_models.base import BaseChatModel
from langchain.schema import AIMessage, BaseMessage
from langchain.schema.output import ChatGeneration, ChatResult
import cohere

'''
This was ChatGPT generated just to get a demo working for Cohere Langchain integration.
The Cohere client currently has a bug in the chat_types so this alternative was used.
'''

class MyCohereLLM(BaseChatModel):
    cohere_api_key: str
    model: str = "command-r-plus"
    temperature: float = 0.3
    client: Optional[cohere.Client] = Field(default=None, exclude=True)

    @root_validator(pre=True)
    def build_client(cls, values):
        if values.get("client") is None:
            values["client"] = cohere.Client(values["cohere_api_key"])
        return values

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        **kwargs,
    ) -> ChatResult:
        prompt = "\n".join([f"{m.type.capitalize()}: {m.content}" for m in messages])

        response = self.client.chat(
            message=prompt,
            model=self.model,
            temperature=self.temperature,
            stop_sequences=stop,
        )

        return ChatResult(
            generations=[
                ChatGeneration(message=AIMessage(content=response.text))
            ]
        )

    @property
    def _llm_type(self) -> str:
        return "custom-cohere"


/var/folders/sm/h_vm8nzd6z54q1s5kfsmm6x00000gn/T/ipykernel_70755/1885975952.py:15: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @root_validator(pre=True)


In [91]:
llm = MyCohereLLM(cohere_api_key=COHERE_API_KEY)

In [92]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [95]:
response = qa_chain.invoke("Explain the goal, methods, and results of Study SEt 5")

In [96]:
response['result']

"Goal: Study Set 5 aimed to understand the potential consequences of believing in linear progress regarding social issues. Specifically, it explored whether assuming linear progress reduces the perceived urgency of an issue and the perceived effort needed to make further progress.\n\nMethod: Study Set 5 consisted of two studies, 5A and 5B, with similar procedures. Participants were recruited via Amazon's Mechanical Turk service, with a predetermined sample size of 200 participants for each study.\n\nIn Study 5A, participants were randomly assigned to one of two conditions in a between-subjects design. They were asked to view two graphs depicting progress in gender equality: one showing linear progress and the other showing actual, nonlinear progress. The issues presented were women in STEM and women on boards, and the manipulation involved presenting one issue as linear and the other as nonlinear, based on participants' estimates of progress (linear) versus real data (nonlinear).\n\nIn

In [98]:
qa_chain_convos = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb_convos.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [101]:
response = qa_chain_convos.invoke("What were the most common themes found in the conversation dataset?")
response['result']

"I don't know. The provided conversation is not clear and seems to be incomplete. To answer the question, I would need more context and a complete transcript of the conversation."

In [102]:
response['source_documents']

[Document(metadata={}, page_content='the conversations and stuff. Speaker B: Mhm. You know, every conversation, it seems like I have with'),
 Document(metadata={}, page_content='B: What? Speaker A: What has like before corona? Speaker B: Yeah. Speaker A: What were some things t'),
 Document(metadata={}, page_content='stuff too. Speaker A: Research studies that kind of a bunch of lately. Speaker B: Uh huh. Speaker A:'),
 Document(metadata={}, page_content='Yeah, we had a really interesting topics. Speaker B: Like, we talked about the weather and covid our')]

In [None]:
''''The studies referenced in the provided text appear to be a mix of experimental and survey-based research designs. 
\n\nFor example, the Clifford et al. (2015) study appears to be examining the validity of using Amazon\'s Mechanical Turk for 
political ideology research. This may involve analyzing existing data or conducting a survey of Mechanical Turk 
workers to gather political ideology data and then validating it through various methods. \n\nCrawford et al. (2017) 
seem to be reporting on an experimental intervention with young people, likely involving a treatment and control group to 
understand the role of procedural justice in anti-social behavior interventions. \n\nThe Critcher and Risen (2014) study appears to
 be an experimental one where participants are exposed to counter-stereotypical examples and then measured on their automatic inferences, suggesting a potential priming 
 methodology. \n\nThe remaining studies also suggest experimental designs with various manipulations and measurements, 
 but without the full context of each paper, I cannot provide a more detailed explanation of the specific methods employed. 
 However, terms like "manipulation," "conditions," and "dependent variables" strongly indicate experimental designs with hypothesis testing. \n\nOverall, while the specific methods vary across these studies,
they generally seem to involve some form of experimental manipulation and data collection, whether through surveys, observations, or other means, followed by statistical analysis to test hypotheses and draw conclusions.'
'''

In [None]:
'''
"Goal: Study Set 5 aimed to understand the potential consequences of believing in linear progress regarding social issues. Specifically, it explored whether assuming linear progress reduces the perceived 
urgency of an issue and the perceived effort needed to make further progress.
\n\nMethod: Study Set 5 consisted of two studies, 5A and 5B, with similar procedures. 
Participants were recruited via Amazon's Mechanical Turk service, with a predetermined sample size of 200 participants for each 
study.\n\nIn Study 5A, participants were randomly assigned to one of two conditions in a between-subjects design. 
They were asked to view two graphs depicting progress in gender equality: one showing linear progress and the other showing actual,
 nonlinear progress. The issues presented were women in STEM and women on boards, and the manipulation involved presenting one 
 issue as linear and the other as nonlinear, based on participants' estimates of progress (linear) versus real data (nonlinear).
 \n\nIn Study 5B, participants were also randomly assigned to one of two conditions, with the same gender equality issues presented. 
 However, in this study, both graphs had the same starting and ending points, but one showed a more linear pattern of progress 
 while the other showed a less linear pattern.\n\nIn both studies, participants were asked to indicate which issue they 
 believed was more urgent and required more help and effort to make further progress.\n\nResults: 
The results of Study Set 5 are not explicitly mentioned in the provided text.
However, based on the research question and hypotheses, the studies likely examined the impact of believing in linear progress on perceived urgency and effort needed for social issues. The specific findings would provide insight into whether these beliefs influence people's perceptions of the importance and difficulty of addressing social issues."

'''