In [1]:
import os
import openai
import sys
import io
import IPython.display
import gradio as gr

from dotenv import load_dotenv, find_dotenv
# Load settings from a .env file into the environment; the call's result is
# bound to `_` because it is not used.
_ = load_dotenv(find_dotenv()) # read local .env file

# Both variables are required: indexing os.environ raises KeyError if one is missing.
openai.api_key  = os.environ['OPENAI_API_KEY']
hf_api_key = os.environ['HF_API_KEY']

In [2]:
import sys
import csv

# Raise the csv module's per-field size cap so very large text fields can be
# parsed. sys.maxsize does not fit in a C long on every platform (e.g. 64-bit
# Windows), where field_size_limit() raises OverflowError, so halve the value
# until it is accepted.
previous_limit = csv.field_size_limit()
field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_limit)
        break
    except OverflowError:
        field_limit //= 2

# Display the limit that was in effect before this cell ran.
previous_limit

131072

In [4]:
from langchain.document_loaders.csv_loader import CSVLoader
# Input CSV; the path is relative to the notebook's working directory.
load_filename = "processed/scraped_null_removed.csv"
# Column handed to CSVLoader as `source_column`
# (presumably recorded as each document's source — confirm against CSVLoader docs).
source_column = "fname"
loader = CSVLoader(file_path=load_filename, source_column=source_column)
# Load the documents; `pages` is what the text splitter consumes below.
pages = loader.load()

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks sized by character count (len),
# with chunk_size=3500 and chunk_overlap=300.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3500,
    chunk_overlap=300,
    length_function=len,
    # The lookbehind pattern is written as a raw string: "\." is an invalid
    # escape sequence in a normal literal (a warning today, an error in future
    # Python versions). The runtime value is unchanged.
    # NOTE(review): whether separators are interpreted as regular expressions
    # depends on the installed langchain version (newer releases require
    # is_separator_regex=True) — confirm this pattern is actually applied.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)
splits = text_splitter.split_documents(pages)

In [6]:
# Number of chunks produced by the splitter.
len(splits)

420604

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

# Embedding client used both when inserting chunks and when querying the store.
# No arguments are passed; presumably it picks up the OpenAI key configured in
# the first cell — confirm.
embedding = OpenAIEmbeddings()

In [8]:
# Postgres connection settings. The user and password can be overridden with
# the PGUSER / PGPASSWORD environment variables (for example from the .env file
# loaded in the first cell), so credentials do not have to be edited into the
# notebook. The defaults keep the original local-development values.
database_name = "experian"
user = os.environ.get("PGUSER", "larryyin")
passwd = os.environ.get("PGPASSWORD", "")

In [9]:
# PGVector needs the connection string to the database.
# It can be written out literally, e.g.:
# CONNECTION_STRING = "postgresql+psycopg2://harrisonchase@localhost:5432/test3"

# Here it is built from the connection settings defined in the previous cell.

CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver="psycopg2",
    host="localhost",
    port=5432,
    database=database_name,
    user=user,
    password=passwd,
)

# Collection name passed to PGVector for both ingestion and querying.
COLLECTION_NAME = "experian230725"

In [10]:
# Inspect the generated connection string. It embeds the password after the
# user name, so do not share this output when a password is set.
CONNECTION_STRING

'postgresql+psycopg2://larryyin:@localhost:5432/experian'

In [11]:
import psycopg2
from psycopg2 import sql
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

# Step 1: make sure the target database exists.
# Connect to the "postgres" (system) database to create a new database.
conn = psycopg2.connect(
    dbname="postgres",
    user=user,
    password=passwd,
    host="localhost"
)

# The name of the database we want to create
dbname = database_name

try:
    # CREATE DATABASE cannot run inside a transaction block, so switch the
    # connection to autocommit before any statement opens a transaction.
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

    cur = conn.cursor()
    try:
        # Check if the database already exists (name passed as a bound parameter)
        cur.execute(sql.SQL("SELECT 1 FROM pg_catalog.pg_database WHERE datname = %s"), (dbname,))
        exists = cur.fetchone()

        if not exists:
            # Identifiers cannot be bound as parameters; sql.Identifier quotes the name safely.
            cur.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(dbname)))
            print(f"Database {dbname} created successfully.")
        else:
            print(f"Database {dbname} already exists.")
    finally:
        cur.close()
finally:
    # Always release the connection, even if a statement above failed.
    conn.close()


# Step 2: enable the "vector" extension inside the target database.
conn = psycopg2.connect(
    dbname=database_name,
    user=user,
    password=passwd,
    host="localhost"
)

try:
    cur = conn.cursor()
    try:
        # Execute the SQL statement to create the extension
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

        # Commit the transaction
        conn.commit()
    finally:
        cur.close()
finally:
    conn.close()

Database experian already exists.


In [12]:
def close_all_connections(database_name, user, passwd):
    """Terminate every other session connected to ``database_name``.

    Opens a connection to the database on localhost, asks the server to
    terminate all backends attached to that database except the one issuing
    the request, commits, and disconnects.

    Args:
        database_name: Name of the database whose sessions are terminated;
            also the database this function connects to.
        user: Role used to connect.
        passwd: Password for ``user``.

    Raises:
        psycopg2.Error: If the connection or the query fails. The connection
            is closed before the error propagates.
    """
    # Keyword arguments instead of a hand-built DSN string, so values that
    # contain quotes or spaces cannot break (or alter) the connection string.
    conn = psycopg2.connect(
        host="localhost",
        dbname=database_name,
        user=user,
        password=passwd,
    )
    try:
        with conn.cursor() as cursor:
            # The database name is sent as a bound parameter rather than
            # interpolated into the SQL text.
            cursor.execute(
                """
                SELECT pg_terminate_backend(pg_stat_activity.pid)
                FROM pg_stat_activity
                WHERE pg_stat_activity.datname = %s
                  AND pid <> pg_backend_pid();
                """,
                (database_name,),
            )

        # Commit the transaction
        conn.commit()
    finally:
        # Release the connection even when the query raises.
        conn.close()

In [13]:
def count_connections(database_name, user, passwd):
    """Print how many sessions are currently connected to ``database_name``.

    Connects to the database on localhost and counts the rows in
    ``pg_stat_activity`` for that database. The query does not exclude the
    connection opened by this call, so it is part of the reported number.

    Args:
        database_name: Database to inspect; also the database connected to.
        user: Role used to connect.
        passwd: Password for ``user``.

    Raises:
        psycopg2.Error: If the connection or the query fails. The connection
            is closed before the error propagates.
    """
    # Keyword arguments instead of a hand-built DSN string, so values that
    # contain quotes or spaces cannot break the connection string.
    conn = psycopg2.connect(
        host="localhost",
        dbname=database_name,
        user=user,
        password=passwd,
    )
    try:
        with conn.cursor() as cursor:
            # Bound parameter instead of interpolating the name into the SQL.
            cursor.execute(
                "SELECT COUNT(*) FROM pg_stat_activity WHERE datname = %s;",
                (database_name,),
            )

            # COUNT(*) yields a single row with a single column.
            active_connections = cursor.fetchone()[0]
    finally:
        # Release the connection even when the query raises.
        conn.close()

    print(f'Number of active connections: {active_connections}')

In [None]:
import time

total_length = len(splits)
batch_size = 100
# Reset database connections every this many batches. 30 batches of 100 chunks
# reproduces the original "every 3000 chunks" cadence, but keeps working if
# batch_size is changed to a value that does not divide 3000.
batches_per_reset = 30
# Pause between batches, in seconds (presumably to stay under API rate limits — confirm).
pause_seconds = 10

for batch_number, batch_start in enumerate(range(0, total_length, batch_size), start=1):
    batch_end = min(batch_start + batch_size, total_length)
    batch_texts = splits[batch_start:batch_end]

    # Embed this batch and insert it into the collection.
    PGVector.from_documents(
        documents=batch_texts,
        embedding=embedding,
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
    )
    print(f"Inserted {batch_end}/{total_length} chunks")

    if batch_number % batches_per_reset == 0:
        # Periodically drop the sessions left open on the database, reporting
        # the connection count before and after.
        print("Before disconnect:")
        count_connections(database_name, user, passwd)

        close_all_connections(database_name, user, passwd)

        print("After disconnect:")
        count_connections(database_name, user, passwd)

    # No need to wait once the final batch has been inserted.
    if batch_end < total_length:
        time.sleep(pause_seconds)

In [15]:
# Store handle used for querying: same embedding function, collection name
# and connection string as the ingestion cell.
vectordb = PGVector(embedding_function=embedding,
                  collection_name=COLLECTION_NAME,
                  connection_string=CONNECTION_STRING,
                 )

In [16]:
# Quick retrieval check: look up stored chunks for a sample question.
vectordb.similarity_search("What does Experian do?")

 Document(page_content="They are often the first results when searching your name online and are marketplaces for data brokers to buy and sell personal data, making it easier for anyone, including robocallers, spammers, hackers and identity thieves to access your information and leave you more vulnerable to identity theft.Get your free Experian personal privacy scanStart your free scanGet your free Experian personal privacy scanStart your free scanCreditCreditFree credit monitoring3-bureau reports and FICO® ScoresAnnual credit reportExperian CreditLockUnderstanding credit reportsHow to improve your credit scorePersonal financePersonal financeCredit card reviewsLoan reviewsCar insurance reviewsBudgetingSavingMortgage & rentingSupportSupportHow to freeze your credit fileHow to dispute info on your credit reportIdentity theft victim assistanceSupport for denied creditUpload a document to ExperianExperian customer supportExperian's Diversity, Equity and InclusionLearn more how Experian is 

In [17]:
from langchain.chat_models import ChatOpenAI
# Chat model selection; the commented lines are alternatives that were tried or considered.
# llm_name = "gpt-3.5-turbo"
llm_name = "gpt-3.5-turbo-16k"
# llm_name = "gpt-4-32k"
# temperature=0 is passed to the model for this Q&A use.
llm = ChatOpenAI(model_name=llm_name, temperature=0)

In [18]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationTokenBufferMemory

# Retriever over the vector store, using as_retriever() defaults.
retriever=vectordb.as_retriever()
# Conversation memory shared by every call to `qa`. memory_key matches the
# "chat_history" entry that appears in the chain's results.
memory = ConversationTokenBufferMemory(
    llm = llm,
    max_token_limit=8000,
    memory_key="chat_history",
    return_messages=True
)
# Retrieval-augmented chat chain: called as qa({"question": ...}); the reply is
# read from result["answer"].
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
    verbose=False
)

In [23]:
bot_name = "Experian Bot V0.2"
bot_desc = "Experian's chatbot, from its public webcrawl, serves a credit feast for all."

with gr.Blocks() as demo:
    gr.Markdown(f"# {bot_name}\n\n{bot_desc}")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Type your message (Shift + Enter to submit)", lines=6)
    submit = gr.Button("Submit")
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        """Answer ``message`` with the QA chain and append the exchange to the chat.

        Returns a tuple ``("", chat_history)``: the empty string clears the
        input box, and the updated history is rendered by the Chatbot.
        Blank messages are ignored instead of being sent to the chain.
        """
        chat_history = chat_history or []
        if not message or not message.strip():
            return ("", chat_history)
        result = qa({"question": message})
        chat_history.append((message, result["answer"]))
        return ("", chat_history)

    def reset_conversation():
        """Clear the chat window and the chain's conversation memory.

        Without resetting ``memory``, earlier turns would still be fed to the
        chain after the user pressed "Clear".
        NOTE(review): ``memory`` is a single notebook-level object, so this
        resets the conversation for every open browser session — confirm that
        is acceptable.
        """
        memory.clear()
        return None

    msg.submit(respond, [msg, chatbot], [msg, chatbot], queue=False)
    submit.click(respond, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(reset_conversation, None, chatbot, queue=False)

# Close any app left running by a previous execution of this cell, then launch locally.
gr.close_all()
demo.queue()
demo.launch(share=False)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [20]:
# Shut the UI down: close all running Gradio apps, close this demo, then
# clear the Blocks object.
gr.close_all()
demo.close()
demo.clear()

In [None]:
import langchain
# Turn on langchain's global debug flag for the queries below.
langchain.debug = True

In [34]:
# Ask the chain a question; the result includes the question, the chat history and the answer.
qa({"question": "what does Experian do?"})



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
While Experian Consumer Services uses reasonable efforts to present the most accurate information, all offer information is presented without warranty. Experian websites have been designed to support modern, up-to-date internet browsers. Experian does not support Internet Explorer. If you are currently using a non-supported browser your experience may not be optimal, you may experience rendering issues, and you may be exposed to potential security risks. It is recommended that you upgrade to the most recent browser version. © 2023 All rights reserved. Experian. Experian and the Experian trademarks used herein are trademarks or registered trademarks of Experian and its affi

{'question': 'what does Experian do?',
 'chat_history': [HumanMessage(content='what does Experian do?', additional_kwargs={}, example=False),
  AIMessage(content='Experian is a global information services company that provides data and analytics to help businesses and individuals make informed decisions. They offer a wide range of services, including credit reporting, identity verification, fraud detection, data quality management, and marketing solutions. Experian works with businesses in various industries, such as finance, insurance, healthcare, and retail, to help them manage risk, improve customer acquisition and retention, and make data-driven decisions. They also provide individuals with access to their credit reports and scores, as well as tools and resources to help them manage their credit and protect against identity theft.', additional_kwargs={}, example=False)],
 'answer': 'Experian is a global information services company that provides data and analytics to help businesse

In [35]:
# Follow-up question: with chat history present, the chain first rephrases it
# into a standalone question before answering.
qa({"question": "What are Experian's main businesses?"})



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: what does Experian do?
Assistant: Experian is a global information services company that provides data and analytics to help businesses and individuals make informed decisions. They offer a wide range of services, including credit reporting, identity verification, fraud detection, data quality management, and marketing solutions. Experian works with businesses in various industries, such as finance, insurance, healthcare, and retail, to help them manage risk, improve customer acquisition and retention, and make data-driven decisions. They also provide individuals with access to their credit reports and scores, as well as tools and resources to help them manage their credit and protect against identity theft.
Follow Up Input: What are Experi

{'question': "What are Experian's main businesses?",
 'chat_history': [HumanMessage(content='what does Experian do?', additional_kwargs={}, example=False),
  AIMessage(content='Experian is a global information services company that provides data and analytics to help businesses and individuals make informed decisions. They offer a wide range of services, including credit reporting, identity verification, fraud detection, data quality management, and marketing solutions. Experian works with businesses in various industries, such as finance, insurance, healthcare, and retail, to help them manage risk, improve customer acquisition and retention, and make data-driven decisions. They also provide individuals with access to their credit reports and scores, as well as tools and resources to help them manage their credit and protect against identity theft.', additional_kwargs={}, example=False),
  HumanMessage(content="What are Experian's main businesses?", additional_kwargs={}, example=False)

In [None]:
# Switch langchain's global debug flag back off.
langchain.debug = False