In [103]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

You should consider upgrading via the '/Users/lene/Documents/repos/paper_helper/venv/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/lene/Documents/repos/paper_helper/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [104]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [105]:
load_dotenv()

True

In [106]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Define LLM

In [107]:
llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY)
llm.invoke("Tell me a joke about cats")

AIMessage(content="Sure, here's a cat joke for you:\n\nWhy was the cat sitting on the computer?\n\nBecause it wanted to keep an eye on the mouse! 🐱💻🖱️", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 13, 'total_tokens': 51, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_5796ac6771', 'finish_reason': 'stop', 'logprobs': None}, id='run-dac87a3a-2490-447a-95b5-505e60c6b925-0', usage_metadata={'input_tokens': 13, 'output_tokens': 38, 'total_tokens': 51})

## Process PDF document

### Load PDF document

In [108]:
loader = PyPDFLoader("data/Wang_et_al.pdf")
pages = loader.load()
pages

[Document(metadata={'source': 'data/Wang_et_al.pdf', 'page': 0}, page_content='A DQN-based Internet Financial Fraud Transaction Detection\nMethod\nXiaoguo Wang\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai,, China,\nxiaoguowang@tongji.edu.cnZeguo Wan*\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai, China,\n1933042@\ntongji.edu.cnYin Zhang\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai, China\n1930790@tongji.edu.cn\nABSTRACT\nThe anti-fraud issue of Internet finance is a hot research topic in\nthe industry. Aiming at the complex fraud problem of Internet\nfinance, this paper proposes a fraudulent transaction detection\nmethod based on Deep Q Learning, and constructs a feasible elec-\ntronic transaction fraud detection model. Based on reinforcement\nlearning, this method makes the agent learn classification strategies,\nbuilds the environment with RFM model, and uses SmoothL1 as t

### Split document

In [109]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

In [110]:
chunks

[Document(metadata={'source': 'data/Wang_et_al.pdf', 'page': 0}, page_content='A DQN-based Internet Financial Fraud Transaction Detection\nMethod\nXiaoguo Wang\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai,, China,\nxiaoguowang@tongji.edu.cnZeguo Wan*\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai, China,\n1933042@\ntongji.edu.cnYin Zhang\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai, China\n1930790@tongji.edu.cn\nABSTRACT\nThe anti-fraud issue of Internet finance is a hot research topic in\nthe industry. Aiming at the complex fraud problem of Internet\nfinance, this paper proposes a fraudulent transaction detection\nmethod based on Deep Q Learning, and constructs a feasible elec-\ntronic transaction fraud detection model. Based on reinforcement\nlearning, this method makes the agent learn classification strategies,\nbuilds the environment with RFM model, and uses SmoothL1 as t

### Create embeddings

In [111]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()


In [112]:
test_vector = embedding_function.embed_query("cat")
test_vector

[-0.007135485298931599,
 -0.017439933493733406,
 -0.009677036665380001,
 -0.030711298808455467,
 -0.0125340661033988,
 0.00314592313952744,
 -0.005015753209590912,
 -0.0411469005048275,
 -0.014561636373400688,
 -0.021310748532414436,
 0.01918392814695835,
 0.05078849196434021,
 -0.0012468489585444331,
 0.0025557302869856358,
 -0.038424570113420486,
 -0.0061004324816167355,
 0.03544702008366585,
 -0.004601023159921169,
 0.0023873569443821907,
 -0.013455688953399658,
 -0.018928708508610725,
 0.008989364840090275,
 0.015795191749930382,
 -0.008734146133065224,
 -0.014618351124227047,
 0.007114217150956392,
 0.013115397654473782,
 -0.013335169292986393,
 0.0029988179448992014,
 0.004849152639508247,
 0.004051594529300928,
 -0.01680188812315464,
 -0.015781013295054436,
 -0.04304686188697815,
 -0.027166597545146942,
 -0.0042784553952515125,
 0.007961400784552097,
 -0.009903897531330585,
 0.02201968990266323,
 -0.009010632522404194,
 0.004863331094384193,
 0.00025477545568719506,
 -0.01210161

In [113]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

In [114]:
# Test the evaluator
evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

{'score': 0.17454945643404907}

In [115]:
evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

{'score': 0.22430796993904023}

### Create vector database

In [116]:
print(f"Creating vectorstore with {len(chunks)} chunks")

Creating vectorstore with 21 chunks


In [118]:
import tempfile
import atexit
import shutil
from langchain_community.vectorstores import Chroma

# Create a temporary directory that will persist for the session
temp_dir = tempfile.mkdtemp()

# Function to clean up the temporary directory when the session ends
def cleanup_temp_dir():
    print(f"Cleaning up temporary directory: {temp_dir}")
    shutil.rmtree(temp_dir, ignore_errors=True)

# Register the cleanup function to be called when the session ends
atexit.register(cleanup_temp_dir)

def create_vectorstore_temp(chunks, embedding_function):
    print(f"Starting create_vectorstore with {len(chunks)} chunks")
    print(f"Using temporary directory: {temp_dir}")
    
    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    print(f"Generated {len(ids)} unique ids")
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:       
            unique_ids.add(id)
            unique_chunks.append(chunk) 
    
    print(f"Filtered to {len(unique_chunks)} unique chunks")

    # Create a new Chroma database from the documents
    print("Creating Chroma vectorstore...")
    vectorstore = Chroma.from_documents(
        documents=unique_chunks, 
        ids=list(unique_ids),
        embedding=embedding_function, 
        persist_directory=temp_dir
    )
    
    print(f"Vectorstore created with {len(unique_chunks)} documents")
    
    # Verify the contents of the vectorstore
    all_docs = vectorstore.get()
    print(f"Vectorstore contains {len(all_docs.get('documents', []))} documents")
    
    return vectorstore

# Use the function
vectorstore = create_vectorstore_temp(chunks=chunks, embedding_function=embedding_function)

# Print the location of the temporary directory
print(f"Temporary directory location: {temp_dir}")

Starting create_vectorstore with 21 chunks
Using temporary directory: /var/folders/wf/1y72rxhs35jb1qx_q7lq42xr0000gn/T/tmpdbjw4l0z
Generated 21 unique ids
Filtered to 21 unique chunks
Creating Chroma vectorstore...
Vectorstore created with 21 documents
Vectorstore contains 21 documents
Temporary directory location: /var/folders/wf/1y72rxhs35jb1qx_q7lq42xr0000gn/T/tmpdbjw4l0z


In [119]:
# Verify vectorstore contents
print(f"Number of documents in vectorstore: {vectorstore._collection.count()}")
all_docs = vectorstore.get()
print(f"Total documents retrieved: {len(all_docs.get('documents', []))}")

# Print some document contents
for i, doc in enumerate(all_docs.get('documents', [])[:3]):
    print(f"Document {i+1}:")
    print(doc[:200])
    print("---")

Number of documents in vectorstore: 21
Total documents retrieved: 21
Document 1:
A DQN-based Internet Financial Fraud Transaction Detection
Method
Xiaoguo Wang
College of Electronics and
Information Engineering, Tongji
University, Shanghai,, China,
xiaoguowang@tongji.edu.cnZeguo W
---
Document 2:
•Computing methodologies ;•Machine learning ;•Learning
paradigms ;•Reinforcement learning ;
KEYWORDS
Internet finance, Fraudulent transaction detection, Electronic trans-
action
ACM Reference Format:

---
Document 3:
on the first page. Copyrights for components of this work owned by others than ACM
must be honored. Abstracting with credit is permitted. To copy otherwise, or republish,
to post on servers or to redi
---


## Query for relevant data

In [120]:
# Create retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Try querying
query = "What is the title of the paper?"
relevant_chunks = retriever.invoke(query)
print(f"Number of relevant chunks: {len(relevant_chunks)}")
for i, chunk in enumerate(relevant_chunks):
    print(f"Relevant chunk {i+1}:")
    print(chunk.page_content[:200])
    print("---")

Number of relevant chunks: 3
Relevant chunk 1:
A DQN-based Internet Financial Fraud Transaction Detection
Method
Xiaoguo Wang
College of Electronics and
Information Engineering, Tongji
University, Shanghai,, China,
xiaoguowang@tongji.edu.cnZeguo W
---
Relevant chunk 2:
[10] Juliet M, Jonah K. (2018). Credit Card Fraud Detection using Bayes Theorem.
International Journal of Computer and Information Technology, 7(4).
[11] Zhang W, Ntoutsi E. (2019). FAHT: An Adaptive 
---
Relevant chunk 3:
Journal of Machine Learning Research, 15, 315-323.
[18] Krizhevsky A, Sutskever I, Hinton G E. (2017). ImageNet Classification with Deep
Convolutional Neural Networks. Communications of the ACM, 60(6)
---


In [121]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## Generate responses

In [122]:
# Concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="What is the title of the paper?")
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

A DQN-based Internet Financial Fraud Transaction Detection
Method
Xiaoguo Wang
College of Electronics and
Information Engineering, Tongji
University, Shanghai,, China,
xiaoguowang@tongji.edu.cnZeguo Wan*
College of Electronics and
Information Engineering, Tongji
University, Shanghai, China,
1933042@
tongji.edu.cnYin Zhang
College of Electronics and
Information Engineering, Tongji
University, Shanghai, China
1930790@tongji.edu.cn
ABSTRACT
The anti-fraud issue of Internet finance is a hot research topic in
the industry. Aiming at the complex fraud problem of Internet
finance, this paper proposes a fraudulent transaction detection
method based on Deep Q Learning, and constructs a feasible elec-
tronic transaction fraud detection model. Based on reinforcement
learning, this method mak

In [123]:
llm.invoke(prompt)

AIMessage(content='The title of the paper is "A DQN-based Internet Financial Fraud Transaction Detection Method."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 1281, 'total_tokens': 1299, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_c17d3befe7', 'finish_reason': 'stop', 'logprobs': None}, id='run-bac8e2e7-c6a8-47d3-9bf5-eeb8076812ac-0', usage_metadata={'input_tokens': 1281, 'output_tokens': 18, 'total_tokens': 1299})

### Using Langchain Expression Language

In [124]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What's the title of this paper?")

AIMessage(content='The title of the paper is "A DQN-based Internet Financial Fraud Transaction Detection Method."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 1276, 'total_tokens': 1294, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_5796ac6771', 'finish_reason': 'stop', 'logprobs': None}, id='run-a9a1ebfa-800e-4dd8-a264-e85295b4c008-0', usage_metadata={'input_tokens': 1276, 'output_tokens': 18, 'total_tokens': 1294})

### Generate structured responses

#### Pydantic classes

In [125]:
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources
    paper_models: AnswerWithSources

#### Langchain chain

In [129]:
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain.invoke("What is the title, publication date, authors, and models studied in the research paper?")

ExtractedInfo(paper_title=AnswerWithSources(answer='A DQN-based Internet Financial Fraud Transaction Detection Method', sources='Xiaoguo Wang\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai,, China,\nxiaoguowang@tongji.edu.cnZeguo Wan*\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai, China,\n1933042@\ntongji.edu.cnYin Zhang\nCollege of Electronics and\nInformation Engineering, Tongji\nUniversity, Shanghai, China\n1930790@tongji.edu.cn\nABSTRACT\nThe anti-fraud issue of Internet finance is a hot research topic in\nthe industry. Aiming at the complex fraud problem of Internet\nfinance, this paper proposes a fraudulent transaction detection\nmethod based on Deep Q Learning, and constructs a feasible elec-\ntronic transaction fraud detection model.', reasoning='The title of the paper is explicitly stated in the abstract section of the provided text.'), paper_summary=AnswerWithSources(answer='The paper proposes a fraudule

### Transform response into a dataframe

In [132]:
structured_response = rag_chain.invoke("What is the title, publication date, authors, and models studied in the research paper?")
df = pd.DataFrame([structured_response.dict()])

# Transforming into a table with two rows: 'answer' and 'source'
answer_row = []
source_row = []
reasoning_row = []

for col in df.columns:
    answer_row.append(df[col][0]['answer'])
    source_row.append(df[col][0]['sources'])
    reasoning_row.append(df[col][0]['reasoning'])

# Create new dataframe with two rows: 'answer' and 'source'
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row], columns=df.columns, index=['answer', 'source', 'reasoning'])


In [133]:
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors,paper_models
answer,A DQN-based Internet Financial Fraud Transacti...,The paper proposes a fraudulent transaction de...,2021,"Xiaoguo Wang, Zeguo Wan, Yin Zhang",RFM model
source,The title of the paper is clearly stated at th...,The abstract provides a concise summary of the...,The publication year is mentioned in the confe...,The authors of the paper are listed in the abs...,The model used in the research is mentioned in...
reasoning,The title of the paper is clearly stated at th...,The abstract provides a concise description of...,The publication year is explicitly mentioned i...,The authors are explicitly listed with their a...,The abstract specifies that the environment fo...
