This notebook explores the whole process of building a LangChain project to detect when a conversation is heading toward an inappropriate place, with the specific aim of protecting young children and adolescents, who can become easy targets for pedophiles.

In [27]:
from dotenv import load_dotenv
import os
from preprocess.clear_data import xml2csv
import pandas as pd
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers.string import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain_qdrant import QdrantVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [11]:
# Read the variables defined in the local .env file into the process environment.
load_dotenv()

# Load the "training" data, which will be the Document Store
DATA_PATH = os.getenv("DATA")
CORPUS_DATA_PATH = os.getenv("CORPUS_DATA")  # xml file
PREDATORS_DATA_PATH = os.getenv("PREDATORS_DATA")  # txt file

# Load the "test" data to test the model
CORPUS_TEST_DATA_PATH = os.getenv("CORPUS_TEST_DATA")  # xml file
PREDATORS_TEST_DATA_PATH = os.getenv("PREDATORS_TEST_DATA")  # txt file

# Keys the APIs expect to find in the environment.
# Re-assigning os.environ[name] = os.getenv(name) does nothing when the key is
# set and raises an opaque TypeError when it is not, so check explicitly and
# report every missing key by name in a single error instead.
REQUIRED_API_KEYS = (
    "GOOGLE_CSE_ID",
    "GOOGLE_API_KEY",
    "OPENAI_API_KEY",
    "LANGCHAIN_API_KEY",
    "QDRANT_API_KEY",
)
missing_api_keys = [name for name in REQUIRED_API_KEYS if os.getenv(name) is None]
if missing_api_keys:
    raise EnvironmentError(
        "Missing required environment variables (define them in your .env file): "
        + ", ".join(missing_api_keys)
    )

os.environ["LANGSMITH_TRACING"] = "true"


The project data can be downloaded from https://pan.webis.de/clef12/pan12-web/sexual-predator-identification.html; please be sure to read all the necessary documentation to understand it in depth. The files were renamed and some were removed, but feel free to explore them and integrate them into the project.

In [None]:
# The XML -> CSV conversion relies on the preprocessing files from
# https://github.com/aaarguel/pan_identification, which turn the xml files
# into a friendlier CSV format with these columns:
#   CONVERSATION_ID, AUTHORS_IDS, IS_ABUSIVE, CONVERSATION_TEXT
# CONVERSATION_TEXT holds all the consecutive messages separated by |

# Training data: export only the abusive cases, which are the ones stored in Qdrant.
xml2csv(
    nameXML=CORPUS_DATA_PATH,
    nameCSV="csv_files/abusive_text.csv",
    predatorsTXT=PREDATORS_DATA_PATH,
    only_abusive=True,
)

# Test data: export every case so the model can be tested on all of them.
xml2csv(
    nameXML=CORPUS_TEST_DATA_PATH,
    nameCSV="csv_files/abusive_text_test.csv",
    predatorsTXT=PREDATORS_TEST_DATA_PATH,
    only_abusive=False,
)

In [5]:
# Read the training CSV with LangChain's CSV document loader.
# The file is ";"-delimited and the column names are supplied explicitly.
csv_columns = ["CONVERSATION_ID", "AUTHORS_IDS", "IS_ABUSIVE", "CONVERSATION_TEXT"]
csv_reader_options = {"delimiter": ";", "fieldnames": csv_columns}

# Content, source and metadata columns are each named explicitly.
loader = CSVLoader(
    file_path="csv_files/abusive_text.csv",
    csv_args=csv_reader_options,
    content_columns=["CONVERSATION_TEXT"],
    source_column="CONVERSATION_ID",
    metadata_columns=["AUTHORS_IDS"],
)

data = loader.load()

In [6]:
# Configure the splitter: chunks of 1000 characters, with a 150-character
# overlap between them.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# `data` (the loaded documents) is the input; `splits` receives the chunked documents.
splits = text_splitter.split_documents(data)

In [30]:
# Use Qdrant to load the data into the vector store, using an embedding function to convert the text into vectors.
qdrant_url = os.getenv("QDRANT_URL")
qdrant_key = os.getenv("QDRANT_API_KEY")
collection_name = "groom_chats"

embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")

# Uploading into an existing collection adds new points, so re-running this cell
# (e.g. Restart & Run All) would embed the whole corpus again and store duplicate
# chunks. Only upload when the collection is not there yet; to rebuild it from
# scratch, delete the collection in Qdrant and run this cell again.
collection_already_loaded = QdrantClient(url=qdrant_url, api_key=qdrant_key).collection_exists(
    collection_name
)

if collection_already_loaded:
    # Nothing to upload: just connect to the stored vectors.
    qdrant = QdrantVectorStore.from_existing_collection(
        embedding=embedding_function,
        collection_name=collection_name,
        url=qdrant_url,
        prefer_grpc=True,
        api_key=qdrant_key,
    )
else:
    # First run: embed every chunk and create the collection.
    qdrant = QdrantVectorStore.from_documents(
        splits,
        embedding_function,
        url=qdrant_url,
        prefer_grpc=True,
        api_key=qdrant_key,
        collection_name=collection_name,
    )

In [31]:
# Uploading the documents is a one-off step (repeat it only when there is more data to add).
# From here on, connect to the collection already stored in Qdrant to test the model.
qdrant = QdrantVectorStore.from_existing_collection(
    url=qdrant_url,
    api_key=qdrant_key,
    collection_name="groom_chats",
    embedding=embedding_function,
)

In [32]:
# Expose the vector store as a retriever so it can be piped into the chain below.
# No arguments are passed, so the library's default search settings apply.
retriever = qdrant.as_retriever()

In [34]:
def join_docs(chunks):
    """Concatenate the text of the given chunks into a single string.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined in order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    blank_line = "\n\n"
    texts = [doc.page_content for doc in chunks]
    return blank_line.join(texts)

In [56]:
# Prompt sent to the chat model. It has two placeholders:
#   {context}  - filled by the chain with the retrieved chunks joined by join_docs
#   {question} - filled by the chain with the conversation passed to invoke()
# The text is runtime input to the model, so edits to it change the model's answers.
PROMPT_TEMPLATE = """
You are an expert on detecting grooming on chat conversations. 
This are some grooming chat examples, keep in mind that conversation 
messages are separated by a | character. 

{context}

---
Taking into account the previous examples, do you identify any grooming behavior 
in the next chat? Answer if the conversation is grooming or not, and
give the literal text that makes you think so.

{question}
"""

# Build the chat prompt object from the template string above.
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [68]:
# Load the OpenAI chat model.
# temperature=0 (instead of 0.8): this is a classification task, so the verdict
# for a given conversation should be as repeatable as possible between runs
# rather than vary with sampling randomness.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Define the chain with the integrated pipeline:
#   - "context": the input is sent to the retriever and the hits are merged by join_docs
#   - "question": the input is passed through unchanged
# The resulting dict fills the prompt, which goes to the model; the reply is
# parsed into a plain string.
chain = (
    {"context": retriever | join_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

Now we should test the model with a test conversation

In [69]:
# Read the ";"-separated test conversations into a DataFrame, naming the columns explicitly.
test_columns = ["CONVERSATION_ID", "AUTHORS_IDS", "IS_ABUSIVE", "CONVERSATION_TEXT"]
df = pd.read_csv(
    "csv_files/abusive_text_test.csv",
    sep=";",
    names=test_columns,
)

In [70]:
# Run the chain on a single test conversation (the row at position 4);
# the model's answer is displayed as the cell output.
sample_conversation = df["CONVERSATION_TEXT"].iloc[4]
chain.invoke(sample_conversation)

'Yes, this conversation exhibits grooming behavior. The phrases "hello there", "how are ya?", "hey", and "where are you from, Stranger" all show an attempt to establish a connection, gather personal information, and potentially manipulate the individual.'