### Imports

In [1]:
import pandas as pd
import getpass
import os
import json
import time
import requests
from tqdm import tqdm

from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt interactively for each API key that is missing (or empty) in the
# environment, so credentials never need to be written into the notebook.
for env_var, prompt_text in (
    ("TAVILY_API_KEY", "Tavily API key:\n"),
    ("GROQ_API_KEY", "Groq API key:\n"),
):
    if not os.environ.get(env_var):
        os.environ[env_var] = getpass.getpass(prompt_text)



### Read CSV Data

In [2]:
# The classified-terms CSV sits in a data/ directory one level above the
# notebook's working directory.
csv_path = os.path.join(os.pardir, 'data', 'terms_classified.csv')
words_df = pd.read_csv(csv_path)

# Only the first 20 entries of the 'word' column are run through the pipeline.
terms = words_df['word'].iloc[:20]

# Preview the first three rows of the loaded table.
words_df.head(3)


Unnamed: 0,word,label_round1,label_round2,label_round3,label_round4,label_round5,label_round6,sum,Hand Label,Word Etymology,Geographic,Color Code,gemma_label
0,aangezien,1,0,1,1,1,0,4,Dutch word for 'since',,,Recognized Word,4.0
1,alaqsa,0,1,1,1,1,0,4,Name of mosque,,,Recognized word - proper noun/foreign word not...,0.0
2,alassad,0,1,1,1,1,0,4,Name of Syrian Politicion,,,Unrecognized word,1.0


### Setup

In [5]:
# Web search tool used to collect context for each term. No key is passed
# here; the notebook sets TAVILY_API_KEY / GROQ_API_KEY in os.environ earlier.
search = TavilySearchResults()

# temperature=0 makes both models as deterministic as the API allows, so that
# re-running the notebook yields the same summaries and labels for the same
# search results instead of sampling a different answer each time.
summarizer_llm = ChatGroq(model="llama3-8b-8192", temperature=0)   # smaller model: condenses search hits
classifier_llm = ChatGroq(model="llama3-70b-8192", temperature=0)  # larger model: assigns the 0-4 label

### Prompts

In [6]:
# Template text for condensing raw search content into a short description.
# Its only placeholder is {content}.
SUMMARIZATION_TEMPLATE = """
Summarize the following content in 1–2 concise sentences to explain what the term is or how it's used. Focus on social or cultural meaning, not dictionary definitions.

Content:
{content}

Summary:
"""

summarization_prompt = ChatPromptTemplate.from_template(SUMMARIZATION_TEMPLATE)

# Template text for the labelling step. Placeholders: {term} and {summary}.
# The model is asked to reply with a single digit from 0 to 4.
CLASSIFICATION_TEMPLATE = """
You are a researcher analyzing terms from a fringe online community.
Your job is to classify whether each term is part of everyday language or specific to that space.

Use the following definitions:

0 — **Recognized Word**: A common English word or phrase used in everyday language (e.g. "apple", "dog", "table").
1 — **Recognized Proper Noun or Foreign Word**: A proper name, place name, or foreign-language term that is not typically used in casual English conversations but is verifiable through reliable sources (e.g. "al-Assad", "Qatar", "al-Bab").
2 — **Recognized Slur**: A known derogatory term or slur. These are offensive and not typically used in respectful everyday speech (e.g. racial or ethnic slurs).
3 — **Unrecognized Word**: A word that appears to be made up, has no clear definition, or is only found in obscure internet forums.
4 — **Unsure**: If you cannot tell what the word means or there is not enough information.

Based on the internet summary provided, classify the term accordingly.

Term: {term}

Web Summary:
{summary}

Answer with only the classification number (0, 1, 2, 3, or 4), no explanation.
"""

classification_prompt = ChatPromptTemplate.from_template(CLASSIFICATION_TEMPLATE)

# Each chain is prompt -> model -> plain-text reply. The parser keeps no state,
# so a single instance is shared by both chains.
to_text = StrOutputParser()
summarize_chain = summarization_prompt | summarizer_llm | to_text
classify_chain = classification_prompt | classifier_llm | to_text


In [7]:
# Accumulators filled by the classification loop: one summary and one label
# per processed term, in the same order as the terms.
summaries, classifications = [], []

In [8]:
print("Classifying terms using Tavily + Groq...")

# Start from empty lists so that re-running this cell replaces the previous
# results instead of appending a second batch to them.
summaries = []
classifications = []

for term in tqdm(terms):
    try:
        # --- Tavily Search ---
        search_result = search.invoke({"query": f"What is '{term}' on 4chan?"})

        # Use the text of the first three hits. Anything that is not a list of
        # dicts with a 'content' key contributes nothing.
        docs = search_result if isinstance(search_result, list) else []
        top_contents = " ".join(
            doc['content']
            for doc in docs[:3]
            if isinstance(doc, dict) and 'content' in doc
        )
        if not top_contents.strip():
            # Summarizing an empty string would only produce an invented
            # answer; treat "nothing found" as a failure for this term.
            raise ValueError("search returned no usable content")

        # --- Summarize ---
        summary_text = summarize_chain.invoke({"content": top_contents})

        # --- Classify ---
        classification = classify_chain.invoke({"term": term, "summary": summary_text}).strip()

    except Exception as e:
        # Best effort: report the problem and record placeholder values so the
        # run continues with the next term.
        print(f"Error for term '{term}': {e}")
        summary_text = "ERROR"
        classification = "4"  # default to Unsure

    # Append exactly once per term, outside the try block, so both lists stay
    # aligned with `terms` even when a failure happens after the summary step.
    summaries.append(summary_text)
    classifications.append(classification)

    # --- Pause to respect rate limits ---
    # Runs after failures as well, so an error never triggers an immediate
    # follow-up request.
    time.sleep(2.5)

Classifying terms using Tavily + Groq...


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [01:44<00:00,  5.22s/it]


In [9]:
# Sanity check: the three sequences need the same length before they are
# combined into one table.
for label, values in (
    ("terms", terms),
    ("classifications", classifications),
    ("summaries", summaries),
):
    print(f"{label}: {len(values)}")

terms: 20
classifications: 20
summaries: 20


In [12]:
# Show complete cell text so long summaries are not cut off in the display.
pd.set_option('display.max_colwidth', None)

# One row per term: the model's label next to the summary it was based on.
result_columns = {
    'term': terms,
    'groq_label': classifications,
    'tavily_summary': summaries,
}
results_df = pd.DataFrame(result_columns)
results_df

Unnamed: 0,term,groq_label,tavily_summary
0,aangezien,1,"Here is a concise summary of the content:\n\n4chan is a website that embodies the concept of ""true freedom of speech"" by allowing users to post anonymously without fear of consequences, leading to a culture of anger, resentment, and hurtful behavior. This unrestricted environment contributes to the website's notorious reputation, with users often acting rudely and condescendingly."
1,alaqsa,1,"Here is a concise summary of the term ""fakeness"" in this context:\n\nIn this social media discussion, ""fakeness"" refers to the idea that something is not authentic or genuine, often used to describe a false or fabricated claim, image, or message. The term is used to express skepticism and doubt about the credibility of information, particularly in the context of online communities like 4chan."
2,alassad,1,"Here is a concise summary of the term ""Assad Must Go"" in 1-2 sentences:\n\n""Assad Must Go"" is a meme template that uses an image of Syrian dictator Bashar al-Assad and another political figure, often making fun of the international community's failed attempts to oust him from power. The meme pokes fun at the irony of the situation, where leaders who call for Assad's removal often end up being the ones who lose their own power or reputation."
3,albab,1,"Here is a summary of the content in 1-2 concise sentences:\n\n4chan is an imageboard where users, known as Original Posters (OPs), can create threads by posting images and messages, often anonymously, and others can respond with their own messages and images. The platform is known for its anonymity, ephemerality, and often controversial content, which has given rise to a unique culture and community."
4,albaghdadi,1,"Here is a concise summary of the content in 1-2 sentences:\n\nThe term ""virgins"" in the Quran is often used to describe the rewards of sexual paradise in Islamic teaching, but in the context of ISIS and extremist online discourse, it takes on a different meaning, referring to sexual demons instead. The online debate around this issue highlights the blurred lines between real-world militancy and online ""fanboy"" behavior, with social media becoming a battleground for ideological influence and propaganda."
5,allfather,1,"The term ""Allfather"" is a prestigious title used to refer to a divine being, but its meaning may not be as straightforward as it seems. It's possible that the Old Norse word ""föðr"" being translated as ""Allfather"" is actually related to the concept of ""all-orderer"" or ""all-arranger"" rather than simply meaning ""father"" of everything."
6,altleft,3,"Here is a summary of the content in 1-2 concise sentences:\n\n/pol/ refers to a political persuasion on the internet forum 4chan that is characterized by racist, sexist, and neo-Nazi beliefs, and is often associated with the alt-right movement. The term has been used to describe a wider cultural phenomenon of far-right ideologies and online hate speech that emerged on 4chan and other platforms in the mid-2010s."
7,anonkun,3,"Here is a summary of the content in 1-2 concise sentences:\n\nAnon-kun refers to a community of anonymous users on various online platforms, including 4chan and spin-off sites like fiction.live, who create and engage with fictional stories, often with mature themes. The term ""anon-kun"" is used as a substitute for a name, and is often associated with quests, a type of collaborative storytelling."
8,anticlinton,0,"Here is a concise summary of the content:\n\n4chan is an online forum where users can share and discuss content, often anonymously. It has been associated with the creation and dissemination of memes and propaganda, particularly during the 2016 US Presidential election, where users organized to spread pro-Trump and anti-Clinton messages online. The forum's users are often characterized as a mix of intellectuals and ""absolute idiots"", with some users creating fake personas to spread extremist content."
9,antimasker,0,"The term ""anti-masker"" is used to describe individuals who refuse to wear face masks, particularly in response to government health mandates, often citing concerns about personal freedoms or the effectiveness of masks. This term is often associated with a broader movement that combines anti-mask sentiment with far-right ideologies, conspiracy theories, and religious beliefs, as seen in the QAnon movement."
