In [30]:
TECH_KEYWORDS = [
    # Programming languages
    "python", "java", "javascript", "typescript", "c++", "golang", "php",
    "matlab", "bash", "bashscript", "next.js", "nextjs", "vue", "svelte",
    "nodejs", "nestjs", "fastapi",
    "django", ".net", "asp.net",
    "laravel", "pandas", "numpy", "scikit", "sklearn",
    "tensorflow", "pytorch", "keras", "xgboost", "sql", "mysql", "postgres", "postgresql", "sqlite",
    "mongodb", "redis", "cassandra", "neo4j", "dynamodb", "snowflake", 
    "api", "rest api", "graphql",
    "json", "xml", "oauth", "jwt", "cors", "websocket", "dns",
    "docker", "dockerfile", "kubernetes", "k8s", "helm",
    "aws", "azure", "github actions", "gitlab ci",
    "git", "github", "gitlab", "bitbucket",
    "vscode", "visual studio", "jupyter", "pip", "conda",
    "venv", "requirements.txt", "environment.yml", "segfault",
    "syntaxerror", "indexerror", "keyerror",
    "importerror", "null pointer", "for loop", "while loop", "if else", 
    "regex", "hashmap", "encryption", "hash", "bcrypt", "sql injection", "reactjs", "react.js", "angularjs", "vuejs",
    "redux", "nextauth", "nuxt", "vite", "fastapi app", "expressjs", "spring mvc",
    "hibernate", "typeorm", "sequelize", "prisma orm",
    "opencv", "lightgbm", "catboost", "huggingface",
    "transformers library", "scipy", "statsmodels", "binary search tree", "linked list",
    "depth first search", "breadth first search", "dijkstra algorithm", "dynamic programming", "big o notation", "time complexity analysis",
    "space complexity", "multithreading", "asynchronous programming", "event loop"
]

In [31]:
from datasets import load_from_disk

ds = load_from_disk("wildchat_local")
train_ds = ds["train"]

train_ds

Dataset({
    features: ['conversation_hash', 'model', 'timestamp', 'conversation', 'turn', 'language', 'openai_moderation', 'detoxify_moderation', 'toxic', 'redacted', 'state', 'country', 'hashed_ip', 'header'],
    num_rows: 837989
})

In [32]:
def extract_first_user_prompt(example):
    for turn in example["conversation"]:
        if turn["role"] == "user":
            return {
                "conversation_hash": example["conversation_hash"],
                "prompt": turn["content"]
            }
    return {
        "conversation_hash": example["conversation_hash"],
        "prompt": ""
    }

prompts_ds = train_ds.map(extract_first_user_prompt)

Map:   0%|          | 0/837989 [00:00<?, ? examples/s]

In [33]:
prompts_ds = prompts_ds.remove_columns(
    [col for col in prompts_ds.column_names if col not in ["conversation_hash", "prompt"]]
)

prompts_ds

Dataset({
    features: ['conversation_hash', 'prompt'],
    num_rows: 837989
})

In [34]:
import re

def is_technical_prompt(text):
    if not text or not isinstance(text, str):
        return False
    
    text = text.lower()
    
    for keyword in TECH_KEYWORDS:
        pattern = r"\b" + re.escape(keyword) + r"\b"
        if re.search(pattern, text):
            return True
    
    return False

In [35]:
technical_ds = prompts_ds.filter(
    lambda x: is_technical_prompt(x["prompt"])
)

Filter:   0%|          | 0/837989 [00:00<?, ? examples/s]

In [36]:
technical_sample_ds = technical_ds.shuffle(seed=42).select(range(1000))

In [37]:
technical_df = technical_sample_ds.to_pandas()
technical_df.shape
technical_df   

Unnamed: 0,conversation_hash,prompt
0,47ff5c1db22c05e6b795d5dd0db7114a,## 任务描述\n\n作为PPT专家，你的任务是根据要求概述，遵循PPT设计规范，生成包含对...
1,09d34c9250e8139cae3a7da7d2ecb755,"Напишите программу Python, которая считает кол..."
2,3306065b61ef17582b2b07d5c481c88e,Как заменить const { userAuthorization } from ...
3,198e3999a0bc37829e1539b15881da06,<html lang=“ru”>\n<head>\n<meta charset=“UTF-8...
4,6372c4ee68adc642cdcdc8d584b18fd5,"Montos tiene las columnas [IDSOLICITUD,\tFECHA..."
...,...,...
995,9fb2918959550f19334e415652c2e8b0,"<?php \n$connection_link = new mysqli(""localho..."
996,6b569b277c85a1a78cd94cd850703bee,"检查下面的英文语法To solve this problem, this paper is ..."
997,057df01d711ad60b23fdac5d48e98f74,convert this python code into javascript. I wa...
998,654c64b9c5a50ea64708e1b47cb7df30,"write an article comparing SQS, SNS, Kinesis, ..."


In [49]:
technical_df["label"] = "Technical"
#removing unwanted columns
technical_df = technical_df.drop(columns=["label_id"], errors="ignore")

print(technical_df.columns)

Index(['conversation_hash', 'prompt', 'label'], dtype='object')


In [50]:
technical_df.to_csv(
    "rq1_technical_prompts_1000.csv",
    index=False
)

In [52]:
import pandas as pd

manual_df = pd.read_csv("rq1_prompt_labels.csv")
auto_tech_df = pd.read_csv("rq1_technical_prompts_1000.csv")

combined_df = pd.concat([manual_df, auto_tech_df], ignore_index=True)

combined_df.to_csv("rq1_final_prompts_labels.csv", index=False)