In [18]:
# Put the parent directory at the front of the module search path so that
# imports are resolved against it before any other location.
# NOTE(review): '..' is a relative path, so it is interpreted against the
# process's working directory — presumably the notebook's own folder; confirm.
import sys
sys.path.insert(0, '..')

# Remove near-duplicates

In [74]:
import json
from rapidfuzz import fuzz

from collections import OrderedDict
from typing import Dict, Any

def replace_escaped_quotes(input_string):
    """Return ``input_string`` with every backslash-escaped double quote
    (the two characters ``\\`` and ``"``) replaced by one single quote.

    Unescaped double quotes and all other characters are left untouched.
    """
    escaped_double_quote = '\\"'
    single_quote = "'"
    return input_string.replace(escaped_double_quote, single_quote)

def deduplicate_with_fuzzy(entries, threshold):
    """Filter out entries whose question closely matches an earlier kept one.

    Entries are visited in order. Each question is compared, via
    ``fuzz.ratio``, with the questions kept so far; the first kept question
    scoring strictly above ``threshold`` marks the entry as a duplicate, which
    is reported on stdout and skipped. A three-line summary is printed at the
    end.

    Args:
        entries: Sized iterable of dicts, each holding a ``"question"`` key.
        threshold: Score that ``fuzz.ratio`` must exceed for two questions to
            count as duplicates.

    Returns:
        list: The kept entries, in the order they were first seen.
    """
    n_total = len(entries)
    n_duplicates = 0
    kept = OrderedDict()  # question text -> entry, in insertion order

    for candidate in entries:
        text = candidate["question"]
        for kept_text in kept:
            if fuzz.ratio(text, kept_text) > threshold:
                n_duplicates += 1
                print(f"Duplicate entry found: '{text}' duplicates '{kept_text}'")
                break
        else:
            # No kept question was similar enough: retain this entry.
            kept[text] = candidate

    print(f"Total entries: {n_total}")
    print(f"Total duplicates: {n_duplicates}")
    print(f"Total non-duplicates entries: {len(kept)}")

    return list(kept.values())

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import OrderedDict

def deduplicate_with_TfidfVectorizer(entries, threshold):
    """Drop entries whose question is a TF-IDF near-duplicate of a later entry.

    All questions are vectorised with ``TfidfVectorizer`` and compared pairwise
    by cosine similarity. An entry is discarded as soon as one entry that comes
    *after* it in ``entries`` scores strictly above ``threshold``; among a set
    of mutually similar questions the latest one is therefore the one kept.
    Each discarded pair and a three-line summary are printed to stdout.

    Args:
        entries: Sequence of dicts, each holding a ``"question"`` key.
        threshold: Cosine-similarity cut-off; pairs scoring strictly above it
            count as duplicates.

    Returns:
        list: The retained entries in their original relative order. Survivors
        are keyed by question text, so retained entries with an identical
        question string collapse into a single item.
    """
    # Using OrderedDict to preserve the order
    deduped_entries = OrderedDict()
    duplicate_count = 0
    total_lines = len(entries)

    if total_lines >= 2:
        lines = [entry['question'] for entry in entries]

        # Calculate TF-IDF vectors for the lines
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(lines)

        # Compute cosine similarity between the vectors
        similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
    else:
        # Nothing to compare with zero or one entry. Skipping the vectoriser
        # also avoids its "empty vocabulary" ValueError on an empty corpus.
        # The matrix is never indexed because the inner loop below is empty.
        similarities = None

    for i, entry in enumerate(entries):
        duplicate_found = False
        # Only look forward: pairs (j, i) with j < i were already examined.
        for j in range(i + 1, total_lines):
            if similarities[i][j] > threshold:
                print(f"Duplicate entry found: '{entry['question']}' duplicates '{entries[j]['question']}'")
                duplicate_found = True
                duplicate_count += 1
                break

        if not duplicate_found:
            deduped_entries[entry['question']] = entry

    non_duplicate_count = len(deduped_entries)

    print(f"Total entries: {total_lines}")
    print(f"Total duplicates: {duplicate_count}")
    print(f"Total non-duplicates: {non_duplicate_count}")

    return list(deduped_entries.values())

In [77]:
# Usage
json_file = '../data/questions.json'  # dataset to deduplicate; overwritten in place when save_to_file is True
fuzzy_threshold = 98  # fuzz.ratio score above which two questions count as duplicates
tfidf_threshold = 0.85  # TF-IDF cosine similarity above which two questions count as duplicates

save_to_file = True  # write the deduplicated entries back to json_file

# Read as UTF-8 explicitly: the file is written below with ensure_ascii=False,
# so it may contain raw non-ASCII characters that a platform-default codec
# could mis-decode or reject.
with open(json_file, 'r', encoding='utf-8') as f:
    entries = json.load(f)

# Normalise escaped double quotes before comparing questions.
for entry in entries:
    entry['question'] = replace_escaped_quotes(entry['question'])

# Chain the two passes: the TF-IDF pass must consume the output of the fuzzy
# pass. Feeding it the raw `entries` again would discard the fuzzy result.
deduped_lines = deduplicate_with_fuzzy(entries, fuzzy_threshold)
deduped_lines = deduplicate_with_TfidfVectorizer(deduped_lines, tfidf_threshold)

if save_to_file:
    with open(json_file, 'w', encoding='utf-8') as f:
        # Use json.dump with indent parameter to format the output
        json.dump(deduped_lines, f, indent=1, ensure_ascii=False)

Total entries: 5096
Total duplicates: 0
Total unique entries: 5096
Duplicate entry found: 'What do you hope to achieve during this new phase of your life?' duplicates 'What do you hope unicorn to achieve during this new phase of your life?'
Total entries: 5096
Total duplicates: 1
Total non-duplicates: 5095
Percentage of duplicates: 0.019623233908948195%


# Convert Internal JSON dataset to CSV

In [79]:
import json
import pandas as pd

def convert_json_to_csv(json_file, csv_file):
    """Export the ``question`` and ``tags`` fields of a JSON dataset as CSV.

    Args:
        json_file: Path of the JSON file to read. Its content is passed to
            ``pd.DataFrame`` (presumably a list of dicts carrying these keys —
            confirm against the dataset).
        csv_file: Path of the CSV file to write.

    Only the ``question`` and ``tags`` columns are exported, in that order, and
    the DataFrame index is not written.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default,
    # so non-ASCII text in the dataset is read the same way on every system.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame(data, columns=["question", "tags"])
    df.to_csv(csv_file, index=False, encoding='utf-8')

# Usage: export the question dataset as CSV.
json_file = '../data/questions.json'  # JSON file to read
csv_file = '../data/questions.csv'  # CSV file to write

convert_json_to_csv(json_file=json_file, csv_file=csv_file)