In [1]:
import json
import pandas as pd
import numpy as np

# Load the datasets
data_path = 'data/'

# Load train data: one row per NewsCLIPpings annotation, numbered by position.
train_json_path = data_path + "news_clippings/data/news_clippings/data/merged_balanced/train.json"
with open(train_json_path) as train_file:  # context manager closes the handle
    train_data = pd.DataFrame(json.load(train_file)["annotations"])
train_data.insert(0, 'new_clipping_id', range(0, len(train_data)))
# Rename the second column (the first annotation field) to 'article_id'.
# Assigning a full list of names is the supported way to rename by position;
# writing into `columns.values` mutates the backing array of an immutable Index.
train_columns = list(train_data.columns)
train_columns[1] = 'article_id'
train_data.columns = train_columns


# Load VisualNews data, keeping only the columns needed for the joins below.
# data_path already ends with '/', so no leading slash is added here.
with open(data_path + 'VisualNews/origin/data.json') as vn_file:
    vn_data = pd.DataFrame(json.load(vn_file))
vn_data = vn_data[['id', 'image_path', 'article_path']]


# Load source evidence paths: the JSON is keyed by item index, so transpose to
# get one row per item and expose the key as an integer 'match_index' column.
SOURCE_EVIDENCE_PATH = 'data/news_clippings/queries_dataset'

with open(SOURCE_EVIDENCE_PATH + '/dataset_items_train.json') as evidence_file:
    train_paths = pd.DataFrame(json.load(evidence_file)).transpose()
train_paths = train_paths.reset_index().rename(columns={'index': 'match_index'})
train_paths['match_index'] = train_paths['match_index'].astype(int)


# Inner join: only annotations that have an evidence entry are kept.
merged_train_data = pd.merge(train_data, train_paths, left_on='new_clipping_id', right_on='match_index')

# Attach the VisualNews paths of the caption's article (matched on article_id).
merged_with_article_data = pd.merge(merged_train_data, vn_data, left_on='article_id', right_on='id', how='left')
merged_with_article_data = merged_with_article_data.rename(columns={'image_path': 'article_id_image_path', 'article_path': 'article_id_article_path'})

# Merge the resulting data with vn_data on image_id to get article_path and image_path.
# Both sides now carry an 'id' column, so pandas suffixes them as 'id_x' / 'id_y'.
final_merged_data = pd.merge(merged_with_article_data, vn_data, left_on='image_id', right_on='id', how='left')
final_merged_data = final_merged_data.rename(columns={'image_path': 'image_id_image_path', 'article_path': 'image_id_article_path'})


### Keep only the first tenth of the merged rows (i.e. drop 9/10 of the data).
num_entries_to_keep = len(final_merged_data) // 10
subset_final_merged_data = final_merged_data.head(num_entries_to_keep)
# Highest annotation index that survives; later cells truncate the on-disk files to it.
# NOTE(review): this is NaN when fewer than 10 merged rows exist — confirm inputs are large enough.
last_new_clipping_id = subset_final_merged_data['new_clipping_id'].max()

In [7]:
### Update news_clippings

# File being rewritten (relative to the working directory):
# data/news_clippings/data/news_clippings/data/merged_balanced/train.json
train_annotations_path = data_path + "news_clippings/data/news_clippings/data/merged_balanced/train.json"

with open(train_annotations_path, 'r') as file:
    data = json.load(file)

# Keep the annotations up to and including position last_new_clipping_id;
# everything after that position is dropped.
cutoff = last_new_clipping_id + 1
data['annotations'] = data['annotations'][:cutoff]

# Overwrite the same file with the shortened annotation list.
with open(train_annotations_path, 'w') as file:
    json.dump(data, file)

print("Annotations after the specified id have been deleted.")

Annotations after the specified id have been deleted.


In [2]:
def load_and_merge_data(data_path, dataset_type):
    """Load one NewsCLIPpings split and attach VisualNews paths to each row.

    Args:
        data_path: Root data directory; the split is read from
            ``{data_path}/news_clippings/data/news_clippings/data/merged_balanced/``
            and the VisualNews index from ``{data_path}/VisualNews/origin/data.json``.
        dataset_type: Split name used as the JSON file stem (e.g. ``'val'``).

    Returns:
        A DataFrame with one row per annotation. It carries a positional
        ``new_clipping_id``, the annotation fields (the first one renamed to
        ``article_id``), and the VisualNews ``image_path`` / ``article_path``
        looked up twice: once via ``article_id`` (``article_id_*`` columns) and
        once via ``image_id`` (``image_id_*`` columns). Both lookups are left
        joins, so unmatched rows keep NaN in the looked-up columns.
    """
    # Load the dataset; the context manager closes the file handle promptly.
    split_path = f"{data_path}/news_clippings/data/news_clippings/data/merged_balanced/{dataset_type}.json"
    with open(split_path) as split_file:
        dataset = pd.DataFrame(json.load(split_file)["annotations"])
    dataset.insert(0, 'new_clipping_id', range(0, len(dataset)))
    # Rename the second column by position. Assigning a full list of names is
    # the supported route; writing into `columns.values` mutates the backing
    # array of an immutable Index.
    column_names = list(dataset.columns)
    column_names[1] = 'article_id'
    dataset.columns = column_names

    # Load VisualNews data, keeping only the columns needed for the joins.
    with open(data_path + '/VisualNews/origin/data.json') as vn_file:
        vn_data = pd.DataFrame(json.load(vn_file))
    vn_data = vn_data[['id', 'image_path', 'article_path']]

    # Merge datasets: paths belonging to the caption's article.
    merged_with_article_data = pd.merge(dataset, vn_data, left_on='article_id', right_on='id', how='left')
    merged_with_article_data = merged_with_article_data.rename(columns={'image_path': 'article_id_image_path', 'article_path': 'article_id_article_path'})

    # Merge the resulting data with vn_data on image_id to get article_path and image_path.
    # When both sides carry an 'id' column pandas suffixes them as 'id_x' / 'id_y'.
    final_merged_data = pd.merge(merged_with_article_data, vn_data, left_on='image_id', right_on='id', how='left')
    final_merged_data = final_merged_data.rename(columns={'image_path': 'image_id_image_path', 'article_path': 'image_id_article_path'})

    return final_merged_data


# Build the merged validation and test frames from the same data root.
data_path = 'data/'
val_data = load_and_merge_data(data_path=data_path, dataset_type='val')
test_data = load_and_merge_data(data_path=data_path, dataset_type='test')

In [4]:
import json
import os
import pandas as pd

# Gather every VisualNews id referenced by the validation, test and reduced
# train splits; entries with these ids must survive the pruning below.
article_ids_val = val_data['id_x']
image_ids_val = val_data['id_y']
article_ids_test = test_data['id_x']
image_ids_test = test_data['id_y']
article_ids_train = subset_final_merged_data['id_x']
image_ids_train = subset_final_merged_data['id_y']

# Combine the IDs and remove duplicates. The frames come from left merges, so
# unmatched rows hold NaN in these columns; drop them so NaN is not counted as an id.
all_ids = pd.concat([
    article_ids_val, image_ids_val,
    article_ids_test, image_ids_test,
    article_ids_train, image_ids_train,
]).dropna().unique()

print(len(all_ids))

# Load the main JSON file (the VisualNews index that is rewritten below).
visualnews_index_path = 'data/VisualNews/origin/data.json'
with open(visualnews_index_path, 'r') as file:
    data = json.load(file)

ids_to_keep = all_ids

# Convert ids_to_keep to a set for faster lookups
ids_to_keep_set = set(ids_to_keep)

# Base directory that the index's relative paths are resolved against.
# NOTE(review): article_path is assumed to be relative to this directory just
# like image_path; the previous code checked it against the working directory,
# unlike image_path — confirm the layout.
base_image_path = "data/VisualNews/origin"


def _resolve_visualnews_path(relative_path):
    """Return `relative_path` (e.g. './a/b.jpg') resolved under base_image_path.

    `normpath` collapses a leading './'. `str.lstrip('./')` is avoided because
    it strips any run of '.' and '/' characters and would mangle names such as
    '.hidden/x'. Leading slashes are removed so the join cannot discard the base.
    """
    return os.path.normpath(os.path.join(base_image_path, relative_path.lstrip('/')))


# Filter the data to only include items with IDs in the ids_to_keep list
filtered_data = [item for item in data if item['id'] in ids_to_keep_set]

# Files referenced by a kept entry must never be removed, even if a dropped
# entry points at the same file (e.g. several entries sharing one article).
path_fields = ('image_path', 'article_path')
protected_paths = {
    _resolve_visualnews_path(item[field])
    for item in filtered_data
    for field in path_fields
    if item.get(field)
}

# Delete the files that belong only to dropped entries. Successes are counted
# rather than printed one by one to keep the cell output readable.
deleted_file_count = 0
for item in data:
    if item['id'] in ids_to_keep_set:
        continue
    for field in path_fields:
        relative_path = item.get(field)
        if not relative_path:
            continue
        full_path = _resolve_visualnews_path(relative_path)
        if full_path in protected_paths or not os.path.isfile(full_path):
            continue
        try:
            os.remove(full_path)
            deleted_file_count += 1
        except OSError as e:
            print(f"Error deleting file {full_path}: {e}")

# Overwrite the original index with the filtered entries.
with open(visualnews_index_path, 'w') as file:
    json.dump(filtered_data, file)

print(f"Filtered data saved to '{visualnews_index_path}'. Original count: {len(data)}, Filtered count: {len(filtered_data)}, Files deleted: {deleted_file_count}")

19018


In [None]:
import json
import os
import shutil

# Load the dataset (evidence index keyed by annotation position as a string).
evidence_index_path = 'data/news_clippings/queries_dataset/dataset_items_train.json'
with open(evidence_index_path, 'r') as infile:
    data = json.load(infile)

last_id = int(last_new_clipping_id)

# Define the base path for the directories
base_directory_path = "data/news_clippings/queries_dataset/merged_balanced"
normalized_base_directory = os.path.normpath(base_directory_path)
evidence_fields = ('inv_path', 'direct_path')


def _resolve_evidence_dir(relative_path):
    """Return `relative_path` resolved under base_directory_path.

    `normpath` collapses a leading './'. `str.lstrip('./')` is avoided because
    it strips any run of '.' and '/' characters rather than the './' prefix.
    Leading slashes are removed so the join cannot discard the base.
    """
    return os.path.normpath(os.path.join(base_directory_path, relative_path.lstrip('/')))


# Entries up to and including last_id are kept, in their original order.
new_data = {key: entry for key, entry in data.items() if int(key) <= last_id}

# Directories still referenced by a kept entry must survive.
# NOTE(review): whether entries can share a directory is not visible here;
# protecting them is a no-op if they never do.
protected_dirs = {
    _resolve_evidence_dir(entry[field])
    for entry in new_data.values()
    for field in evidence_fields
    if entry.get(field)
}

# Process the data and delete directories beyond last_id. Successes are counted
# rather than printed one by one to keep the cell output readable.
deleted_dir_count = 0
for key, entry in data.items():
    if key in new_data:
        continue
    for field in evidence_fields:
        relative_path = entry.get(field)
        if not relative_path:
            continue
        full_path = _resolve_evidence_dir(relative_path)
        # An empty or './' path resolves to the base directory itself; never
        # remove the whole evidence tree because of one malformed entry.
        if full_path == normalized_base_directory or full_path in protected_dirs:
            continue
        if not os.path.exists(full_path):
            continue
        try:
            shutil.rmtree(full_path)
            deleted_dir_count += 1
        except OSError as e:
            print(f"Error deleting directory {full_path}: {e}")

# Save the new data back to the file
with open(evidence_index_path, 'w') as outfile:
    json.dump(new_data, outfile)

print(f"Updated data saved to '{evidence_index_path}'. Original count: {len(data)}, New count: {len(new_data)}, Directories deleted: {deleted_dir_count}")
