In [None]:
import os
import shutil
import zipfile
from google.colab import drive

## Set up

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Mount Google Drive and create the necessary directories.

In [None]:
# Announce the step, then mount Google Drive at /content/drive.
# An earlier cell already issues the same mount call, so this one is a repeat.
print("Mounting Google Drive...")

drive.mount('/content/drive')

In [None]:
# Create each working directory, tolerating ones that already exist.
working_dirs = ('data', 'data_exports', 'recommendations')
for dir_name in working_dirs:
    os.makedirs(dir_name, exist_ok=True)

# Downloading KuaiRec Dataset

The code below downloads the KuaiRec dataset, extracts it, and moves the `data` directory into the current working directory. <br>

You can also manually download the dataset from this link: https://kuairec.com/

In [None]:
# Download the KuaiRec dataset
file_id = "1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE"

print("Downloading KuaiRec dataset...")
!gdown "https://drive.google.com/uc?id={file_id}"

print("\nExtracting files...")
with zipfile.ZipFile("KuaiRec.zip", 'r') as zip_ref:
    zip_ref.extractall()

print("Cleaning up...")
os.remove("KuaiRec.zip")

print("Download and extraction complete!")

In [None]:
if os.path.exists('KuaiRec 2.0'):
    # Move the data folder to current directory
    print("Moving data folder to current directory...")
    try:
        # If a data folder already exists in current directory, remove it first
        if os.path.exists('data'):
            shutil.rmtree('data')
        
        # Move the data folder
        shutil.move('KuaiRec 2.0/data', '.')
        
        # Clean up by removing the empty KuaiRec 2.0 directory
        shutil.rmtree('KuaiRec 2.0')
        
        print("Data folder successfully moved to current directory!")
    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print("'KuaiRec 2.0' directory not found. Please check if the extraction was successful.")

# Verify the contents of the current directory
print("\nContents of current directory:")
!ls

# Translating Captions and Categories

This section runs the code that translates the Chinese fields in the dataset. The translation code lives in `translation/translation.py`. To perform the translations, you need to run a local language model (we used Qwen2.5-14B) to process certain text fields. Note that this step requires a GPU and can be time-intensive, so we have already provided the translated output in the `data` directory as `kuairec_caption_category_translated.csv`.

In [None]:
from translation.translation import main

In [None]:
# Run the translation entry point imported from translation.translation.
# The notes above state that this step needs a GPU and can take a long time.
main()