## Inspect `data/sports`

In [1]:
import numpy as np
import pandas as pd

inter_file_path = '/nas/MusicRecommendation/data'
dataset_name = 'sports'

# Load the tab-separated interaction file of the selected dataset.
inter_df = pd.read_csv(f'{inter_file_path}/{dataset_name}/sports14-indexed-v4.inter', sep='\t')

# Summary counts: distinct users, distinct items, and total interaction rows.
print(f'#users: {len(inter_df["userID"].unique())}')
print(f'#items: {len(inter_df["itemID"].unique())}')
print(f'#filtered interactions: {len(inter_df)}')

# Preview the three columns used downstream. Jupyter only renders a bare
# expression when it is the final statement of the cell, so the preview must
# come after the prints (placed earlier, its result was silently discarded).
inter_df[['userID', 'itemID', 'x_label']].head()

#users: 35598
#items: 18357
#filtered interactions: 296337


In [2]:
# Load the image and text feature arrays stored alongside the interaction file
# and report their shapes.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the .npy file is untrusted. If these files are plain
# numeric arrays, allow_pickle=False should be sufficient — confirm before changing.
image_feat_path = f'{inter_file_path}/{dataset_name}/image_feat.npy'
text_feat_path = f'{inter_file_path}/{dataset_name}/text_feat-v1.npy'

image_feat = np.load(image_feat_path, allow_pickle=True)
text_feat = np.load(text_feat_path, allow_pickle=True)

print(
    f'Shape of image_feat: {image_feat.shape}',
    f'Shape of text_feat: {text_feat.shape}',
    sep='\n',
)

Shape of image_feat: (18357, 4096)
Shape of text_feat: (18357, 384)


## Music4All: Interaction data

**`interactions.json`**
```
{
  "0": [12233, 23344, ...],
  "1": [],
  ...
}
```


In [2]:
import pandas as pd
import random
import json

music4all_dir = '/nas/MusicRecommendation/Music4All/processed'
inter_path = f'{music4all_dir}/interactions.json'
attr_path = f'{music4all_dir}/attributes.json'

# Read both JSON files. The encoding is given explicitly because open() would
# otherwise fall back to the locale's preferred encoding, which is not
# guaranteed to be UTF-8 on every machine.
with open(inter_path, 'r', encoding='utf-8') as file:
    # Mapping of user key -> list of item IDs; keys come back as strings
    # because JSON object keys are always strings.
    interactions = json.load(file)
with open(attr_path, 'r', encoding='utf-8') as file:
    attributes = json.load(file)

In [3]:
# Preprocessing knobs, named so the filter threshold and the split are easy to audit.
MIN_INTERACTIONS = 5  # minimum occurrences of an item / minimum retained items of a user
TRAIN_RATIO = 0.8     # per-user share of items labelled as train
VALID_RATIO = 0.1     # per-user share labelled as validation; the remainder is test
SPLIT_SEED = 42       # fixed seed so Restart & Run All reproduces the same split

# Step 1: Count how often each item occurs across all users' interaction lists.
item_counts = {}
for items in interactions.values():
    for item in items:
        item_counts[item] = item_counts.get(item, 0) + 1

# Step 2: Drop items seen fewer than MIN_INTERACTIONS times, then drop users
# left with fewer than MIN_INTERACTIONS items.
# NOTE(review): this is a single filtering pass, not an iterated k-core. Item
# counts are taken before any user is removed, so a retained item may end up
# with fewer than MIN_INTERACTIONS interactions among the retained users.
filtered_interactions = {}
valid_users = []  # original user IDs (as int) that survive the filter
for user, items in interactions.items():
    filtered_items = [item for item in items if item_counts.get(item, 0) >= MIN_INTERACTIONS]
    if len(filtered_items) >= MIN_INTERACTIONS:
        valid_users.append(int(user))
        filtered_interactions[user] = filtered_items

# Re-index the retained users to consecutive IDs 0..n-1, ordered by original ID.
# NOTE(review): item IDs are written out unchanged (not re-indexed) — confirm
# that downstream consumers expect the original, possibly sparse, item IDs.
user_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(valid_users))}

# Step 3: Split each user's items into train / validation / test (8:1:1).
# A dedicated seeded generator keeps the split reproducible across runs and
# independent of whatever else has consumed the global `random` state.
split_rng = random.Random(SPLIT_SEED)
train_data, valid_data, test_data = [], [], []
for old_user, items in filtered_interactions.items():
    new_user = user_id_map[int(old_user)]  # map to the new sequential ID
    split_rng.shuffle(items)
    # int() truncates, so users with fewer than 10 items get no validation
    # rows; whatever is left after train and validation goes to test.
    train_size = int(TRAIN_RATIO * len(items))
    valid_size = int(VALID_RATIO * len(items))

    # x_label encodes the split: 0 = train, 1 = validation, 2 = test.
    train_data.extend([(new_user, item, 0) for item in items[:train_size]])
    valid_data.extend([(new_user, item, 1) for item in items[train_size:train_size + valid_size]])
    test_data.extend([(new_user, item, 2) for item in items[train_size + valid_size:]])

# Step 4: Combine all splits into a single dataframe.
all_data = train_data + valid_data + test_data
df = pd.DataFrame(all_data, columns=['userID', 'itemID', 'x_label'])

# Every retained interaction must land in exactly one split.
assert len(df) == sum(len(items) for items in filtered_interactions.values())

# Save as a tab-separated file. `inter_file_path` is defined in the first cell
# of the notebook (the sports inspection), so that cell must have been run.
df.to_csv(f'{inter_file_path}/Music4All/filtered_interactions.csv', index=False, sep='\t')

print(f'#filtered interactions: {len(df)}')

# Preview last: Jupyter only renders a bare expression when it is the final
# statement of the cell.
df.head()

#filtered interactions: 5058234
