## FMLP-Rec and LinRec Training and Data Extraction

### FMLP-Rec

#### Preprocess ML-1M Dataset

In [1]:
# Fetch the FMLP-Rec source; later cells patch its trainers.py and utils.py in place.
!git clone https://github.com/RUCAIBox/FMLP-Rec.git

Cloning into 'FMLP-Rec'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 20 (delta 0), reused 20 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (20/20), 8.41 MiB | 18.02 MiB/s, done.


In [2]:
import pandas as pd
import numpy as np
import zipfile
import os
from collections import Counter
import csv
import shutil

In [3]:
# Mount Google Drive; the ML-1M archive is read from /content/drive/MyDrive below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Preprocess datasets and save mappings

In [4]:
# Input locations for ML-1M: the archive on Drive and the folder it is unpacked into.
# ratings_file_ml1m is joined onto extract_path_ml1m inside convert_zip_to_df, so the
# file that is actually read is ml-1m/ml-1m/ratings.dat.
data_file_ml1m = '/content/drive/MyDrive/ml-1m.zip'
extract_path_ml1m = 'ml-1m'
ratings_file_ml1m = 'ml-1m/ratings.dat'

In [5]:
# Names of the two files produced by run_preprocess (user sequences and negative
# samples) and the FMLP-Rec folder they are moved into afterwards.
processed_file_ml1m = "ML-1M.txt"
sample_file_ml1m = "ML-1M_sample.txt"
destination_folder = "FMLP-Rec/data/"

In [6]:
def convert_zip_to_df(zip_path, extract_path, ratings_file):
    """Unpack a ratings archive and load its ratings file into a DataFrame.

    Args:
        zip_path: Path of the zip archive to extract.
        extract_path: Directory the whole archive is extracted into.
        ratings_file: Path of the ratings file, relative to ``extract_path``.

    Returns:
        A DataFrame with the columns ``user_id``, ``item_id``, ``rating`` and
        ``timestamp``, parsed from the ``::``-separated ratings file.
    """
    column_names = ["user_id", "item_id", "rating", "timestamp"]

    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_path)

    # The multi-character separator requires pandas' Python parsing engine.
    return pd.read_csv(
        os.path.join(extract_path, ratings_file),
        sep="::",
        engine="python",
        names=column_names,
    )

In [7]:
def filter_k_core(df, k=5):
    """Repeatedly drop sparse users and items until nothing more is removed.

    Args:
        df: Interaction frame with ``user_id`` and ``item_id`` columns.
        k: Minimum number of interactions a user or an item must have.

    Returns:
        The filtered frame, in which every remaining user and item has at
        least ``k`` interactions (possibly empty).
    """
    while True:
        users_before = df["user_id"].value_counts()
        items_before = df["item_id"].value_counts()

        # Both keep-lists come from the counts taken at the start of this pass.
        dense_users = users_before[users_before >= k].index
        dense_items = items_before[items_before >= k].index
        df = df[df["user_id"].isin(dense_users)]
        df = df[df["item_id"].isin(dense_items)]

        # A pass that removed no user and no item means the frame is stable.
        users_stable = df["user_id"].nunique() == len(users_before)
        items_stable = df["item_id"].nunique() == len(items_before)
        if users_stable and items_stable:
            return df

In [8]:
def save_mappings_to_csv(user_map, item_map, dataset):
    """Write the user and item ID mappings as CSV files in the working directory.

    Args:
        user_map: Dict mapping original user IDs to new user IDs.
        item_map: Dict mapping original item IDs to new item IDs.
        dataset: Label used in the file names ``user_map_<dataset>.csv`` and
            ``item_map_<dataset>.csv``.
    """
    specs = (
        (user_map, ['old_user_id', 'new_user_id'], f'user_map_{dataset}.csv'),
        (item_map, ['old_item_id', 'new_item_id'], f'item_map_{dataset}.csv'),
    )

    # Build both tables before touching the disk, then write them out.
    tables = [
        (pd.DataFrame(list(mapping.items()), columns=header), filename)
        for mapping, header, filename in specs
    ]
    for table, filename in tables:
        table.to_csv(filename, index=False)



In [9]:
def run_preprocess(data_file, extract_path, ratings_file, sample_file, processed_file, kcore=False, min_rating=0, k=5, seed=None):
    """Turn a zipped ratings file into the sequence and negative-sample files.

    The ratings are loaded, filtered, re-indexed to consecutive IDs starting at
    1 (the mappings are saved through ``save_mappings_to_csv``) and written as:

    * ``processed_file``: one line per user, ``user_id item_1 item_2 ...``,
      with the items ordered by timestamp;
    * ``sample_file``: one line per user, ``user_id neg_1 neg_2 ...``, holding
      up to 99 items the user never interacted with.

    Args:
        data_file: Path of the zip archive containing the ratings.
        extract_path: Directory the archive is extracted into; also used as the
            dataset label in the mapping CSV file names.
        ratings_file: Ratings file path relative to ``extract_path``.
        sample_file: Output path of the negative-sample file.
        processed_file: Output path of the user-sequence file.
        kcore: Whether to apply k-core filtering to users and items.
        min_rating: Only interactions with ``rating > min_rating`` are kept.
        k: Minimum interactions per user/item when ``kcore`` is true.
        seed: Optional seed for negative sampling. ``None`` (default) draws
            from NumPy's global random state.
    """
    num_negatives = 99  # negatives written per user

    data = convert_zip_to_df(data_file, extract_path, ratings_file)

    # Keep only interactions where rating > min_rating. The explicit copy makes
    # the column assignments below act on an independent frame rather than on a
    # slice of the loaded one (avoids pandas' SettingWithCopy pitfall).
    data = data[data["rating"] > min_rating].copy()

    # Apply k-core filtering (each user and item keeps at least k interactions).
    if kcore:
        data = filter_k_core(data, k=k).copy()

    # New IDs start at 1 and follow first appearance in the filtered data.
    user_map = {old: new + 1 for new, old in enumerate(data["user_id"].unique())}
    item_map = {old: new + 1 for new, old in enumerate(data["item_id"].unique())}

    save_mappings_to_csv(user_map, item_map, extract_path)

    data["user_id"] = data["user_id"].map(user_map)
    data["item_id"] = data["item_id"].map(item_map)

    # Group by user with interactions in timestamp order; this single grouping
    # feeds both output files.
    data = data.sort_values(by=["user_id", "timestamp"])
    grouped = data.groupby("user_id")["item_id"].apply(list).to_dict()

    with open(processed_file, "w") as f:
        for user_id, items in grouped.items():
            f.write(f"{user_id} " + " ".join(map(str, items)) + "\n")

    print(f"Processing complete. Saved as {processed_file}")

    # 99 negative samples per user required
    all_items = set(data["item_id"].unique())
    rng = np.random if seed is None else np.random.RandomState(seed)

    with open(sample_file, "w") as f:
        for user_id, pos_items in grouped.items():
            # Sorting fixes the candidate order so a given seed always yields
            # the same negatives.
            negative_pool = sorted(all_items - set(pos_items))
            if len(negative_pool) >= num_negatives:
                sampled_negatives = rng.choice(negative_pool, num_negatives, replace=False)
            else:
                # Too few unseen items: write all of them.
                sampled_negatives = negative_pool
            f.write(f"{user_id} " + " ".join(map(str, sampled_negatives)) + "\n")

    print(f"Sample file '{sample_file}' created successfully.")

Preprocess ML-1M: run the pipeline with its defaults (`kcore=False`, `min_rating=0`).

In [10]:
# kcore and min_rating are left at their defaults (False and 0).
run_preprocess(data_file_ml1m, extract_path_ml1m, ratings_file_ml1m, sample_file_ml1m, processed_file_ml1m)

Processing complete. Saved as ML-1M.txt
Sample file 'ML-1M_sample.txt' created successfully.


In [11]:
# Relocate both generated files into FMLP-Rec's data folder, creating it if needed.
os.makedirs(destination_folder, exist_ok=True)
for generated_file in (processed_file_ml1m, sample_file_ml1m):
    shutil.move(generated_file, os.path.join(destination_folder, generated_file))

print(f"Files moved to {destination_folder}")

Files moved to FMLP-Rec/data/


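Preprocess Beauty: clone the S3-Rec repository and use its `data_process.py` to build the Amazon Beauty dataset and ID mappings.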

In [1]:
# Mount Google Drive; data_process.py is pointed at review files under MyDrive below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Fetch S3-Rec; its data/data_process.py is patched and run by the cells below.
!git clone https://github.com/RUCAIBox/CIKM2020-S3Rec.git

Cloning into 'CIKM2020-S3Rec'...
remote: Enumerating objects: 128, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 128 (delta 15), reused 10 (delta 10), pack-reused 103 (from 1)[K
Receiving objects: 100% (128/128), 60.87 MiB | 22.17 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [3]:
# Change the dataset path: replace every "/path/reviews_" prefix in data_process.py
# with the Google Drive location of the review files.

!sed -i 's|/path/reviews_|/content/drive/MyDrive/reviews_|g' /content/CIKM2020-S3Rec/data/data_process.py

In [4]:
# Remove the runs that preprocess unnecessary data: delete everything from the line
# starting with "amazon_datas =" to the end of data_process.py.

!sed -i '/^amazon_datas =/,$d' /content/CIKM2020-S3Rec/data/data_process.py

In [5]:
# Save user/item id mappings: after the line matching "data_maps = id_map(user_items)"
# append code that dumps data_maps to data/Beauty_id_maps.json and prints a confirmation.

!sed -i $'/data_maps = id_map(user_items)/a\\
   import json\\n\
   with open("/content/CIKM2020-S3Rec/data/Beauty_id_maps.json", "w") as f:\\n\
       json.dump(data_maps, f)\\n\
   print("Mapping saved to data/Beauty_id_maps.json")' /content/CIKM2020-S3Rec/data/data_process.py

In [6]:
# run preprocessing on the Amazon Beauty dataset and save the user/item id mappings

%%bash
cat << 'END' >> /content/CIKM2020-S3Rec/data/data_process.py

main('Beauty', data_type='Amazon')

import json
import pandas as pd

with open('/content/CIKM2020-S3Rec/data/Beauty_id_maps.json', 'r') as f:
    data_maps = json.load(f)
df_user = pd.DataFrame(list(data_maps['id2user'].items()), columns=['new_user_id', 'original_user_id'])
df_item = pd.DataFrame(list(data_maps['id2item'].items()), columns=['new_item_id', 'original_item_id'])

import os
os.makedirs('./output', exist_ok=True)
df_user.to_csv(os.path.join('./output', 'id2user.csv'), index=False)
df_item.to_csv(os.path.join('./output', 'id2item.csv'), index=False)
print("Mapping files saved to ./output")
END

In [8]:
# Run the patched script from its own folder so its relative ./output path resolves there.
!cd /content/CIKM2020-S3Rec/data/ && python data_process.py

Beauty Raw data has been processed! Lower than 0.0 are deleted!
User 5-core complete! Item 5-core complete!
Mapping saved to data/Beauty_id_maps.json
Total User: 22363, Avg User: 8.8764, Min Len: 5, Max Len: 204
Total Item: 12101, Avg Item: 16.4038, Min Inter: 5, Max Inter: 431
Iteraction Num: 198502, Sparsity: 99.93%
Mapping files saved to ./output


#### Create CSVs for item predictions (validation and test)

In [12]:
# Save internal user ids and item recommendations: after each line matching
# "self.model.eval()" in trainers.py, insert two empty lists (all_user_ids and
# all_candidate_ids) that later patches append to.

!sed -i $'/self\.model\.eval()/a\\
           all_user_ids = []\\n\\
           all_candidate_ids = []'  /content/FMLP-Rec/trainers.py

In [13]:
# Starting at the "return self.get_full_sort_score(...)" line, put "answer_list = None"
# in front of each "for i, batch in rec_data_iter:" line.
# NOTE(review): the range's end pattern is anchored with "^return" at column 0; if that
# line is indented in trainers.py it never matches and the range runs to end of file.
!sed -i '/return self.get_full_sort_score(epoch, answer_list, pred_list)/,/^return self\.get_sample_scores(epoch, pred_list)/ s/^\(.*for i, batch in rec_data_iter:\)/                answer_list = None\n\1/' /content/FMLP-Rec/trainers.py

In [14]:
# After each line matching "pred_list = test_logits", also store that batch's answers
# as the initial answer_list.
!sed -i $'/pred_list = test_logits/a\\
                       answer_list = answers.cpu().data.numpy()'  /content/FMLP-Rec/trainers.py

In [15]:
# After each line that appends test_logits to pred_list, append the matching answers
# to answer_list in the same way.
!sed -i $'/pred_list = np.append(pred_list, test_logits, axis=0)/a\\
                       answer_list = np.append(answer_list, answers.cpu().data.numpy(), axis=0)'  /content/FMLP-Rec/trainers.py

In [16]:
# After test_neg_items is built (answer concatenated with the sampled negatives),
# record those candidate item ids for the batch.
!sed -i $'/test_neg_items = torch.cat((answers.unsqueeze(-1), sample_negs), -1)/a\\
                    all_candidate_ids.append(test_neg_items.cpu().numpy())'  /content/FMLP-Rec/trainers.py

In [17]:
# Save outputs of user id, recommendations, and ground truth: before each line matching
# "self.get_sample_scores(epoch, pred_list)", insert code that ranks the candidates by
# descending score, keeps the top 20 candidate item ids per user, and writes them with
# user_id and ground_truth to valid_predictions.csv or test_predictions.csv in
# args.output_dir, depending on which dataloader is being evaluated.

!sed -i $'/self.get_sample_scores(epoch, pred_list)/i\\
               all_user_ids = np.concatenate(all_user_ids, axis=0).flatten()\\n\\
               all_candidate_ids = np.concatenate(all_candidate_ids, axis=0)\\n\\
               import pandas as pd\\n\\
               import os\\n\\
               sorted_idx = np.argsort(-pred_list, axis=1)\\n\\
               top_k = sorted_idx[:, :20]\\n\\
               top_k_item_ids = np.take_along_axis(all_candidate_ids, top_k, axis=1)\\n\\
               df = pd.DataFrame({\\n\\
                   "user_id": all_user_ids,\\n\\
                   "predicted_top20": [list(row) for row in top_k_item_ids],\\n\\
                   "ground_truth": answer_list.tolist()\\n\\
               })\\n\\
               if dataloader == self.eval_dataloader:\\n\\
                   csv_path = os.path.join(self.args.output_dir, "valid_predictions.csv")\\n\\
                   df.to_csv(csv_path, index=False)\\n\\
               elif dataloader == self.test_dataloader:\\n\\
                   csv_path = os.path.join(self.args.output_dir, "test_predictions.csv")\\n\\
                   df.to_csv(csv_path, index=False)' /content/FMLP-Rec/trainers.py

In [18]:
# Record each batch's user ids right after the batch is unpacked (the unpacking line
# that names its last element neg_answer).
!sed -i $'/user_ids, input_ids, answers, _, neg_answer = batch/ a\\
                   all_user_ids.append(user_ids.cpu().numpy())' /content/FMLP-Rec/trainers.py

In [19]:
# Same user-id recording for the unpacking line that names its last element sample_negs.
!sed -i $'/user_ids, input_ids, answers, _, sample_negs = batch/ a\\
                   all_user_ids.append(user_ids.cpu().numpy())' /content/FMLP-Rec/trainers.py

#### Run FMLP-Rec on Amazon Beauty Dataset

In [None]:
# Launch FMLP-Rec's main.py on the Beauty dataset (trainers.py was patched above).
!cd /content/FMLP-Rec/ && python main.py --data_name='Beauty'

#### Run FMLP-Rec on ML-1M Dataset

In [20]:
# Add the ML-1M dataset: append 'ML-1M' to sequential_data_list in FMLP-Rec's utils.py.

!sed -i "s/^sequential_data_list = \['Beauty','Sports_and_Outdoors','Toys_and_Games','Yelp'\]/sequential_data_list = \['Beauty','Sports_and_Outdoors','Toys_and_Games','Yelp','ML-1M'\]/" /content/FMLP-Rec/utils.py

In [None]:
# Launch FMLP-Rec's main.py on the ML-1M files moved into FMLP-Rec/data/ earlier.
!cd /content/FMLP-Rec/ && python main.py --data_name='ML-1M'

### LinRec

In [None]:
# Install the packages used by the LinRec section.
# NOTE(review): versions are unpinned, while later cells patch RecBole's installed
# source files with sed; consider pinning so those patterns keep matching.
!pip install torch
!pip install recbole
!pip install ray
!pip install kmeans-pytorch

In [None]:
import shutil
import os
import torch

In [None]:
# Fetch LinRec; its run.py, overall.yaml and layers.py are used by the cells below.
!git clone https://github.com/Applied-Machine-Learning-Lab/LinRec.git

Cloning into 'LinRec'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 38 (delta 18), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (38/38), 23.09 KiB | 11.54 MiB/s, done.
Resolving deltas: 100% (18/18), done.


In [None]:
# Create the folder for the CSVs written by the patched RecBole code (a later cell
# sets output_dir to 'output' in overall.yaml).
!mkdir -p /content/LinRec/output

In [None]:
# Create a mapping of the internal ids for user and item to the originals: after the
# "create_dataset(config)" line in RecBole's quick_start.py, insert code that prints
# both mappings and writes them to id2user.csv / id2item.csv under config["output_dir"].

!sed -i $'/create_dataset(config)/a\\
   print("User mapping (internal -> original):", dataset.id2token("user_id", list(range(dataset.num("user_id")))))\\n\
   print("Item mapping (internal -> original):", dataset.id2token("item_id", list(range(dataset.num("item_id")))))\\n\
\\n\
   import pandas as pd\\n\
   user_ids = list(range(dataset.num("user_id")))\\n\
   user_tokens = dataset.id2token("user_id", user_ids)\\n\
   user_mapping = {internal: token for internal, token in zip(user_ids, user_tokens)}\\n\
\\n\
   item_ids = list(range(dataset.num("item_id")))\\n\
   item_tokens = dataset.id2token("item_id", item_ids)\\n\
   item_mapping = {internal: token for internal, token in zip(item_ids, item_tokens)}\\n\
\\n\
   user_df = pd.DataFrame(list(user_mapping.items()), columns=["internal_user_id", "original_user_id"])\\n\
   item_df = pd.DataFrame(list(item_mapping.items()), columns=["internal_item_id", "original_item_id"])\\n\
   user_df.to_csv(config["output_dir"] + "/id2user.csv", index=False)\\n\
   item_df.to_csv(config["output_dir"] + "/id2item.csv", index=False)' /usr/local/lib/python3.11/dist-packages/recbole/quick_start/quick_start.py

In [None]:
# add new metric that saves the user ids and their top k recommendations

%%bash
cat << 'END' >> /usr/local/lib/python3.11/dist-packages/recbole/evaluator/metrics.py

from recbole.evaluator.base_metric import AbstractMetric
from recbole.utils import EvaluatorType
class RecItems(AbstractMetric):
    """A dummy metric that forces the collector to gather the full predicted item list."""
    metric_type = EvaluatorType.RANKING
    metric_need = ["rec.items", "data.label"]

    def __init__(self, config):
        super().__init__(config)
        self.requirements = ["rec.items", "data.label"]

    def calculate_metric(self, dataobject):
        try:
            rec_items = dataobject.get("rec.items")
        except Exception as e:
            return {}
        if rec_items.size(1) < 20:
            top20 = rec_items
        else:
            top20 = rec_items[:, :20]
        top20_list = top20.cpu().numpy().tolist()
        user_ids = list(range(len(top20_list)))
        return {"user_id": user_ids, "recitems@20": top20_list}
END

In [None]:
# Register the new metric: append "RecItems" after "Precision" where it closes a list
# in overall.yaml, and append 'RecItems' to the 'metrics' list in run.py.
!sed -i 's/\("Precision"\)[[:space:]]*\]/\1,"RecItems"]/' /content/LinRec/overall.yaml
!sed -i "s/\('metrics': \[[^]]*\)\(]\)/\1, 'RecItems'\2/" /content/LinRec/run.py

In [None]:
# Save mapping between user id and label for both validation and test sets: before each
# "        return next_ds" line in RecBole's dataset.py, insert code that reads the user
# and item ids at next_index[1] (validation) and next_index[2] (test) from inter_feat
# and writes valid_true_labels.csv / test_true_labels.csv under config["output_dir"].

!sed -i $'/^        return next_ds/ i\\
       valid_indices = next_index[1]\\n\
       test_indices = next_index[2]\\n\
       import pandas as pd\\n\
       valid_true_labels = self.inter_feat[self.iid_field][valid_indices].cpu().numpy()\\n\
       test_true_labels = self.inter_feat[self.iid_field][test_indices].cpu().numpy()\\n\
       valid_user_ids = self.inter_feat[self.uid_field][valid_indices].cpu().numpy()\\n\
       test_user_ids = self.inter_feat[self.uid_field][test_indices].cpu().numpy()\\n\
       valid_path = os.path.join(self.config["output_dir"], "valid_true_labels.csv")\\n\
       test_path = os.path.join(self.config["output_dir"], "test_true_labels.csv")\\n\
       pd.DataFrame({"user_id": valid_user_ids, "true_label": valid_true_labels}).to_csv(valid_path, index=False)\\n\
       pd.DataFrame({"user_id": test_user_ids, "true_label": test_true_labels}).to_csv(test_path, index=False)' /usr/local/lib/python3.11/dist-packages/recbole/data/dataset/dataset.py

In [None]:
# Change the evaluate signature so we can tell the difference between validation and
# test sets: add a test=False keyword to the parameter list in RecBole's trainer.py.

!sed -i "s/self, eval_data, load_best_model=True, model_file=None, show_progress=False/self, eval_data, load_best_model=True, model_file=None, show_progress=False, test=False/" /usr/local/lib/python3.11/dist-packages/recbole/trainer/trainer.py

# Add test=True to the evaluate call on test_data in quick_start.py, so only the
# test-set evaluation is flagged.
!sed -i 's/test_data, load_best_model=saved, show_progress=config\["show_progress"\]/test_data, load_best_model=saved, show_progress=config\["show_progress"\], test=True/' /usr/local/lib/python3.11/dist-packages/recbole/quick_start/quick_start.py

In [None]:
# Insert code to read user ids and recommendation lists from the new metric and save to
# csv: after "struct = self.eval_collector.get_data_struct()" in trainer.py, call the
# "recitems" metric directly and write its output to linrec_test_predictions.csv when
# test is true, otherwise to linrec_valid_predictions.csv, under config["output_dir"].

!sed -i $'/struct = self\.eval_collector\.get_data_struct()/a\\
       import os\\n\
       import pandas as pd\\n\
       dummy_metric = self.evaluator.metric_class["recitems"]\\n\
       dummy_output = dummy_metric.calculate_metric(struct)\\n\
       if "recitems@20" in dummy_output and "user_id" in dummy_output:\\n\
           df = pd.DataFrame({\\n\
               "user_id": dummy_output["user_id"],\\n\
               "predicted_top20": dummy_output["recitems@20"],\\n\
           })\\n\
           if test:\\n\
               csv_filename = "linrec_test_predictions.csv"\\n\
           else:\\n\
               csv_filename = "linrec_valid_predictions.csv"\\n\
           csv_path = os.path.join(self.config["output_dir"], csv_filename)\\n\
           df.to_csv(csv_path, index=False)\\n\
           print("CSV saved to", csv_path)' /usr/local/lib/python3.11/dist-packages/recbole/trainer/trainer.py

#### ML-1M

In [None]:
# Change configurations to run on the ML-1M dataset. In order, the commands:
#   1. guard torch.distributed.barrier() in dataset.py so it only runs when distributed
#      mode is available and initialised;
#   2. uncomment the run_recbole import in run.py;
#   3. switch "repeatable" to True in overall.yaml;
#   4. add output_dir: 'output' after the checkpoint_dir line in overall.yaml;
#   5. rewrite the run_recbole call to use dataset 'ml-1m' with ml-1m.yaml and overall.yaml.
# NOTE(review): step 4 appends a new line on every run, so re-running this cell
# duplicates the output_dir entry.

!sed -E -i "s/^([[:space:]]*)torch\.distributed\.barrier\(\)/\1if torch.distributed.is_available() and torch.distributed.is_initialized():\n\1    torch.distributed.barrier()/" /usr/local/lib/python3.11/dist-packages/recbole/data/dataset/dataset.py
!sed -i 's/^# from recbole.quick_start.quick_start import run_recbole/from recbole.quick_start.quick_start import run_recbole/' /content/LinRec/run.py
!sed -i 's/repeatable: False/repeatable: True/' /content/LinRec/overall.yaml
!sed -i "/^checkpoint_dir: 'saved'/a output_dir: 'output'" /content/LinRec/overall.yaml
!sed -i "s/run_recbole(model='SASRec', dataset='ML-1M', config_dict=parameter_dict)/run_recbole(model='SASRec', dataset='ml-1m', config_file_list=['ml-1m.yaml', 'overall.yaml'], config_dict=parameter_dict)/" /content/LinRec/run.py

In [None]:
# Overwrite RecBole's installed layers.py with LinRec's version and copy LinRec's
# ml-1m.yaml into RecBole's properties folder.
# NOTE(review): the paths hard-code the python3.11 dist-packages location.
target_dir_layers = '/usr/local/lib/python3.11/dist-packages/recbole/model/layers.py'
target_dir_yaml = '/usr/local/lib/python3.11/dist-packages/recbole/properties/ml-1m.yaml'

shutil.copy('/content/LinRec/layers.py', target_dir_layers)
shutil.copy('/content/LinRec/ml-1m.yaml', target_dir_yaml)

'/usr/local/lib/python3.11/dist-packages/recbole/properties/ml-1m.yaml'

In [None]:
# Run LinRec from its own folder so the relative 'output' directory and YAML names resolve.
!cd /content/LinRec/ && python run.py

In [None]:
# Mount Google Drive; the next cell reads the prediction and label CSVs from MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Combine recommendation and label files into one CSV per split.

import pandas as pd
import os

pred_valid_file = "/content/drive/MyDrive/linrec_valid_predictions.csv"
label_valid_file = "/content/drive/MyDrive/valid_true_labels.csv"
output_valid_file = "valid_predictions_LinRec_ML1M.csv"

pred_test_file = "/content/drive/MyDrive/linrec_test_predictions.csv"
label_test_file = "/content/drive/MyDrive/test_true_labels.csv"
output_test_file = "test_predictions_LinRec_ML1M.csv"

# Validation first, then test: (predictions, labels, combined output).
splits = (
    (pred_valid_file, label_valid_file, output_valid_file),
    (pred_test_file, label_test_file, output_test_file),
)

for pred_file, label_file, output_file in splits:
    pred_df = pd.read_csv(pred_file)
    # Shift the prediction ids by one before joining.
    # NOTE(review): presumably this aligns the 0-based row index written by the
    # RecItems metric with RecBole's internal user ids; confirm.
    pred_df['user_id'] = pred_df['user_id'] + 1

    label_df = pd.read_csv(label_file)
    # Left join keeps every prediction row, with or without a label.
    combined_df = pd.merge(pred_df, label_df, on="user_id", how="left")

    combined_df.to_csv(output_file, index=False)
    print("Combined CSV saved to", os.path.abspath(output_file))



Combined CSV saved to /content/valid_predictions_LinRec_ML1M.csv
Combined CSV saved to /content/test_predictions_LinRec_ML1M.csv


#### Amazon Beauty

In [None]:
# Change configuration to run on the Amazon Beauty dataset. In order, the commands:
#   1. guard torch.distributed.barrier() in dataset.py so it only runs when distributed
#      mode is available and initialised;
#   2. uncomment the run_recbole import in run.py;
#   3-5. lower train/eval batch size from 2048 to 1024 and hidden_size from 128 to 64;
#   6. switch "repeatable" to True in overall.yaml;
#   7. add output_dir: 'output' after the checkpoint_dir line in overall.yaml;
#   8. rewrite the run_recbole call to use dataset 'amazon-beauty' with amazon-beauty.yaml.
# NOTE(review): step 8 matches the original dataset='ML-1M' call, so it does nothing if
# run.py was already rewritten by the ML-1M section; start from a fresh LinRec clone.

!sed -E -i "s/^([[:space:]]*)torch\.distributed\.barrier\(\)/\1if torch.distributed.is_available() and torch.distributed.is_initialized():\n\1    torch.distributed.barrier()/" /usr/local/lib/python3.11/dist-packages/recbole/data/dataset/dataset.py
!sed -i 's/^# from recbole.quick_start.quick_start import run_recbole/from recbole.quick_start.quick_start import run_recbole/' /content/LinRec/run.py
!sed -i "s/'train_batch_size': 2048/'train_batch_size': 1024/" /content/LinRec/run.py
!sed -i "s/'eval_batch_size': 2048/'eval_batch_size': 1024/" /content/LinRec/run.py
!sed -i "s/'hidden_size': 128/'hidden_size': 64/" /content/LinRec/run.py
!sed -i 's/repeatable: False/repeatable: True/' /content/LinRec/overall.yaml
!sed -i "/^checkpoint_dir: 'saved'/a output_dir: 'output'" /content/LinRec/overall.yaml
!sed -i "s/run_recbole(model='SASRec', dataset='ML-1M', config_dict=parameter_dict)/run_recbole(model='SASRec', dataset='amazon-beauty', config_file_list=['amazon-beauty.yaml', 'overall.yaml'], config_dict=parameter_dict)/" /content/LinRec/run.py

In [None]:
# Write the RecBole dataset configuration for Amazon Beauty.
beauty_yaml_file_path = "/content/LinRec/amazon-beauty.yaml"

# Raw string: keeps the backslash in field_separator so the file contains the YAML
# escape "\t" rather than a literal TAB character embedded by Python.
yaml_content = r"""
# Atomic File Format
field_separator: "\t"           # (str) Separator of different columns in atomic files.
seq_separator: " "              # (str) Separator inside the sequence features.

# Basic Information
USER_ID_FIELD: user_id          # (str) Field name of user ID feature.
ITEM_ID_FIELD: item_id          # (str) Field name of item ID feature.
RATING_FIELD: rating            # (str) Field name of rating feature.
TIME_FIELD: timestamp           # (str) Field name of timestamp feature.
seq_len: ~                      # (dict) Field name of sequence feature: maximum length of each sequence
LABEL_FIELD: label              # (str) Expected field name of the generated labels for point-wise dataLoaders.
threshold: ~                    # (dict) 0/1 labels will be generated according to the pairs.
NEG_PREFIX: neg_                # (str) Negative sampling prefix for pair-wise dataLoaders.

# Sequential Model Needed
ITEM_LIST_LENGTH_FIELD: item_length   # (str) Field name of the feature representing item sequences' length.
LIST_SUFFIX: _list              # (str) Suffix of field names which are generated as sequences.
MAX_ITEM_LIST_LENGTH: 200        # (int) Maximum length of each generated sequence.
POSITION_FIELD: position_id     # (str) Field name of the generated position sequence.

# Knowledge-based Model Needed
HEAD_ENTITY_ID_FIELD: head_id   # (str) Field name of the head entity ID feature.
TAIL_ENTITY_ID_FIELD: tail_id   # (str) Field name of the tail entity ID feature.
RELATION_ID_FIELD: relation_id  # (str) Field name of the relation ID feature.
ENTITY_ID_FIELD: entity_id      # (str) Field name of the entity ID.
kg_reverse_r: False             # (bool) Whether to reverse relations of triples for bidirectional edges.
entity_kg_num_interval: ~       # (str) Entity interval for filtering kg, such as [A,B] / [A,B) / (A,B) / (A,B].
relation_kg_num_interval: ~     # (str) Relation interval for filtering kg, such as [A,B] / [A,B) / (A,B) / (A,B].

# Selectively Loading
load_col:                       # (dict) The suffix of atomic files: (list) field names to be loaded.
    inter: [user_id, item_id, rating, timestamp]
unload_col: ~                   # (dict) The suffix of atomic files: (list) field names NOT to be loaded.
unused_col: ~                   # (dict) The suffix of atomic files: (list) field names which are loaded but not used.

# Filtering
rm_dup_inter: ~                 # (str) Whether to remove duplicated user-item interactions.
val_interval: ~                 # (dict) Filter inter by values in {value field (str): interval (str)}.
filter_inter_by_user_or_item: True    # (bool) Whether or not to filter inter by user or item.
user_inter_num_interval: "[5, inf)"     # (str) User interval for filtering inter, such as [A,B] / [A,B) / (A,B) / (A,B].
item_inter_num_interval: "[5, inf)"     # (str) Item interval for filtering inter, such as [A,B] / [A,B) / (A,B) / (A,B].

# Preprocessing
alias_of_user_id: ~             # (list) Fields' names remapped into the same index system with USER_ID_FIELD.
alias_of_item_id: ~             # (list) Fields' names remapped into the same index system with ITEM_ID_FIELD.
alias_of_entity_id: ~           # (list) Fields' names remapped into the same index system with ENTITY_ID_FIELD.
alias_of_relation_id: ~         # (list) Fields' names remapped into the same index system with RELATION_ID_FIELD.
preload_weight: ~               # (dict) Preloaded weight in {IDs (token): pretrained vectors (float-like)}.
normalize_field: ~              # (list) List of filed names to be normalized.
normalize_all: True            # (bool) Whether or not to normalize all the float like fields.
"""

with open(beauty_yaml_file_path, "w") as file:
    file.write(yaml_content)

print(f"YAML file created at: {beauty_yaml_file_path}")

YAML file created at: /content/LinRec/amazon-beauty.yaml


In [None]:
# Overwrite RecBole's installed layers.py with LinRec's version and copy the generated
# amazon-beauty.yaml into RecBole's properties folder.
# NOTE(review): the paths hard-code the python3.11 dist-packages location.
target_dir_layers = '/usr/local/lib/python3.11/dist-packages/recbole/model/layers.py'
target_dir_yaml = '/usr/local/lib/python3.11/dist-packages/recbole/properties/amazon-beauty.yaml'

shutil.copy('/content/LinRec/layers.py', target_dir_layers)
shutil.copy('/content/LinRec/amazon-beauty.yaml', target_dir_yaml)

'/usr/local/lib/python3.11/dist-packages/recbole/properties/amazon-beauty.yaml'

In [None]:
# Run LinRec from its own folder so the relative 'output' directory and YAML names resolve.
!cd /content/LinRec/ && python run.py

In [15]:
# Mount Google Drive; the next cell reads the Beauty prediction and label CSVs from MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import pandas as pd
import os

pred_valid_file = "/content/drive/MyDrive/linrec_valid_predictions_beauty.csv"
label_valid_file = "/content/drive/MyDrive/valid_true_labels_beauty.csv"
output_valid_file = "valid_predictions_LinRec_Beauty.csv"

pred_test_file = "/content/drive/MyDrive/linrec_test_predictions_beauty.csv"
label_test_file = "/content/drive/MyDrive/test_true_labels_beauty.csv"
output_test_file = "test_predictions_LinRec_Beauty.csv"

# Validation first, then test: (predictions, labels, combined output).
splits = (
    (pred_valid_file, label_valid_file, output_valid_file),
    (pred_test_file, label_test_file, output_test_file),
)

for pred_file, label_file, output_file in splits:
    pred_df = pd.read_csv(pred_file)
    # Shift the prediction ids by one before joining.
    # NOTE(review): presumably this aligns a 0-based row index in the predictions
    # file with RecBole's internal user ids; confirm.
    pred_df['user_id'] = pred_df['user_id'] + 1

    label_df = pd.read_csv(label_file)
    # Left join keeps every prediction row, with or without a label.
    combined_df = pd.merge(pred_df, label_df, on="user_id", how="left")

    combined_df.to_csv(output_file, index=False)
    print("Combined CSV saved to", os.path.abspath(output_file))



Combined CSV saved to /content/valid_predictions_LinRec_Beauty.csv
Combined CSV saved to /content/test_predictions_LinRec_Beauty.csv
