In [1]:
%load_ext autoreload
%autoreload 2

In [46]:
import os

In [11]:
import sys
sys.path.append('../../src/generic')
import numpy as np
import pandas as pd

## Overview

Create dataset where:
* There are 4 pseudo users: 2 pairs, where both pseudo users in a pair come from the same "base"/original user
* As the validation data, use the full train data from the associated "base"/original user
    * this way, the learned weight matrix should reflect the fact that the training data from both pseudo users is useful for optimizing validation performance
* As test data, use the test data from the associated base user

In [17]:
from datasets import Dataset

from dataset.amazon_reviews_clf_dataset import AmazonClfDataset

In [5]:
# Keyword arguments forwarded to AmazonClfDataset: data locations, tokenizer
# settings, the split file and the name of the processed-data directory.
data_kwargs = {
    "data_dir": "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/",
    "raw_data_file": "reviews.csv",
    "tokenizer_name": "distilbert-base-uncased",
    "tokenizer_cache_dir": "/data/ddmg/redditlanguagemodeling/cached/distilbert",
    "split_file": "wilds_subpop_shift_user.csv",
    "processed_data_dir": "amazon_reviews_clf_processed_with_my_subpop_shift_embeds",
}

In [6]:
# Build the Amazon reviews classification dataset from data_kwargs; its
# train_data / test_data attributes are the splits used in the cells below.
dataset = AmazonClfDataset(**data_kwargs)

loading processed data from /data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/amazon_reviews_clf_processed_with_my_subpop_shift_embeds


In [7]:
# The two original ("base") users, reused from the 2021-09-29 notebook.
# Each one is split into a pair of pseudo users further down.
user1, user2 = "A4MO9RO839BEF", "A1B5MN8PY0JIJQ"

In [8]:
def get_new_dataset(dataset, keep_users=None):
    """Return the subset of ``dataset`` whose rows belong to the selected users.

    Args:
        dataset: object exposing a "user" column via ``dataset["user"]`` and a
            ``select(indices)`` method.
        keep_users: optional iterable of user ids to keep (a single string is
            treated as one user id). Defaults to the notebook-level ``user1``
            and ``user2``.

    Returns:
        Whatever ``dataset.select`` returns for the positions of the matching rows.
    """
    if keep_users is None:
        keep_users = (user1, user2)
    elif isinstance(keep_users, str):
        # avoid set("abc") splitting a single id into characters
        keep_users = (keep_users,)
    # build the lookup set once instead of once per row
    keep_users = set(keep_users)
    keep_idx = [i for i, user in enumerate(dataset["user"]) if user in keep_users]
    return dataset.select(keep_idx)

In [9]:
# keep only the training rows that belong to user1 or user2
select_train_data = get_new_dataset(dataset.train_data)

### Create Train Data

In [12]:
# Use a dedicated, seeded generator so the random pseudo-user split is the same
# on every Restart & Run All (an unseeded split changes membership on each run).
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)

train_user_ids = select_train_data["user"]
train_user_arr = np.array(train_user_ids)
# positions (within select_train_data) of each original user's rows;
# argwhere returns an (n, 1) array, hence the flatten() calls below
user1_idx = np.argwhere(train_user_arr == user1)
user2_idx = np.argwhere(train_user_arr == user2)
# split these indices into two groups (one for each pseudo user): a random half
# (rounded down) goes to the first pseudo user, the remaining rows to the second.
# np.setdiff1d returns the leftover indices sorted, without a per-element scan.
p1_index = split_rng.choice(user1_idx.flatten(), len(user1_idx) // 2, replace=False)
p2_index = np.setdiff1d(user1_idx.flatten(), p1_index)
p3_index = split_rng.choice(user2_idx.flatten(), len(user2_idx) // 2, replace=False)
p4_index = np.setdiff1d(user2_idx.flatten(), p3_index)

In [13]:
# sanity check: total number of row indices assigned across the four pseudo-user groups
len(p1_index) + len(p2_index) + len(p3_index) + len(p4_index)

317

In [14]:
# One label per selected training row; -1 marks "not yet assigned".
# The array is sized from the data instead of a hard-coded row count.
p_users = np.full(len(select_train_data), -1)
# Label each index group with its integer pseudo user id (1-4). Integers are
# written directly, so no implicit str -> int cast into the int array is needed.
for p_user_id, group_idx in enumerate((p1_index, p2_index, p3_index, p4_index), start=1):
    p_users[group_idx] = p_user_id
# every row must have been assigned a pseudo user
assert (p_users != -1).all(), "some training rows were not assigned a pseudo user"
np.unique(p_users, return_counts=True)

(array([1, 2, 3, 4]), array([67, 68, 91, 91]))

In [15]:
# pseudo user id -> original ("base") user it was split from
p_to_o_user = {1: user1, 2: user1, 3: user2, 4: user2}

In [None]:
# convert to a pandas DataFrame; the next cell rebuilds a Dataset from it
select_train_data = select_train_data.to_pandas()

In [19]:
# add the pseudo user labels (p_users) to the dataset as a new "p_user" column
select_train_data = Dataset.from_pandas(select_train_data)  # get rid of old rows in PyArrow Table
select_train_data = select_train_data.add_column(name="p_user", column=p_users.tolist())

In [27]:
# number of training rows after adding the p_user column
len(select_train_data)

317

### Create Val Data

In [20]:
# validation data starts from the selected training rows (see Overview),
# converted to pandas so the p_user column can be relabelled
select_val_data = select_train_data.to_pandas()

In [22]:
# assign p_users to be single user
def _map_to_single_user(x):
    """Collapse a pseudo user id onto the first id of its pair (2 -> 1, 4 -> 3)."""
    if x == 2:
        return 1
    if x == 4:
        return 3
    return x


def _get_other_p_user(x):
    """Return the second pseudo user id of a pair: 2 when ``x`` is 1, otherwise 4."""
    if x == 1:
        return 2
    return 4


# first copy: every row is labelled with the first pseudo user of its pair (1 or 3)
select_val_data["p_user"] = select_val_data["p_user"].apply(_map_to_single_user)
# second copy: the same rows, labelled with the other pseudo user of the pair (2 or 4)
select_val_data2 = select_val_data.copy()
select_val_data2["p_user"] = select_val_data["p_user"].apply(_get_other_p_user)
# duplicate each entry and assign as val data for each pseudo user
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append, it keeps the original row index of both frames)
select_val_data = pd.concat([select_val_data, select_val_data2])

In [23]:
# rows per pseudo user in the validation data
select_val_data["p_user"].value_counts()

4    182
3    182
2    135
1    135
Name: p_user, dtype: int64

In [24]:
# drop the leftover index column; errors="ignore" keeps this cell re-runnable
# (a second run would otherwise raise KeyError because the column is already gone)
select_val_data = select_val_data.drop(columns="__index_level_0__", errors="ignore")

In [25]:
# convert back to HF dataset
# preserve_index=False: the stacked frame has a repeated (non-Range) index, which
# from_pandas would otherwise store as an extra "__index_level_0__" column
select_val_data = Dataset.from_pandas(select_val_data, preserve_index=False)

In [28]:
# number of validation rows (each selected training row appears twice, once per pseudo user of its pair)
len(select_val_data)

634

### Create Test Data

In [31]:
# restrict the test split to the two selected users and move straight to pandas
select_test_data = get_new_dataset(dataset.test_data).to_pandas()

In [32]:
def user_to_p_user1(user):
    """Map an original user id to the first pseudo user of its pair.

    Returns 1 when ``user`` equals ``user1`` and 3 for any other value.
    """
    return 1 if user == user1 else 3

def user_to_p_user2(user):
    """Map an original user id to the second pseudo user of its pair.

    Returns 2 when ``user`` equals ``user1`` and 4 for any other value.
    """
    return 2 if user == user1 else 4

In [33]:
# Take the second copy before the p_user column is added to the first frame,
# then label the two copies with the first / second pseudo user of each pair.
select_test_data2 = select_test_data.copy()
test_users = select_test_data["user"]
select_test_data["p_user"] = test_users.apply(user_to_p_user1)
select_test_data2["p_user"] = test_users.apply(user_to_p_user2)

In [39]:
# stack both labelled copies so each test row appears once per pseudo user of its pair
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append, it keeps the original row index of both frames)
select_test_data = pd.concat([select_test_data, select_test_data2])

In [40]:
# inspect the original user -> pseudo user assignment for the test rows
select_test_data[["user", "p_user"]]

Unnamed: 0,user,p_user
0,A4MO9RO839BEF,1
1,A1B5MN8PY0JIJQ,3
2,A1B5MN8PY0JIJQ,3
3,A1B5MN8PY0JIJQ,3
4,A1B5MN8PY0JIJQ,3
...,...,...
145,A1B5MN8PY0JIJQ,4
146,A1B5MN8PY0JIJQ,4
147,A1B5MN8PY0JIJQ,4
148,A1B5MN8PY0JIJQ,4


In [41]:
# drop the leftover index column; errors="ignore" keeps this cell re-runnable
# (a second run would otherwise raise KeyError because the column is already gone)
select_test_data = select_test_data.drop(columns="__index_level_0__", errors="ignore")

In [42]:
# convert back to HF dataset
# preserve_index=False: the stacked frame has a repeated (non-Range) index, which
# from_pandas would otherwise store as an extra "__index_level_0__" column
select_test_data = Dataset.from_pandas(select_test_data, preserve_index=False)

In [43]:
# pull the p_user column of every split into a numpy array for the count check below
train_p_users, val_p_users, test_p_users = (
    np.array(split_data["p_user"])
    for split_data in (select_train_data, select_val_data, select_test_data)
)

In [44]:
# rows per pseudo user, printed for train, val and test in that order
for split_p_users in (train_p_users, val_p_users, test_p_users):
    print(np.unique(split_p_users, return_counts=True))

(array([1, 2, 3, 4]), array([67, 68, 91, 91]))
(array([1, 2, 3, 4]), array([135, 135, 182, 182]))
(array([1, 2, 3, 4]), array([75, 75, 75, 75]))


In [47]:
data_dir = "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data"
# save datasets: one sub-directory per split under a shared output folder
output_root = os.path.join(data_dir, "amazon_reviews_pseudo_user_data")
for split_name, split_data in (
    ("train", select_train_data),
    ("val", select_val_data),
    ("test", select_test_data),
):
    split_data.save_to_disk(os.path.join(output_root, split_name))