In [1]:
%load_ext autoreload
%autoreload 2

In [171]:
import sys
sys.path.append('../../src/generic')
import csv
import os
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
from copy import deepcopy
from scipy.spatial.distance import cosine
from sklearn.metrics import log_loss
from sklearn.metrics.pairwise import cosine_similarity

In [150]:
from datasets import Dataset

In [19]:
from dataset.amazon_reviews_clf_dataset import AmazonClfDataset
from results.process_results import ResultProcessor

In [3]:
data_dir = "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data"

In [4]:
data_df = pd.read_csv(os.path.join(data_dir, 'amazon_v2.0/reviews.csv'),
                      dtype={'reviewerID':str, 'asin':str, 'reviewTime':str,'unixReviewTime':int,
                             'reviewText':str,'summary':str,'verified':bool,'category':str, 'reviewYear':int},
                      keep_default_na=False, na_values=[], quoting=csv.QUOTE_NONNUMERIC)

In [5]:
split_df = pd.read_csv(os.path.join(data_dir, 'amazon_v2.0', 'splits', 'wilds_subpop_shift_user.csv'))

In [6]:
data_df["split"] = split_df["split"]

In [7]:
data_df = data_df[data_df["split"].isin([0, 1, 2])]

In [8]:
len(data_df)

292452

In [56]:
# limit to just users in the test set
print(len(data_df["reviewerID"].unique()))
print(len(data_df[data_df["split"] == 1]["reviewerID"].unique()))
print(len(data_df[data_df["split"] == 2]["reviewerID"].unique()))

test_users = data_df[data_df["split"] == 2]["reviewerID"].unique()

test_user_df = data_df[data_df["reviewerID"].isin(test_users)]

1252
1252
626


In [9]:
len(data_df[data_df["split"] == 1])

97703

In [11]:
def count_1(x):
    return sum(x == 1)

count_1.__name__ = "count_1"

def count_2(x):
    return sum(x == 2)

count_2.__name__ = "count_2"

def count_3(x):
    return sum(x == 3)

count_3.__name__ = "count_3"

def count_4(x):
    return sum(x == 4)

count_4.__name__ = "count_4"

def count_5(x):
    return sum(x == 5)

count_5.__name__ = "count_5"

In [13]:
count_fns = [count_1, count_2, count_3, count_4, count_5]

In [57]:
score_df = test_user_df[["reviewerID", "overall"]]

In [58]:
# Examine distribution of user ratings and get users with different distributions
review_dist_by_user = score_df.groupby(["reviewerID"]).agg(count_fns)

In [76]:
# get distr for just train data
train_score_df = test_user_df[test_user_df["split"] == 0][["reviewerID", "overall"]]
train_review_dist_by_user = train_score_df.groupby(["reviewerID"]).agg(count_fns)

In [91]:
# get distr for just train data
val_score_df = test_user_df[test_user_df["split"] == 1][["reviewerID", "overall"]]
val_review_dist_by_user = val_score_df.groupby(["reviewerID"]).agg(count_fns)

In [81]:
# get distr for just test data
test_score_df = test_user_df[test_user_df["split"] == 2][["reviewerID", "overall"]]
test_review_dist_by_user = test_score_df.groupby(["reviewerID"]).agg(count_fns)

In [77]:
train_review_dist_by_user

Unnamed: 0_level_0,overall,overall,overall,overall,overall
Unnamed: 0_level_1,count_1,count_2,count_3,count_4,count_5
reviewerID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A103M7E0BSFC83,0.0,0.0,25.0,90.0,95.0
A10AWAE0F6CAP1,0.0,0.0,0.0,76.0,141.0
A10JAUCIGVRW9F,2.0,3.0,4.0,14.0,187.0
A10L137T790W84,0.0,0.0,8.0,61.0,67.0
A10PXQY535XY39,0.0,0.0,2.0,36.0,55.0
...,...,...,...,...,...
AY3XPKRAMKKY7,2.0,1.0,11.0,14.0,62.0
AY5E0O8II03DY,2.0,5.0,12.0,29.0,34.0
AYQ14XGJODM9J,0.0,2.0,7.0,17.0,82.0
AYTMXLYK6SC9H,0.0,1.0,7.0,15.0,39.0


In [59]:
review_dist_by_user

Unnamed: 0_level_0,overall,overall,overall,overall,overall
Unnamed: 0_level_1,count_1,count_2,count_3,count_4,count_5
reviewerID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A103M7E0BSFC83,0.0,3.0,62.0,179.0,180.0
A10AWAE0F6CAP1,0.0,0.0,0.0,149.0,287.0
A10JAUCIGVRW9F,2.0,6.0,10.0,28.0,378.0
A10L137T790W84,0.0,0.0,17.0,124.0,160.0
A10PXQY535XY39,0.0,1.0,8.0,73.0,147.0
...,...,...,...,...,...
AY3XPKRAMKKY7,3.0,6.0,19.0,34.0,163.0
AY5E0O8II03DY,4.0,10.0,31.0,71.0,95.0
AYQ14XGJODM9J,0.0,4.0,16.0,36.0,198.0
AYTMXLYK6SC9H,7.0,8.0,19.0,40.0,104.0


In [60]:
review_distrs = review_dist_by_user.to_numpy()

In [61]:
# get pairwise distances
pairwise_sims = cosine_similarity(review_distrs)

In [62]:
# get least similar dists
min_idx = np.unravel_index(np.argmin(pairwise_sims), pairwise_sims.shape)

In [63]:
pairwise_sims[min_idx]

0.044803727825239045

In [64]:
min_idx

(30, 120)

In [66]:
review_distrs[30]

array([  0.,   0.,   0.,   0., 281.])

In [67]:
review_distrs[120]

array([  0.,   0.,   5., 156.,   7.])

In [68]:
cosine(review_distrs[30], review_distrs[120]) - 1

-0.044803727825239004

In [69]:
user_1 = review_dist_by_user.iloc[30].name
user_2 = review_dist_by_user.iloc[120].name

In [70]:
print(user_1, user_2)

A17EGVTXRDFTRS A1POJZWVDNEX2T


## Examine Model Perf for Different Users

### Global Model

In [17]:
main_result_dir = "/data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split"

In [20]:
base_result_dir = os.path.join(main_result_dir, "eval_global_model")
levels = ["train_seed", "eval_seed"]
global_results = ResultProcessor(base_result_dir, levels, verbose=True)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/eval_global_model: Found results for 3 train_seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/eval_global_model/42: Found results for 1 eval_seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/eval_global_model/44: Found results for 1 eval_seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/eval_global_model/43: Found results for 1 eval_seeds


In [21]:
group_results_df = global_results.get_group_results("user")

looking for group result column names from file at: /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/eval_global_model/42/42/user_test_results.json
looking for group result column names from file at: /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/eval_global_model/44/42/user_test_results.json
looking for group result column names from file at: /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/eval_global_model/43/42/user_test_results.json


In [22]:
global_user_results = group_results_df.groupby(["user"]).agg("mean")[["loss", "accuracy"]]
global_user_results

Unnamed: 0_level_0,loss,accuracy
user,Unnamed: 1_level_1,Unnamed: 2_level_1
A103M7E0BSFC83,0.975567,0.595556
A10AWAE0F6CAP1,0.621960,0.671111
A10JAUCIGVRW9F,0.591436,0.693333
A10L137T790W84,0.182477,0.933333
A10PXQY535XY39,0.417040,0.786667
...,...,...
AY3XPKRAMKKY7,0.606011,0.755556
AY5E0O8II03DY,0.708066,0.666667
AYQ14XGJODM9J,0.614990,0.720000
AYTMXLYK6SC9H,0.708573,0.711111


In [None]:
gobal_user_results.index

In [71]:
# get results for the specific users
global_user_results.loc[user_1]

loss        0.224817
accuracy    0.991111
Name: A17EGVTXRDFTRS, dtype: float64

In [72]:
global_user_results.loc[user_2]

loss        0.769321
accuracy    0.560000
Name: A1POJZWVDNEX2T, dtype: float64

### Global + User Fine-tune

In [23]:
base_result_dir = os.path.join(main_result_dir, "finetune_user")
levels = ["user", "seed"]
ft_results = ResultProcessor(base_result_dir, levels, verbose=True)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user: Found results for 626 users
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2YQ0ULTPTMHJS: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A29A1CPI3AOYSD: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3F7GJ547NUYY5: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AMNZH0DQ14X4H: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A16A3MYURJDYS7: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A32DYU

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1ZCEJEA67P6DE: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A34BRH9R7DHXWU: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2GWTB02UN6MTX: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3HU2WUBWT20P8: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A18QI1V17JMUEC: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2SH6A32BE6NEV: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetu

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3T5VCXB3DRKCK: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2XKX0EWBR2ZT0: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A6SR7U2PNGN0N: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1I98GJAHJHSAM: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A55ER33W947XU: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2FNC5NMEG2340: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1YTMISDM7N0B7: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3IJW8OF1ZKH0K: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1LYT31WSI9NVL: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2GVWOGCJN0FKH: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1Q4Q4YONL3VIT: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3ABPSWQZDPHI7: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetu

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3Q6LH3YX5H41P: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/ACUJMLOJEVYTB: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1EJXET3VTLEIE: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A19XMHRB3G4DIR: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2IG937LQKWO7C: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2SDFM6RV0SQ4: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1MDSZSHCMNCLN: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A389670FO394S5: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A306UFMRUW6OTV: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A21FZK6X6CVS2Q: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2X0DTXQDEK1QB: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1MCH5RXDOH87H: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetu

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A5VGCYS7J6JVG: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2O5RT4RCC4NU2: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2O1EXR7TDA273: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A29TWAFI927A05: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A5A327BVHRNS5: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3TUZGW2KDZ93K: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A633NIAR4JJAR: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A16DSXRAN5QK94: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A31Z48BHHUTB0P: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AINGNE1J86KTH: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3KMEQP8CSCPC2: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3SE1XKZC0L913: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1QVQ7UJY4VASG: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A3DXZEH4CNKFYG: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A12KOCSLMK9RSN: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1WT56BJ1VQDQY: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AKGF27HN0OF19: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A152EA7ZPCA1Q8: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetun

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1BC62X6HZJGZR: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1VYZLUB3UXCR1: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AO1BAEKHGHNPH: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A27KU0C98ZSTM9: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A2TLU5ODLGESZ6: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A14UX1QSUH387Z: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetun

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AXECQUXHLB0A3: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A297R44X7WEQ8T: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A34JRGRUMOD5GO: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AHVAWUTYVIQA6: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1VHK9A4VLJTHC: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A32FO4CA2NKEBS: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A35QGMRK9PTY3C: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AQ55H30MZSV0Q: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1KSGE7FCMX6SY: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1CB9F7TVZ7IRT: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/AL7LEBKKW8XCC: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune_user/A1SXDVX98NJLHZ: Found results for 3 seeds
Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/clf/wilds_subpop_user_split/finetune

In [24]:
ft_user_results = ft_results.results_df.groupby(["user"]).agg(["mean", "std"])[["test_loss", "test_accuracy"]]
ft_user_results

Unnamed: 0_level_0,test_loss,test_loss,test_accuracy,test_accuracy
Unnamed: 0_level_1,mean,std,mean,std
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A103M7E0BSFC83,0.898162,0.000158,0.631111,0.015396
A10AWAE0F6CAP1,0.679093,0.005264,0.693333,0.000000
A10JAUCIGVRW9F,0.362298,0.000432,0.906667,0.000000
A10L137T790W84,0.171490,0.003645,0.933333,0.000000
A10PXQY535XY39,0.439986,0.002066,0.800000,0.000000
...,...,...,...,...
AY3XPKRAMKKY7,0.702691,0.002751,0.800000,0.000000
AY5E0O8II03DY,0.653936,0.004155,0.755556,0.007698
AYQ14XGJODM9J,0.657953,0.003461,0.706667,0.000000
AYTMXLYK6SC9H,0.762694,0.003142,0.680000,0.000000


In [73]:
# get results for specific users
ft_user_results.loc[user_1]

test_loss      mean    0.000029
               std     0.000003
test_accuracy  mean    1.000000
               std     0.000000
Name: A17EGVTXRDFTRS, dtype: float64

In [75]:
ft_user_results.loc[user_2]

test_loss      mean    0.194791
               std     0.000228
test_accuracy  mean    0.920000
               std     0.000000
Name: A1POJZWVDNEX2T, dtype: float64

In [78]:
# get naive baselines for these users
train_review_dist_by_user.loc[user_1]

overall  count_1      0.0
         count_2      0.0
         count_3      0.0
         count_4      0.0
         count_5    124.0
Name: A17EGVTXRDFTRS, dtype: float64

In [82]:
test_review_dist_by_user.loc[user_1]

overall  count_1     0.0
         count_2     0.0
         count_3     0.0
         count_4     0.0
         count_5    75.0
Name: A17EGVTXRDFTRS, dtype: float64

In [79]:
train_review_dist_by_user.loc[user_2]

overall  count_1     0.0
         count_2     0.0
         count_3     2.0
         count_4    53.0
         count_5     1.0
Name: A1POJZWVDNEX2T, dtype: float64

In [83]:
test_review_dist_by_user.loc[user_2]

overall  count_1     0.0
         count_2     0.0
         count_3     2.0
         count_4    70.0
         count_5     3.0
Name: A1POJZWVDNEX2T, dtype: float64

In [85]:
70/75

0.9333333333333333

## Another idea: Take users and break their validation data into multiple training sets

In [88]:
# 1. Take 2 random users
chosen_users = np.random.choice(test_users, 2, replace=False)
chosen_users

array(['A4MO9RO839BEF', 'A1B5MN8PY0JIJQ'], dtype=object)

In [95]:
user_1, user_2 = chosen_users[0], chosen_users[1]

In [96]:
review_dist_by_user.loc[user_1]

overall  count_1      0.0
         count_2      0.0
         count_3      1.0
         count_4     44.0
         count_5    254.0
Name: A4MO9RO839BEF, dtype: float64

In [97]:
val_review_dist_by_user.loc[user_1]

overall  count_1     0.0
         count_2     0.0
         count_3     1.0
         count_4    13.0
         count_5    75.0
Name: A4MO9RO839BEF, dtype: float64

In [104]:
75 / (75 + 13 + 1)

0.8426966292134831

In [98]:
review_dist_by_user.loc[user_2]

overall  count_1      0.0
         count_2      4.0
         count_3     21.0
         count_4    139.0
         count_5    213.0
Name: A1B5MN8PY0JIJQ, dtype: float64

In [99]:
val_review_dist_by_user.loc[user_2]

overall  count_1     0.0
         count_2     1.0
         count_3     7.0
         count_4    44.0
         count_5    68.0
Name: A1B5MN8PY0JIJQ, dtype: float64

In [105]:
68 / (68 + 44 + 8)

0.5666666666666667

In [100]:
global_user_results.loc[user1]

loss        0.287309
accuracy    0.893333
Name: A4MO9RO839BEF, dtype: float64

In [101]:
global_user_results.loc[user2]

loss        0.603546
accuracy    0.702222
Name: A1B5MN8PY0JIJQ, dtype: float64

In [102]:
ft_user_results.loc[user1]

test_loss      mean    0.343658
               std     0.000679
test_accuracy  mean    0.893333
               std     0.000000
Name: A4MO9RO839BEF, dtype: float64

In [108]:
ft_user_results.loc[user2]

test_loss      mean    0.745628
               std     0.021004
test_accuracy  mean    0.688889
               std     0.007698
Name: A1B5MN8PY0JIJQ, dtype: float64

### Create new datasets for experiments with these users

Split their training data in 2 parts, to create two new pseudo users for each original user. Use full training data from original user as validation data for each user + full test data from original user as test data.

Experiments:
1. Fine-tune on all both users (what happens when you apply uniform similarity matrix?)
2. User-specific fine-tune (need to rerun with training data used to select early stopping point)
3. User weight trainer with the 4 new users (can the model identify the correct similarity relationships? --> use dataset 2

In [106]:
# load exisiting data
data_kwargs = dict(
    data_dir="/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/",
    raw_data_file="reviews.csv",
    tokenizer_name="distilbert-base-uncased",
    tokenizer_cache_dir="/data/ddmg/redditlanguagemodeling/cached/distilbert",
    split_file="wilds_subpop_shift_user.csv",
    processed_data_dir="amazon_reviews_clf_processed_with_my_subpop_shift_embeds",
)

In [107]:
dataset = AmazonClfDataset(**data_kwargs)

loading processed data from /data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/amazon_reviews_clf_processed_with_my_subpop_shift_embeds


In [110]:
train_data = dataset.train_data

In [199]:
get_new_dataset(dataset.test_data)

Dataset({
    features: ['__index_level_0__', 'asin', 'attention_mask', 'category', 'input_ids', 'labels', 'reviewTime', 'reviewYear', 'sample_id', 'split', 'summary', 'text', 'unixReviewTime', 'user', 'verified', 'embeddings'],
    num_rows: 150
})

In [134]:
select_train_data

Dataset({
    features: ['__index_level_0__', 'asin', 'attention_mask', 'category', 'input_ids', 'labels', 'reviewTime', 'reviewYear', 'sample_id', 'split', 'summary', 'text', 'unixReviewTime', 'user', 'verified', 'embeddings'],
    num_rows: 317
})

In [137]:
len(user1_idx) + len(user2_idx)

317

In [112]:
# create new dataset with just the 2 selected users + split their data into 4 part

def get_new_dataset(dataset):
    users = dataset["user"]
    keep_idx = [i for i in range(len(users)) if users[i] in {user1, user2}]
    return dataset.select(keep_idx)

In [157]:
select_train_data = get_new_dataset(train_data)

In [158]:
select_val_data = get_new_dataset(dataset.val_data)

In [200]:
select_test_data = get_new_dataset(dataset.test_data)

In [160]:
train_user_ids = select_train_data["user"]

In [161]:
user1_idx = np.argwhere(np.array(train_user_ids) == user1)

In [162]:
user2_idx = np.argwhere(np.array(train_user_ids) == user2)

In [163]:
# split these indicies into two groups (one for each pseudo user)
p1_index = np.random.choice(user1_idx.flatten(), int(len(user1_idx) / 2), replace=False)

In [164]:
p2_index = [elm for elm in user1_idx.flatten() if elm not in p1_index]

In [165]:
p3_index = np.random.choice(user2_idx.flatten(), int(len(user2_idx) / 2), replace=False)

In [166]:
p4_index = [elm for elm in user2_idx.flatten() if elm not in p3_index]

In [167]:
len(p1_index) + len(p2_index) + len(p3_index) + len(p4_index)

317

In [168]:
p_users = np.full(317, -1)

In [169]:
p_users[p1_index] = "1"
p_users[p2_index] = "2"
p_users[p3_index] = "3"
p_users[p4_index] = "4"
np.unique(p_users, return_counts=True)

(array([1, 2, 3, 4]), array([67, 68, 91, 91]))

In [173]:
p_to_o_user = {1: user1, 2: user1, 3: user2, 4: user2}

In [170]:
# add this column to dataset
select_train_data = select_train_data.to_pandas()
select_train_data = Dataset.from_pandas(select_train_data)  # get rid of old rows in PyArrow Table
select_train_data = select_train_data.add_column(name="p_user", column=p_users.tolist())

In [177]:
# create new validation dataset with training data 
# duplicate train data from each of the original users so that it is used for both of the pseudo users
select_val_data = select_train_data.to_pandas()
# assign p_users to be single user
def _map_to_single_user(x):
    if x == 2:
        return 1
    if x == 4:
        return 3
    return x
select_val_data["p_user"] = select_val_data["p_user"].apply(lambda x: _map_to_single_user(x))
select_val_data2 = select_val_data.copy()
def _get_other_p_user(x):
    if x == 1:
        return 2
    return 4
select_val_data2["p_user"] = select_val_data["p_user"].apply(lambda x: _get_other_p_user(x))
# duplicate each entry and assign as val data for each psuedo user
select_val_data = select_val_data.append(select_val_data2)

In [180]:
select_val_data["p_user"].value_counts()

4    182
3    182
2    135
1    135
Name: p_user, dtype: int64

In [181]:
select_val_data[["user", "p_user"]]

Unnamed: 0,user,p_user
0,A4MO9RO839BEF,1
1,A4MO9RO839BEF,1
2,A1B5MN8PY0JIJQ,3
3,A1B5MN8PY0JIJQ,3
4,A1B5MN8PY0JIJQ,3
...,...,...
312,A1B5MN8PY0JIJQ,4
313,A1B5MN8PY0JIJQ,4
314,A4MO9RO839BEF,2
315,A1B5MN8PY0JIJQ,4


In [185]:
select_val_data = select_val_data.drop(columns="__index_level_0__")

In [186]:
# convert back to HF dataset
select_val_data = Dataset.from_pandas(select_val_data)

In [201]:
# duplicate test data for so it exists for each p user
select_test_data = select_test_data.to_pandas(select_test_data)

In [202]:
select_test_data["p_user"] = select_test_data["user"]

In [204]:
def user_to_p_user1(user):
    if user == user1:
        return 1
    return 3

def user_to_p_user2(user):
    if user == user2:
        return 2
    return 4

select_test_data2 = select_test_data.copy()
select_test_data["p_user"] = select_test_data["user"].apply(lambda x: user_to_p_user1(x))
select_test_data2["p_user"] = select_test_data["user"].apply(lambda x: user_to_p_user2(x))

In [205]:
select_test_data = select_test_data.append(select_test_data2)

In [206]:
select_test_data = select_test_data.drop(columns="__index_level_0__")

In [207]:
select_test_data = Dataset.from_pandas(select_test_data)

In [208]:
train_p_users = np.array(select_train_data["p_user"])
val_p_users = np.array(select_val_data["p_user"])
test_p_users = np.array(select_test_data["p_user"])

In [209]:
print(np.unique(train_p_users, return_counts=True))
print(np.unique(val_p_users, return_counts=True))
print(np.unique(test_p_users, return_counts=True))

(array([1, 2, 3, 4]), array([67, 68, 91, 91]))
(array([1, 2, 3, 4]), array([135, 135, 182, 182]))
(array([1, 2, 3, 4]), array([75, 75, 75, 75]))


In [212]:
# save datasets
select_train_data.save_to_disk(os.path.join(data_dir, "amazon_reviews_pseudo_user_data", "train_"))
select_val_data.save_to_disk(os.path.join(data_dir, "amazon_reviews_pseudo_user_data", "val_"))
select_test_data.save_to_disk(os.path.join(data_dir, "amazon_reviews_pseudo_user_data", "test_"))

NameError: name 'dataset_dir' is not defined