In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src/generic')
import csv
import os
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
from sklearn.metrics import log_loss

## Explore User Split Data

In [3]:
data_dir = "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data"

In [4]:
data_df = pd.read_csv(os.path.join(data_dir, 'amazon_v2.0/reviews.csv'),
                      dtype={'reviewerID':str, 'asin':str, 'reviewTime':str,'unixReviewTime':int,
                             'reviewText':str,'summary':str,'verified':bool,'category':str, 'reviewYear':int},
                      keep_default_na=False, na_values=[], quoting=csv.QUOTE_NONNUMERIC)

In [6]:
split_df = pd.read_csv(os.path.join(data_dir, 'amazon_v2.0', 'splits', 'user.csv'))

In [37]:
data_df["split"] = split_df["split"]

In [38]:
# look at # of reviews per split

data_df.groupby(["split"]).count()["overall"]

split
-1.0    9577445
 0.0     245502
 1.0     100050
 2.0      46950
 3.0     100050
 4.0      46950
Name: overall, dtype: int64

In [15]:
data_df.groupby(["split", "reviewerID"]).count()["overall"].groupby(["split"]).count()

split
-1.0    155638
 0.0      1252
 1.0      1334
 2.0       626
 3.0      1334
 4.0       626
Name: overall, dtype: int64

In [17]:
train_users = set(data_df[data_df["split"] == 0]["reviewerID"])
val_id_users = set(data_df[data_df["split"] == 2]["reviewerID"])
test_id_users = set(data_df[data_df["split"] == 4]["reviewerID"])

In [18]:
val_id_users.issubset(train_users)

True

In [19]:
test_id_users.issubset(train_users)

True

In [20]:
len(val_id_users.intersection(test_id_users))

0

In [21]:
# select samples from each user to use to assess validation perf
# first look at # of samples per user
train_df = data_df[data_df["split"] == 0]

In [45]:
train_df.index

Int64Index([      10,      154,      254,      265,      280,      281,
                 303,      305,      310,      327,
            ...
            10115589, 10115736, 10115812, 10116146, 10116277, 10116481,
            10116581, 10116604, 10116659, 10116710],
           dtype='int64', length=245502)

In [44]:
user_train_counts = train_df.groupby(["reviewerID"]).count()["overall"]
user_train_counts

reviewerID
A101S5PLO0VRHQ     97
A102Z3T7NSM5KC    153
A103M7E0BSFC83    349
A10AWAE0F6CAP1    361
A10B6G6W3DW1EY     98
                 ... 
AZK12FBX7X7LN     195
AZTFYD42KA36L      76
AZULU4TOTOLEU     325
AZV26LP92E6WU     109
AZW8YNESLMRRJ      95
Name: overall, Length: 1252, dtype: int64

In [23]:
user_train_counts.min()

75

In [24]:
user_train_counts.mean()

196.08785942492014

In [26]:
sum(user_train_counts == 75)

10

In [39]:
# use 60% for training and %40 for validation
# make a new split column
data_df["my_split"] = data_df["split"]

In [40]:
# get rid of val OOD, val ID, test OOD
data_df["my_split"][data_df["split"].isin([1, 2, 3])] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df["my_split"][data_df["split"].isin([1, 2, 3])] = -1


In [42]:
# keep test ID the same --> convert to test (2)
data_df["my_split"][data_df["split"] == 4] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df["my_split"][data_df["split"] == 4] = 2


In [46]:
# move some train data to val (1), but otherwise keep the same
for user in user_train_counts.index:
    user_idx = train_df[train_df["reviewerID"] == user].index
    val_count = int(len(user_idx) * .4)
    val_idx = np.random.choice(user_idx, val_count, replace=False)
    data_df.loc[val_idx, "my_split"] = 1

In [47]:
# number of reviews per split
data_df.groupby(["my_split"]).count()["overall"]

my_split
-1.0    9824495
 0.0     147799
 1.0      97703
 2.0      46950
Name: overall, dtype: int64

In [49]:
# examine my user split
# number of users per split
data_df.groupby(["my_split", "reviewerID"]).count()["overall"].groupby(["my_split"]).count()

my_split
-1.0    155648
 0.0      1252
 1.0      1252
 2.0       626
Name: overall, dtype: int64

In [51]:
set(data_df[data_df["my_split"] == 0]["reviewerID"]) == set(data_df[data_df["my_split"] == 1]["reviewerID"])

True

In [52]:
set(data_df[data_df["my_split"] == 2]["reviewerID"]).issubset(set(data_df[data_df["my_split"] == 0]["reviewerID"]))

True

In [54]:
# save split
my_split_df = data_df[["my_split"]]
my_split_df["clean"] = split_df["clean"]
my_split_df = my_split_df.rename(columns={"my_split": "split"})
my_split_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_split_df["clean"] = split_df["clean"]


Unnamed: 0,split,clean
0,-1.0,True
1,-1.0,True
2,-1.0,True
3,-1.0,False
4,-1.0,True
...,...,...
10116942,-1.0,True
10116943,-1.0,True
10116944,-1.0,True
10116945,-1.0,False


In [55]:
my_split_df.to_csv(os.path.join(data_dir, 'amazon_v2.0', 'splits', 'wilds_subpop_shift_user.csv'), index=False)

In [56]:
# create debug split
# take 25 users from test data & also use them for train and val
test_df = data_df[data_df["my_split"] == 2]
test_users = list(set(test_df["reviewerID"]))
debug_users = np.random.choice(test_users, 25, replace=False)

In [63]:
len(debug_users)

25

In [57]:
is_debug_user = data_df["reviewerID"].isin(debug_users)

In [64]:
sum(is_debug_user)

7156

In [67]:
my_split_df2 = my_split_df.copy()

In [68]:
remove_idx = my_split_df2[~is_debug_user].index
my_split_df2.loc[remove_idx, "split"] = -1

In [69]:
data_df["debug_split"] = my_split_df2["split"]

In [70]:
data_df.groupby(["debug_split"]).count()["overall"]

debug_split
-1.0    10110299
 0.0        2873
 1.0        1900
 2.0        1875
Name: overall, dtype: int64

In [71]:
data_df.groupby(["debug_split", "reviewerID"]).count()["overall"].groupby(["debug_split"]).count()

debug_split
-1.0    155656
 0.0        25
 1.0        25
 2.0        25
Name: overall, dtype: int64

In [72]:
my_split_df2

Unnamed: 0,split,clean
0,-1.0,True
1,-1.0,True
2,-1.0,True
3,-1.0,False
4,-1.0,True
...,...,...
10116942,-1.0,True
10116943,-1.0,True
10116944,-1.0,True
10116945,-1.0,False


In [73]:
my_split_df2.to_csv(os.path.join(data_dir, 'amazon_v2.0', 'splits', 'debug_user.csv'), index=False)