In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src/generic')
import time
import csv
import os
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
from tqdm import tqdm

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
assert torch.cuda.is_available(), "selected gpus, but cuda not available"

In [5]:
from datasets import Dataset
from torch.utils.data.sampler import SequentialSampler
from torch.utils.data.dataloader import DataLoader
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer, default_data_collator

from common.factories import get_embed_model

from dataset.amazon_reviews_clf_dataset import AmazonClfDataset

## Generate New Per-User Data Split

In [6]:
data_dir = "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data"

In [7]:
data_df = pd.read_csv(os.path.join(data_dir, 'amazon_v2.0/reviews.csv'),
                      dtype={'reviewerID':str, 'asin':str, 'reviewTime':str,'unixReviewTime':int,
                             'reviewText':str,'summary':str,'verified':bool,'category':str, 'reviewYear':int},
                      keep_default_na=False, na_values=[], quoting=csv.QUOTE_NONNUMERIC)

In [11]:
data_df

Unnamed: 0,reviewerID,asin,overall,reviewTime,unixReviewTime,reviewText,summary,verified,category,reviewYear
0,A14SJT4M0BP298,B0009RF9DW,5.0,"12 17, 2017",1513468800,Great product for relaxing!,Five Stars,True,All_Beauty,2017
1,A2XNLIC0O07GPW,B0009RF9DW,5.0,"05 25, 2017",1495670400,Bought along with their shampoo and lotion for...,Great Price For A Rental Welcome Gift,True,All_Beauty,2017
2,ASL42Q7LYJWFV,B0009RF9DW,5.0,"11 3, 2016",1478131200,Tried many but this is the one to buy.,A favorite shower bathing gel.,True,All_Beauty,2016
3,A3RUBIOZYJNY0D,B0009RF9DW,5.0,"09 24, 2016",1474675200,thank you,Five Stars,True,All_Beauty,2016
4,A28TUBAOIO801,B0009RF9DW,5.0,"06 3, 2016",1464912000,"My favorite body wash, suds up nice, leaves sk...",Can't go wrong with oil of Olay,True,All_Beauty,2016
...,...,...,...,...,...,...,...,...,...,...
10116942,A3R16UTKDL4FOA,B01F9HMO2K,4.0,"09 13, 2018",1536796800,"Most shooter games focus in the future, like C...",AMAZING GAME,True,Video_Games,2018
10116943,A2L167UOLOYOO0,B01GKF7T9S,5.0,"09 22, 2018",1537574400,Bought this for my brothers birthday and he re...,Great,True,Video_Games,2018
10116944,AS2J1Q4Y0ZM5A,B01GKGVI8U,3.0,"09 19, 2018",1537315200,science game i didn't love the game -owens,science game i didn't love the game,True,Video_Games,2018
10116945,A2EPYAWX8OG473,B01GW3POY0,5.0,"08 26, 2018",1535241600,Great game.,Recommended,True,Video_Games,2018


In [8]:
split_df = pd.read_csv(os.path.join(data_dir, 'amazon_v2.0', 'splits', 'user.csv'))

In [14]:
data_df["split"] = split_df["split"]

In [15]:
keep_df = data_df[split_df["split"] != -1]

In [16]:
keep_df

Unnamed: 0,reviewerID,asin,overall,reviewTime,unixReviewTime,reviewText,summary,verified,category,reviewYear,split
10,AKWBRE0JKA2A1,B0009RF9DW,5.0,"03 21, 2015",1426896000,Fragarant is strong enough to use without loti...,Satisfied!,True,All_Beauty,2015,0.0
19,A3HHQ7UIJJAOAV,B0009RF9DW,5.0,"07 14, 2014",1405296000,I love this for when I take a shower.,Five Stars,False,All_Beauty,2014,1.0
34,A3HHQ7UIJJAOAV,B0009RF9DW,5.0,"06 4, 2013",1370304000,We both love the shower gel. It smells so goo...,shower gel used by grandmom and grand-daughter...,True,All_Beauty,2013,1.0
88,AMYTL79JMGQ6D,B000URXP6E,5.0,"10 26, 2016",1477440000,To me this shampoo has to best smell. It clean...,My favorite shampoo,True,All_Beauty,2016,3.0
144,A173YMJ9XFVRSY,B0010ZBORW,5.0,"10 24, 2017",1508803200,Great for an at home spa experience. I have dr...,Moisturizing,False,All_Beauty,2017,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10116683,AAGQNG7TITMMX,B01GW3POY0,5.0,"03 14, 2017",1489449600,This game is scary as all get up. At first I w...,Every resident evil I've played has been diff...,False,Video_Games,2017,1.0
10116710,A8YYMBXZN2SEZ,B01GW3POY0,4.0,"01 24, 2017",1485216000,Just got it and if it is anything like any of ...,NICE,True,Video_Games,2017,0.0
10116755,A37AO20OXS51QA,B01GW3NY5Q,2.0,"03 23, 2017",1490227200,The tens and zeros are both jokes. It is IMPOS...,"Made poorly, not up to modern standards for op...",False,Video_Games,2017,3.0
10116878,A33M07VZBYRMPY,B00CJTA87A,5.0,"06 11, 2018",1528675200,I was a little disappointed when my new Ninten...,Absolutely needed,True,Video_Games,2018,1.0


In [17]:
keep_df.groupby('reviewerID').describe()

Unnamed: 0_level_0,overall,overall,overall,overall,overall,overall,overall,overall,unixReviewTime,unixReviewTime,...,reviewYear,reviewYear,split,split,split,split,split,split,split,split
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
reviewerID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A035230154WEA8JCP8HS,75.0,5.000000,0.000000,5.0,5.0,5.0,5.0,5.0,75.0,1.465407e+09,...,2017.0,2018.0,75.0,3.000000,0.000000,3.0,3.0,3.0,3.0,3.0
A100U1GGBSDMLL,75.0,4.600000,0.493197,4.0,4.0,5.0,5.0,5.0,75.0,1.501641e+09,...,2017.0,2018.0,75.0,1.000000,0.000000,1.0,1.0,1.0,1.0,1.0
A100UD67AHFODS,75.0,4.653333,0.877548,1.0,5.0,5.0,5.0,5.0,75.0,1.402377e+09,...,2015.5,2018.0,75.0,1.000000,0.000000,1.0,1.0,1.0,1.0,1.0
A101S5PLO0VRHQ,172.0,3.255814,1.276747,1.0,2.0,3.0,4.0,5.0,172.0,1.446569e+09,...,2016.0,2016.0,172.0,0.872093,0.994682,0.0,0.0,0.0,2.0,2.0
A102Z3T7NSM5KC,228.0,4.162281,0.793685,1.0,4.0,4.0,5.0,5.0,228.0,1.392896e+09,...,2014.0,2016.0,228.0,0.657895,0.941729,0.0,0.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZULU4TOTOLEU,400.0,4.340000,0.781618,1.0,4.0,4.0,5.0,5.0,400.0,1.448359e+09,...,2016.0,2018.0,400.0,0.375000,0.781602,0.0,0.0,0.0,0.0,2.0
AZV26LP92E6WU,184.0,4.788043,0.770826,1.0,5.0,5.0,5.0,5.0,184.0,1.374057e+09,...,2015.0,2017.0,184.0,0.815217,0.985461,0.0,0.0,0.0,2.0,2.0
AZV2U6GU5QA6C,75.0,4.573333,0.573593,2.0,4.0,5.0,5.0,5.0,75.0,1.410755e+09,...,2015.0,2017.0,75.0,3.000000,0.000000,3.0,3.0,3.0,3.0,3.0
AZW8YNESLMRRJ,170.0,4.900000,0.300886,4.0,5.0,5.0,5.0,5.0,170.0,1.446202e+09,...,2016.0,2018.0,170.0,0.882353,0.995989,0.0,0.0,0.0,2.0,2.0


In [18]:
my_split_df = split_df.copy()

In [22]:
# split user data with random 60-20-20 split
users = keep_df['reviewerID'].unique()

for user in users:
    user_ids = keep_df[keep_df['reviewerID'] == user].index
    full_count = len(user_ids)
    train_count = int(.6 * full_count)
    val_count = int(.2 * full_count)
    test_count = full_count - (train_count + val_count)
    assignments = [0] * train_count + [1] * val_count + [2] * test_count
    np.random.shuffle(assignments)
    keep_df.loc[user_ids, "split"] = assignments
    my_split_df.loc[user_ids, "split"] = assignments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [23]:
keep_df.groupby('reviewerID').describe()

Unnamed: 0_level_0,overall,overall,overall,overall,overall,overall,overall,overall,unixReviewTime,unixReviewTime,...,reviewYear,reviewYear,split,split,split,split,split,split,split,split
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
reviewerID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A035230154WEA8JCP8HS,75.0,5.000000,0.000000,5.0,5.0,5.0,5.0,5.0,75.0,1.465407e+09,...,2017.0,2018.0,75.0,0.600000,0.805387,0.0,0.0,0.0,1.0,2.0
A100U1GGBSDMLL,75.0,4.600000,0.493197,4.0,4.0,5.0,5.0,5.0,75.0,1.501641e+09,...,2017.0,2018.0,75.0,0.600000,0.805387,0.0,0.0,0.0,1.0,2.0
A100UD67AHFODS,75.0,4.653333,0.877548,1.0,5.0,5.0,5.0,5.0,75.0,1.402377e+09,...,2015.5,2018.0,75.0,0.600000,0.805387,0.0,0.0,0.0,1.0,2.0
A101S5PLO0VRHQ,172.0,3.255814,1.276747,1.0,2.0,3.0,4.0,5.0,172.0,1.446569e+09,...,2016.0,2016.0,172.0,0.604651,0.806103,0.0,0.0,0.0,1.0,2.0
A102Z3T7NSM5KC,228.0,4.162281,0.793685,1.0,4.0,4.0,5.0,5.0,228.0,1.392896e+09,...,2014.0,2016.0,228.0,0.609649,0.808160,0.0,0.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZULU4TOTOLEU,400.0,4.340000,0.781618,1.0,4.0,4.0,5.0,5.0,400.0,1.448359e+09,...,2016.0,2018.0,400.0,0.600000,0.801002,0.0,0.0,0.0,1.0,2.0
AZV26LP92E6WU,184.0,4.788043,0.770826,1.0,5.0,5.0,5.0,5.0,184.0,1.374057e+09,...,2015.0,2017.0,184.0,0.608696,0.809189,0.0,0.0,0.0,1.0,2.0
AZV2U6GU5QA6C,75.0,4.573333,0.573593,2.0,4.0,5.0,5.0,5.0,75.0,1.410755e+09,...,2015.0,2017.0,75.0,0.600000,0.805387,0.0,0.0,0.0,1.0,2.0
AZW8YNESLMRRJ,170.0,4.900000,0.300886,4.0,5.0,5.0,5.0,5.0,170.0,1.446202e+09,...,2016.0,2018.0,170.0,0.600000,0.802363,0.0,0.0,0.0,1.0,2.0


In [24]:
sum(my_split_df["split"] != -1)

539502

In [25]:
sum(my_split_df["split"] == 0)

323189

In [26]:
323189 / 539502

0.5990506059291717

In [27]:
my_split_df.to_csv("/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/splits/my_user_split.csv", index=False)

## Get Smaller Subset of this for Debugging (100 users)