In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded
# before each cell runs and edits to project source are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the project's generic source tree to the import path (relative to the notebook's
# working directory). NOTE(review): the `common` and `dataset` packages imported in later
# cells presumably live under this directory — confirm.
sys.path.append('../../src/generic')
import csv
import os
import numpy as np
import pandas as pd
import seaborn as sns
# Apply seaborn's default theme to any plots produced later in the notebook.
sns.set_theme()

In [3]:
# Restrict this process to GPUs 0-3. The variable is set before torch is imported so that
# it is already in place by the time CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
# Fail fast if torch cannot see a usable CUDA device.
assert torch.cuda.is_available(), "selected gpus, but cuda not available"

In [4]:
from transformers import AutoConfig, AutoModelForMaskedLM

In [5]:
from common.factories import get_embed_model

In [6]:
from dataset.amazon_reviews_clf_dataset import AmazonClfDataset

## Get Review Embeddings

In [7]:
# Keyword arguments for AmazonClfDataset: data directory and raw file, tokenizer name and
# its cache directory, the split file, and the processed-data directory name.
data_kwargs = {
    "data_dir": "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/",
    "raw_data_file": "reviews.csv",
    "tokenizer_name": "distilbert-base-uncased",
    "tokenizer_cache_dir": "/data/ddmg/redditlanguagemodeling/cached/distilbert",
    "split_file": "user.csv",
    "processed_data_dir": "amazon_reviews_clf_processed",
}

In [8]:
# Build the Amazon reviews classification dataset from the settings in data_kwargs.
# NOTE(review): the saved log line below suggests previously processed data is loaded from
# processed_data_dir rather than rebuilt — confirm against AmazonClfDataset.
dataset = AmazonClfDataset(**data_kwargs)

loading processed data from /data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/amazon_reviews_clf_processed


In [9]:
# Inspect the training split (columns and row count) before any embeddings are added.
dataset.train_data

Dataset({
    features: ['__index_level_0__', 'asin', 'attention_mask', 'category', 'input_ids', 'labels', 'reviewTime', 'reviewYear', 'split', 'summary', 'text', 'unixReviewTime', 'user', 'verified'],
    num_rows: 245502
})

In [10]:
# Name of the pretrained checkpoint used to embed the reviews. Both the config and the
# weights are loaded from this single name, so the two cannot drift apart if the
# checkpoint is changed later.
embed_model_config = "distilbert-base-uncased"
config = AutoConfig.from_pretrained(embed_model_config)
embed_model = AutoModelForMaskedLM.from_pretrained(embed_model_config, config=config)

In [None]:
# NOTE(review): embed_path is presumably where embed_data stores or caches the computed
# embeddings — confirm against AmazonClfDataset.embed_data.
embed_path = "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data/amazon_v2.0/embeddings/pretrained_distilbert_embeddings"
# Long-running step: the saved progress bar shows about 2h11m elapsed at 25% of the
# train split alone.
dataset.embed_data(embed_model, embed_path)

  0%|          | 0/30688 [00:00<?, ?it/s]

embedding train data


 25%|██▌       | 7813/30688 [2:11:37<6:44:00,  1.06s/it] 

In [12]:
# Re-inspect the training split; after embed_data it should list an 'embeddings' column.
dataset.train_data

Dataset({
    features: ['__index_level_0__', 'asin', 'attention_mask', 'category', 'input_ids', 'labels', 'reviewTime', 'reviewYear', 'split', 'summary', 'text', 'unixReviewTime', 'user', 'verified', 'embeddings'],
    num_rows: 245502
})

In [13]:
# Merge the dataset's splits into a single dataset so clustering sees every example
# (per the saved output: 445,602 rows in total versus 245,502 in the train split).
full_dataset = dataset.merge_data_splits()

In [14]:
# Inspect the merged dataset: columns (including 'embeddings') and total row count.
full_dataset

Dataset({
    features: ['__index_level_0__', 'asin', 'attention_mask', 'category', 'input_ids', 'labels', 'reviewTime', 'reviewYear', 'split', 'summary', 'text', 'unixReviewTime', 'user', 'verified', 'embeddings'],
    num_rows: 445602
})

In [19]:
# Materialise the 'embeddings' column as a single NumPy array for the clustering model
# (the saved output reports shape (445602, 768), i.e. one 768-dimensional row per example).
embeds = np.array(full_dataset['embeddings'])

In [20]:
# Sanity check: expect (number of examples, embedding dimension).
embeds.shape

(445602, 768)

## Train GMM-based Clustering Model

In [16]:
from sklearn.mixture import GaussianMixture

In [None]:
# Start with k=10 as an example; other values of k are examined later.
GMM_N_COMPONENTS = 10
# GaussianMixture initialisation is randomised, so pin the seed to make the fitted
# clusters reproducible across kernel restarts.
GMM_RANDOM_STATE = 0
gmm = GaussianMixture(
    n_components=GMM_N_COMPONENTS,
    random_state=GMM_RANDOM_STATE,
).fit(embeds)