In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import csv
import os

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import softmax
sns.set_theme()

In [3]:
# Root directory of the Amazon Reviews data. Defaults to the original absolute
# location; set the AMAZON_REVIEWS_DATA_DIR environment variable to run the
# notebook against a copy of the data stored somewhere else.
data_dir = os.environ.get(
    "AMAZON_REVIEWS_DATA_DIR",
    "/data/ddmg/redditlanguagemodeling/data/AmazonReviews/data",
)

In [6]:
# Load the full reviews table with an explicit dtype for every listed column.
# Default NA detection is switched off (keep_default_na=False, na_values=[]),
# and quoting follows csv.QUOTE_NONNUMERIC.
data_df = pd.read_csv(
    os.path.join(data_dir, 'amazon_v2.0/reviews.csv'),
    quoting=csv.QUOTE_NONNUMERIC,
    keep_default_na=False,
    na_values=[],
    dtype=dict(
        reviewerID=str,
        asin=str,
        reviewTime=str,
        unixReviewTime=int,
        reviewText=str,
        summary=str,
        verified=bool,
        category=str,
        reviewYear=int,
    ),
)

In [4]:
# Load the split table stored at amazon_v2.0/splits/user.csv under data_dir.
split_df = pd.read_csv(os.path.join(data_dir, 'amazon_v2.0', 'splits', 'user.csv'))

In [7]:
# Select the rows of data_df picked out by split_df["clean"].
# NOTE(review): assumes "clean" is a boolean column and that split_df has one
# row per row of data_df with a matching index — confirm against the data files.
clean_df = data_df[split_df["clean"]]

## Generate Covariate Shift Data

### Fully Simulated

In [11]:
# from LEAF benchmark

class LeafSyntheticDataset:
    """Synthetic multi-task classification generator adapted from the LEAF benchmark.

    Every task gets its own Gaussian input distribution and its own linear
    softmax model. The model weights are obtained by projecting (through
    ``Q``) a noisy copy of the mean vector of the cluster the task is
    assigned to.

    The constructor seeds NumPy's global legacy RNG and all sampling goes
    through ``np.random``, so results are reproducible only as long as nothing
    else draws from that RNG between calls.

    Args:
        num_classes: Number of output classes.
        num_dim: Number of input features (the bias column is not counted).
        prob_clusters: Sequence of cluster probabilities; its length defines
            the number of clusters and it is passed as ``p`` to
            ``np.random.choice``.
        seed: Seed passed to ``np.random.seed``.
    """

    def __init__(self, num_classes, num_dim, prob_clusters, seed=42):
        np.random.seed(seed)
        self.num_classes = num_classes
        self.num_dim = num_dim
        self.num_clusters = len(prob_clusters)
        self.prob_clusters = prob_clusters
        # Q maps a length-num_clusters model_info vector to a
        # (num_dim + 1, num_classes) weight matrix; row 0 multiplies the bias column.
        self.Q = np.random.normal(
            loc=0.0,
            scale=1.0,
            size=(self.num_dim + 1, self.num_classes, self.num_clusters),
        )
        # Diagonal input covariance: the variance of feature i is (i + 1) ** -1.2.
        self.Sigma = np.zeros((self.num_dim, self.num_dim))
        for i in range(self.num_dim):
            self.Sigma[i, i] = (i + 1) ** (-1.2)

        self.means = self._generate_clusters()

    def get_task(self, num_samples):
        """Sample a cluster according to ``prob_clusters`` and generate one task from it.

        Args:
            num_samples: Number of (x, y) pairs in the task.

        Returns:
            The dict produced by ``_generate_task``.
        """
        cluster_idx = np.random.choice(
            range(self.num_clusters), size=None, replace=True, p=self.prob_clusters
        )
        return self._generate_task(self.means[cluster_idx], cluster_idx, num_samples)

    def _generate_clusters(self):
        """Draw one mean vector per cluster.

        Each vector has length ``num_clusters`` so that it can be contracted
        with the last axis of ``Q``.
        """
        means = []
        for _ in range(self.num_clusters):
            loc = np.random.normal(loc=0, scale=1, size=None)
            mu = np.random.normal(loc=loc, scale=1, size=self.num_clusters)
            means.append(mu)
        return means

    def _generate_x(self, num_samples):
        """Draw task inputs with a leading all-ones bias column.

        Returns:
            Array of shape ``(num_samples, num_dim + 1)``.
        """
        B = np.random.normal(loc=0, scale=1, size=None)
        loc = np.random.normal(loc=B, scale=1, size=self.num_dim)
        samples = np.ones((num_samples, self.num_dim + 1))
        samples[:, 1:] = np.random.multivariate_normal(
            mean=loc, cov=self.Sigma, size=num_samples
        )
        return samples

    def _generate_y(self, x, cluster_mean):
        """Label ``x`` with a task-specific linear softmax model.

        Args:
            x: Inputs including the bias column, shape ``(num_samples, num_dim + 1)``.
            cluster_mean: Mean vector of the task's cluster.

        Returns:
            Tuple ``(y, w, model_info)``: the argmax labels, the task weight
            matrix and the noisy cluster vector the weights were built from.
        """
        model_info = np.random.normal(loc=cluster_mean, scale=0.1, size=cluster_mean.shape)
        w = np.matmul(self.Q, model_info)
        num_samples = x.shape[0]
        noise = np.random.normal(loc=0, scale=0.1, size=(num_samples, self.num_classes))
        prob = softmax(np.matmul(x, w) + noise, axis=1)
        y = np.argmax(prob, axis=1)
        return y, w, model_info

    def _generate_task(self, cluster_mean, cluster_id, num_samples):
        """Generate one task and return it as a dict.

        Returns:
            Dict with keys ``'x'`` (features without the bias column, shape
            ``(num_samples, num_dim)``), ``'y'``, ``'w'``, ``'model_info'``
            and ``'cluster'``.
        """
        x = self._generate_x(num_samples)
        y, w, model_info = self._generate_y(x, cluster_mean)
        # The bias column was only needed to compute y; drop it and keep
        # every real feature column.
        x = x[:, 1:]
        return {'x': x, 'y': y, 'w': w, 'model_info': model_info, 'cluster': cluster_id}
        

In [None]:
class SyntheticCovariateShift:
    """Synthetic classification tasks that differ only in their input distribution.

    All tasks are labelled by the same linear softmax model ``w``. A task's
    inputs are drawn from an identity-covariance Gaussian centred on the mean
    vector of the cluster the task is assigned to.

    The constructor seeds NumPy's global legacy RNG and all sampling goes
    through ``np.random``, so results are reproducible only as long as nothing
    else draws from that RNG between calls.

    Args:
        num_classes: Number of output classes.
        num_dim: Number of input features (the bias column is not counted).
        prob_clusters: Sequence of cluster probabilities; its length defines
            the number of clusters and it is passed as ``p`` to
            ``np.random.choice``.
        seed: Seed passed to ``np.random.seed``.
    """

    def __init__(self, num_classes, num_dim, prob_clusters, seed=42):
        np.random.seed(seed)
        self.num_classes = num_classes
        self.num_dim = num_dim
        self.num_clusters = len(prob_clusters)
        self.prob_clusters = prob_clusters
        self.means = self._generate_clusters()
        # Shared labelling weights; row 0 multiplies the bias column.
        self.w = np.random.normal(loc=0, scale=1, size=(self.num_dim + 1, self.num_classes))

    def get_task(self, num_samples):
        """Sample a cluster according to ``prob_clusters`` and generate one task from it.

        Args:
            num_samples: Number of (x, y) pairs in the task.

        Returns:
            The dict produced by ``_generate_task``.
        """
        cluster_idx = np.random.choice(
            range(self.num_clusters), size=None, replace=True, p=self.prob_clusters
        )
        return self._generate_task(self.means[cluster_idx], cluster_idx, num_samples)

    def _generate_clusters(self):
        """Draw one length-``num_dim`` input mean vector per cluster."""
        means = []
        for _ in range(self.num_clusters):
            loc = np.random.normal(loc=0, scale=1, size=None)
            mu = np.random.normal(loc=loc, scale=1, size=self.num_dim)
            means.append(mu)
        return means

    def _generate_x(self, num_samples, cluster_mean):
        """Draw inputs around ``cluster_mean`` with a leading all-ones bias column.

        Returns:
            Array of shape ``(num_samples, num_dim + 1)``.
        """
        samples = np.ones((num_samples, self.num_dim + 1))
        # The covariance must match the length of the mean (num_dim); the bias
        # column is not sampled.
        samples[:, 1:] = np.random.multivariate_normal(
            mean=cluster_mean, cov=np.eye(self.num_dim), size=num_samples
        )
        return samples

    def _generate_y(self, x):
        """Label ``x`` (bias column included) with the shared model ``self.w``.

        Returns:
            Array of argmax class labels, one per row of ``x``.
        """
        num_samples = x.shape[0]
        noise = np.random.normal(loc=0, scale=0.1, size=(num_samples, self.num_classes))
        prob = softmax(np.matmul(x, self.w) + noise, axis=1)
        return np.argmax(prob, axis=1)

    def _generate_task(self, cluster_mean, cluster_id, num_samples):
        """Generate one task and return it as a dict.

        Returns:
            Dict with keys ``'x'`` (features without the bias column, shape
            ``(num_samples, num_dim)``), ``'y'`` and ``'cluster'``.
        """
        x = self._generate_x(num_samples, cluster_mean)
        y = self._generate_y(x)
        # The bias column was only needed to compute y; drop it and keep
        # every real feature column.
        x = x[:, 1:]
        return {'x': x, 'y': y, 'cluster': cluster_id}

In [None]:
def gen_sim_data_leaf(num_tasks, )