In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pathlib
import shutil

In [2]:
# Value passed as `num_words` when the IMDB dataset is loaded.
NUM_WORDS = 8000

In [3]:
# Enable memory growth on every visible GPU and report how many devices were found.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be the same on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Memory growth can only be configured before the GPUs are initialized;
        # report the problem instead of aborting the notebook.
        print(err)

2 Physical GPUs, 2 Logical GPUs


2021-09-27 16:23:46.397395: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-27 16:23:47.463107: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10727 MB memory:  -> device: 0, name: NVIDIA Tesla M40, pci bus id: 0000:08:00.0, compute capability: 5.2
2021-09-27 16:23:47.464343: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 10727 MB memory:  -> device: 1, name: NVIDIA Tesla M40, pci bus id: 0000:09:00.0, compute capability: 5.2


In [4]:
# Load IMDB with num_words=NUM_WORDS, then pad both splits with maxlen=256,
# appending the padding at the end of each sequence ('post').
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=NUM_WORDS)
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
X_train, X_test = (pad_sequences(seqs, maxlen=256, padding='post') for seqs in (X_train, X_test))

# Snapshot the loaded arrays; later sections restore their working copies from these.
_X_train, _X_test, _y_train, _y_test = (arr.copy() for arr in (X_train, X_test, y_train, y_test))

In [5]:
# Sanity check: number of training samples (rows of the padded array).
print(X_train.shape[0])

25000


# RQ1-Split-1-1-1-1

In [6]:
# Seed TensorFlow, NumPy and the stdlib RNG with the same value, then restore
# working copies of the data from the `_X_*` / `_y_*` snapshots.
for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    seed_fn(233)
X_train, X_test, y_train, y_test = (
    arr.copy() for arr in (_X_train, _X_test, _y_train, _y_test)
)

In [7]:
# Shuffle samples and labels with one shared permutation so each pair stays aligned.
# random.shuffle's swaps depend only on the list length and the RNG state, so
# shuffling an index list gives the same ordering as shuffling the zipped pairs,
# while X_train / y_train remain NumPy arrays instead of becoming tuples of rows.
shuffled_idx = list(range(len(X_train)))
random.shuffle(shuffled_idx)
X_train, y_train = X_train[shuffled_idx], y_train[shuffled_idx]

In [8]:
# Cut the training data into PARTS near-equal chunks and name one output
# directory per chunk.
PARTS = 4
X_train_splits, y_train_splits = (np.array_split(arr, PARTS) for arr in (X_train, y_train))
dir_names = [os.path.join('RQ1-IMDB-SplitResults', str(part)) for part in range(PARTS)]
print(dir_names)

['RQ1-IMDB-SplitResults/0', 'RQ1-IMDB-SplitResults/1', 'RQ1-IMDB-SplitResults/2', 'RQ1-IMDB-SplitResults/3']


In [9]:
# Bundle each chunk of samples with its labels and its target directory.
splits = [*zip(X_train_splits, y_train_splits, dir_names)]
print(len(splits))

4


In [10]:
# Write one .npz archive per split; every archive also carries the full test set.
for X, y, dir_name in splits:
    os.makedirs(dir_name, exist_ok=True)  # create the split directory (and parents) if missing
    file_name = os.path.join(dir_name, 'imdb_data.npz')
    np.savez(file_name, X_train=X, y_train=y, X_test=X_test, y_test=y_test)

In [11]:
# Place a copy of main.py inside every split directory.
for dir_name in dir_names:
    destination = os.path.join(dir_name, 'main.py')
    shutil.copyfile('main.py', destination)

# RQ2-Split-1-2-3-4

In [12]:
# Reseed all three RNGs (TensorFlow, NumPy, stdlib) and start this section from
# fresh copies of the `_X_*` / `_y_*` snapshots.
for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    seed_fn(233)
X_train, X_test, y_train, y_test = (
    arr.copy() for arr in (_X_train, _X_test, _y_train, _y_test)
)

In [13]:
# Turn the relative weights 1:2:3:4 into cumulative cut points for np.split.
weightage = np.array([1, 2, 3, 4])
# Samples per partition, computed with exact integer floor division. The previous
# float round-trip relied on `.astype(np.int)`: the `np.int` alias was deprecated
# in NumPy 1.20 and removed in 1.24, so that call now raises AttributeError.
weightage = weightage * len(X_train) // weightage.sum()
# Cumulative boundaries; the last one is dropped because np.split adds the tail
# partition implicitly.
weightage = weightage.cumsum()[:-1]
weightage

array([ 2500,  7500, 15000])

In [14]:
# Apply one shared random permutation to samples and labels so pairs stay aligned.
# The swaps performed by random.shuffle depend only on the list length and the RNG
# state, so shuffling indices reproduces the ordering of the zip/shuffle/unzip
# approach while keeping X_train / y_train as NumPy arrays (not tuples of rows).
shuffled_idx = list(range(len(X_train)))
random.shuffle(shuffled_idx)
X_train, y_train = X_train[shuffled_idx], y_train[shuffled_idx]

In [15]:
# Cut samples and labels at the cumulative boundaries in `weightage`;
# N boundaries produce N + 1 partitions, hence the `+ 1` in the directory count.
X_train_splits, y_train_splits = (np.split(arr, weightage) for arr in (X_train, y_train))
dir_names = [
    os.path.join('RQ2-IMDB-SplitResults', str(part))
    for part in range(len(weightage) + 1)
]

In [16]:
# Pair every partition's samples and labels with the directory it is written to.
splits = [*zip(X_train_splits, y_train_splits, dir_names)]

In [17]:
# Save each weighted partition as imdb_data.npz in its own directory,
# bundling the complete test set with every partition.
for X, y, dir_name in splits:
    os.makedirs(dir_name, exist_ok=True)  # no error if the directory already exists
    file_name = os.path.join(dir_name, 'imdb_data.npz')
    np.savez(file_name, X_train=X, y_train=y, X_test=X_test, y_test=y_test)

In [18]:
# Copy main.py into each partition directory.
for dir_name in dir_names:
    destination = os.path.join(dir_name, 'main.py')
    shutil.copyfile('main.py', destination)

# RQ2-Split-LT

In [19]:
# Same seed for TensorFlow, NumPy and the stdlib RNG, followed by a reset of the
# working arrays to copies of the `_X_*` / `_y_*` snapshots.
for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    seed_fn(233)
X_train, X_test, y_train, y_test = (
    arr.copy() for arr in (_X_train, _X_test, _y_train, _y_test)
)

In [20]:
# Positions of the training samples carrying label 0 and label 1 respectively.
idx_0 = np.flatnonzero(y_train == 0)
idx_1 = np.flatnonzero(y_train == 1)

In [21]:
# Inspection only: display the boolean mask of samples labelled 1.
np.equal(y_train, 1)

array([ True, False, False, ..., False,  True, False])

In [22]:
# Shuffle each label's index array in place (label 0 first, then label 1).
for label_idx in (idx_0, idx_1):
    np.random.shuffle(label_idx)

In [23]:
# Subsample 12000 label-0 and 4000 label-1 indices for the imbalanced experiment.
# Draw WITHOUT replacement: np.random.choice defaults to replace=True, which would
# select the same review several times, and because the per-party splits built
# later partition these arrays, one review could then end up in several parties.
# With replace=False a ValueError is raised if a label has fewer samples than requested.
idx_0 = np.random.choice(idx_0, 12000, replace=False)
idx_1 = np.random.choice(idx_1, 4000, replace=False)

In [24]:
# Combined selection: all chosen label-0 indices followed by all chosen label-1 indices.
idx = np.concatenate([idx_0, idx_1])

In [29]:
# Cut each label's index array into 4 equal pieces, then build split k from the
# k-th label-0 piece followed by the k-th label-1 piece.
parts_0 = np.split(idx_0, 4)
parts_1 = np.split(idx_1, 4)
splits_idx = np.array([np.concatenate(pair) for pair in zip(parts_0, parts_1)])

# Indexing with the 2-D index array yields one row of samples/labels per split.
X_train_splits = X_train[splits_idx]
y_train_splits = y_train[splits_idx]

dir_names = [os.path.join('RQ2-IMDB_LT-SplitResults', str(part)) for part in range(4)]
splits = [*zip(X_train_splits, y_train_splits, dir_names)]

In [30]:
# Write the full subset selected by `idx` into its own 'all' directory,
# together with a copy of main.py.
dir_name_all = os.path.join('RQ2-IMDB_LT-SplitResults', 'all')
os.makedirs(dir_name_all, exist_ok=True)
main_copy = os.path.join(dir_name_all, 'main.py')
shutil.copyfile('main.py', main_copy)
file_name = os.path.join(dir_name_all, 'imdb_data.npz')
np.savez(
    file_name,
    X_train=X_train[idx],
    y_train=y_train[idx],
    X_test=X_test,
    y_test=y_test,
)

In [31]:
# One imdb_data.npz per split directory; the test set is stored with every split.
for X, y, dir_name in splits:
    os.makedirs(dir_name, exist_ok=True)  # create the directory tree when needed
    file_name = os.path.join(dir_name, 'imdb_data.npz')
    np.savez(file_name, X_train=X, y_train=y, X_test=X_test, y_test=y_test)

In [32]:
# Give every split directory its own copy of main.py.
for dir_name in dir_names:
    destination = os.path.join(dir_name, 'main.py')
    shutil.copyfile('main.py', destination)

# RQ4-Low-Quality

In [33]:
# Seed TensorFlow, NumPy and `random` identically, then rebuild the working
# arrays from copies of the `_X_*` / `_y_*` snapshots.
for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    seed_fn(233)
X_train, X_test, y_train, y_test = (
    arr.copy() for arr in (_X_train, _X_test, _y_train, _y_test)
)

In [34]:
# Shuffle samples and labels together through a single index permutation.
# random.shuffle performs swaps that depend only on list length and RNG state, so
# the resulting order matches shuffling the zipped (sample, label) pairs, but
# X_train / y_train stay NumPy arrays rather than turning into tuples of rows.
shuffled_idx = list(range(len(X_train)))
random.shuffle(shuffled_idx)
X_train, y_train = X_train[shuffled_idx], y_train[shuffled_idx]

In [35]:
# Divide the shuffled data into PARTS near-equal chunks and derive one
# output directory name per chunk.
PARTS = 4
X_train_splits, y_train_splits = (np.array_split(arr, PARTS) for arr in (X_train, y_train))
dir_names = [os.path.join('RQ4-IMDB-SplitResults', str(part)) for part in range(PARTS)]
print(dir_names)

['RQ4-IMDB-SplitResults/0', 'RQ4-IMDB-SplitResults/1', 'RQ4-IMDB-SplitResults/2', 'RQ4-IMDB-SplitResults/3']


In [36]:
# Degrade the last split: draw a 0/1 mask with p=0.5 per sample and XOR it into
# the labels, flipping every label whose mask bit is 1.
flip_mask = np.random.binomial(1, 0.5, len(y_train_splits[3]))
y_train_splits[3] = y_train_splits[3] ^ flip_mask

In [37]:
# Group samples, labels and output directory for each of the chunks.
splits = [*zip(X_train_splits, y_train_splits, dir_names)]
print(len(splits))

4


In [38]:
# Persist every chunk (including the one with corrupted labels) alongside the test set.
for X, y, dir_name in splits:
    os.makedirs(dir_name, exist_ok=True)  # tolerate directories left by earlier runs
    file_name = os.path.join(dir_name, 'imdb_data.npz')
    np.savez(file_name, X_train=X, y_train=y, X_test=X_test, y_test=y_test)

In [39]:
# Ship main.py with each chunk by copying it into every output directory.
for dir_name in dir_names:
    destination = os.path.join(dir_name, 'main.py')
    shutil.copyfile('main.py', destination)