# Data selection and splitting

The following file contains the code for selecting and splitting the data and then saving the results in pickle files for a particular target domain. Note: Make sure to select the correct domain when loading the data below.

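Make sure that the directories "data/sentence_embeddings/general/sorted/train/" and "data/sentence_embeddings/general/sorted/val_test/" exist before running the save cell at the end: the pickle files are written there, and opening a file for writing does not create missing directories.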

## Importing libraries

In [308]:
# imports
import os
import numpy as np
import pandas as pd
import random as rn
import pickle as pkl
import tensorflow as tf

## Loading the data

In [528]:
# load the general sentence embeddings (labeled variant) ...
with open('data/sentence_embeddings/general/unsorted/sentemb/sentemb.p', 'rb') as emb_file:
    data_general = pkl.load(emb_file)

# ... and the label/domain arrays of their train and test parts
with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_train_sentemb.p', 'rb') as train_file:
    temp_train = pkl.load(train_file)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_test_sentemb.p', 'rb') as test_file:
    temp_test = pkl.load(test_file)

# put the train and test label arrays side by side (train columns first)
labels_general = np.hstack([temp_train, temp_test])

In [577]:
# load the *unlabeled* variant of the general sentence embeddings
# (this rebinds data_general / labels_general from the labeled cell above)
with open('data/sentence_embeddings/general/unsorted/sentemb/sentemb_unlabeled.p', 'rb') as emb_file:
    data_general = pkl.load(emb_file)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_train_sentemb_unlabeled.p', 'rb') as train_file:
    temp_train = pkl.load(train_file)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_test_sentemb_unlabeled.p', 'rb') as test_file:
    temp_test = pkl.load(test_file)

# put the train and test label arrays side by side (train columns first)
labels_general = np.hstack([temp_train, temp_test])

In [578]:
# check the shape of the stacked general label array
labels_general.shape

(3, 31753)

In [579]:
# load the unlabeled sentence embeddings of one specific domain
# the "_1" suffix of the file names selects the domain - here domain 1 is loaded
with open('data/sentence_embeddings/specific/sentemb/sentemb_unlabeled_1.p', 'rb') as emb_file:
    data_spec = pkl.load(emb_file)

with open('data/sentence_embeddings/specific/label_domain/label_domain_train_sentemb_unlabeled_1.p', 'rb') as train_file:
    temp_train = pkl.load(train_file)

with open('data/sentence_embeddings/specific/label_domain/label_domain_test_sentemb_unlabeled_1.p', 'rb') as test_file:
    temp_test = pkl.load(test_file)

# put the train and test label arrays side by side (train columns first)
labels_spec = np.hstack([temp_train, temp_test])

In [551]:
# check the shape of the stacked general label array
labels_general.shape

(3, 31753)

In [533]:
# show every row when a DataFrame is displayed
# None removes the row limit and, unlike df.shape[0]+1, does not require `df`
# to exist already (it is only created in a later cell)
pd.set_option('display.max_rows', None)

In [565]:

# one table row per instance: transpose the label array so that its three rows
# become the columns label / domain / idx_domain
df = pd.DataFrame(
    labels_spec.T,
    columns=['label', 'domain', 'idx_domain'],
)

In [566]:
# keep only the rows whose label differs from 3
# (presumably 3 marks unlabeled instances - TODO confirm)
df = df.loc[df['label'] != 3]

In [567]:
# display the filtered label table
df

Unnamed: 0,label,domain,idx_domain
0,0,1,0
1,1,1,1
2,0,1,2
3,0,1,3
4,0,1,4
5,0,1,5
6,0,1,6
7,1,1,7
8,0,1,8
9,1,1,9


In [568]:
# turn the filtered table back into an integer NumPy array (one row per remaining instance)
array = df.to_numpy().astype("int")

In [569]:
# transpose back so that the table columns (label, domain, idx_domain) become rows again
labels_spec = array.transpose()

In [571]:
# check the shape of the stacked general label array
labels_general.shape

(3, 31753)

Make sure to load the data of the desired target domain here:

In [397]:
# load the sentence embeddings of the target domain
# the "_0" suffix of the file names selects the domain - here domain 0 is loaded
with open('data/sentence_embeddings/specific/sentemb/sentemb_0.p', 'rb') as emb_file:
    data_spec = pkl.load(emb_file)

with open('data/sentence_embeddings/specific/label_domain/label_domain_train_sentemb_0.p', 'rb') as train_file:
    temp_train = pkl.load(train_file)

with open('data/sentence_embeddings/specific/label_domain/label_domain_test_sentemb_0.p', 'rb') as test_file:
    temp_test = pkl.load(test_file)

# put the train and test label arrays side by side (train columns first)
labels_spec = np.hstack([temp_train, temp_test])

In [586]:
# check the shape of the stacked target-domain label array
labels_spec.shape

(3, 2000)

## Necessary functions

In [582]:
def sort_array(array_to_sort, array_ref):
    """Return indices that reorder ``array_to_sort`` to follow the labels of ``array_ref``.

    Only row 0 (the labels) of each array is read. Entry ``i`` of the result is
    the index of an instance in ``array_to_sort`` whose label class matches
    ``array_ref[0][i]``: the k-th label-0 instance of ``array_ref`` is paired
    with the k-th label-0 instance of ``array_to_sort``, and the k-th remaining
    instance with the k-th remaining instance. Every label other than 0 is
    treated as belonging to the second ("ones") class.

    Args:
        array_to_sort: array whose first row holds the labels to pick from.
        array_ref: array whose first row holds the label order to reproduce.

    Returns:
        Integer index array with one entry per label in ``array_ref[0]``.

    Raises:
        IndexError: if ``array_to_sort`` contains fewer label-0 instances or
            fewer non-zero-label instances than ``array_ref`` asks for.
    """
    y = np.asarray(array_to_sort[0]).astype(int)
    y_ref = np.asarray(array_ref[0]).astype(int)

    # positions in array_to_sort that are available for each class, in original order
    zero_pool = np.flatnonzero(y == 0)
    one_pool = np.flatnonzero(y != 0)

    ref_is_zero = y_ref == 0
    n_zeros = int(np.count_nonzero(ref_is_zero))
    n_ones = int(y_ref.shape[0]) - n_zeros

    # fail with a readable message instead of a bare "list index out of range"
    if n_zeros > zero_pool.shape[0] or n_ones > one_pool.shape[0]:
        raise IndexError(
            "array_to_sort has too few instances to match array_ref: "
            f"needs {n_zeros} with label 0 and {n_ones} with a non-zero label, "
            f"but only {zero_pool.shape[0]} and {one_pool.shape[0]} are available"
        )

    # pair the first positive (/negative) instance of both arrays, etc.
    indices_sorted = np.empty(y_ref.shape[0], dtype=int)
    indices_sorted[ref_is_zero] = zero_pool[:n_zeros]
    indices_sorted[~ref_is_zero] = one_pool[:n_ones]
    return indices_sorted

## Sort, split and save data

In [583]:
# get indices for sorting the array
# ind[i] is the position of the general instance that is paired with instance i of labels_spec
ind = sort_array(labels_general, labels_spec)

# save all the general instances that weren't chosen
# (this has to happen before data_general / labels_general are overwritten below)
data_left = np.delete(data_general, ind, axis = 0)
labels_left = np.delete(labels_general, ind, axis = 1)

# sorted general sentence embeddings
# NOTE(review): the same ind selects rows of data_general and columns of labels_general,
# so both must describe the same instances; the IndexError recorded in this cell's output
# (index 302, size 300) suggests they did not in that run - verify the loaded files
data_general = data_general[ind]
labels_general = labels_general[:, ind]

IndexError: index 302 is out of bounds for axis 0 with size 300

In [576]:
# inspect the training split; X_train is assigned by the split cell further down,
# which has to be run before this one
X_train

array([[-0.02498276,  0.00168271,  0.00792256, ...,  0.0036762 ,
         0.00484477,  0.1932837 ],
       [ 0.00365227, -0.03625336, -0.02291604, ...,  0.03304866,
         0.03202532,  0.03387291],
       [ 0.01216486, -0.03942176, -0.00052602, ...,  0.02228229,
         0.01054347,  0.07295793],
       ...,
       [-0.00322475, -0.01633911, -0.0150385 , ...,  0.0236278 ,
         0.02101964,  0.00627279],
       [-0.00493971, -0.00424474, -0.00962966, ...,  0.01981669,
        -0.00560577,  0.08589787],
       [ 0.01400684, -0.01928884,  0.00668082, ...,  0.03220878,
         0.02715117,  0.032345  ]], dtype=float32)

In [574]:
# split the data 70-10-20 (train-validation-test) - data was already shuffled before
# cut at rows 1400 and 1600: train = [0, 1400), validation = [1400, 1600), test = the rest
X_train, X_val, X_test = np.split(data_general, [1400, 1600])

In [575]:
# save data
# All arrays are stacked first, so a failure while stacking cannot leave a
# truncated pickle file behind; each file is then written inside a `with`
# block so its handle is flushed and closed.
arrays_to_save = {
    # validation + test instances and their labels
    "data/sentence_embeddings/general/sorted/val_test/vt_data_1.p":
        np.vstack((X_val, X_test)),
    "data/sentence_embeddings/general/sorted/val_test/vt_labels_1.p":
        np.hstack((labels_general[:, 1400:1600], labels_general[:, 1600:])),
    # training instances: the sorted train part plus every general instance that was not selected
    "data/sentence_embeddings/general/sorted/train/train_data_1.p":
        np.vstack((X_train, data_left)),
    "data/sentence_embeddings/general/sorted/train/train_labels_1.p":
        np.hstack((labels_general[:, :1400], labels_left)),
}

for out_path, payload in arrays_to_save.items():
    with open(out_path, "wb") as f:
        pkl.dump(payload, f)