In [1]:
## VER 2: EDIT DATA COLLECTOR CLASS TO ENABLE SWITCHING BETWEEN BINARY AND ORIG MODE 

In [2]:
import numpy as np
import pandas as pd
import os, sys, random, json, gc
from sklearn.model_selection import train_test_split
from collections import Counter
from copy import deepcopy
from math import ceil, floor

sys.path.append("../Code/utils/")
from text_processing import process_tweet_bert
from ekphrasis.classes.segmenter import Segmenter

seed = 123
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)

%load_ext autoreload
%autoreload 2

# segmenter using the word statistics from english Wikipedia
seg_eng = Segmenter(corpus="english") 
seg_tw = Segmenter(corpus="twitter") 

def fill_text_na(df:pd.DataFrame, cols: list = ['text_std'], fill_value=' '):
    """
        Fill text variables with space for tokenizer to work
    """
    dt = df.copy()
    dt[cols] =  df[cols].fillna(fill_value)
    return dt

Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...
Reading twitter - 2grams ...


In [3]:
def equal_sample(df:pd.DataFrame, var_to_sample:'str', K:int, seed=123):
    """
        Sample a data with as equal of the var_to_sample as much as possible 
    """
    if df.shape[0] <= K: 
        # if not enough to meet quota, return dataset
        return df
    elif df[var_to_sample].nunique == 1 : # only 1 class
        return df.sample(K, random_state=seed)
    else: 
        counter_dict =  dict(Counter(df[var_to_sample]))
        ideal_cat_size = int(K/len(counter_dict))

        delta_dict = {k: v - ideal_cat_size for k,v in counter_dict.items()}
        delta_dict = {k: v for k, v in sorted(delta_dict.items(), key=lambda item: item[1])}

        k_needed = K 
        num_class_elig = len(counter_dict)

        append_ls = []
        for c, v in delta_dict.items(): 
            extra_needed = counter_dict[c]
            if v > 0: 
                extra_needed = floor(k_needed / num_class_elig)
            n_in_sample = min(counter_dict[c],  extra_needed)
            append_ls.append(df[df[var_to_sample] == c].sample(n_in_sample, random_state=seed))

            # update parameters
            num_class_elig -= 1
            k_needed -= n_in_sample
        
        return pd.concat(append_ls)
    

In [4]:
def split_list_label(f, var:str):
    d_t = np.array(f[var].apply(lambda x: [int(y) for y in x[1:-1].split(',')]).values.tolist())
    num_cols = d_t.shape[-1]
    columns=['v'+str(i) for i in range(num_cols)]
    split_dt =   pd.DataFrame(np.array(d_t), columns=columns)
    split_dt['id'] = f['id'].values
    return split_dt, num_cols

In [5]:
def equal_sample_multilabel(d, var:str, K, seed):
    # convert values from list ot sepearte arrays
#     d_t = np.array(d[var].apply(lambda x: [int(y) for y in x[1:-1].split(',')]).values.tolist())
#     num_cols = d_t.shape[-1]
#     columns=['v'+str(i) for i in range(num_cols)]
#     split_dt =   pd.DataFrame(np.array(d_t), columns=columns)
#     split_dt['id'] = d['id'].values
#     d_t = split_dt
    d_t, num_cols = split_list_label(d, var)
    subs = []
    
    for i in range(num_cols): 
        # print("col at ", i)
        temp = equal_sample(d_t, 'v'+str(i), K, seed=seed)
        subs.append(temp)

    base = pd.concat(subs, axis=0).drop_duplicates()
    num_extra = K*num_cols - base.shape[0]
    extra = d_t[~d_t.id.isin(base.id)].sample(num_extra, random_state=seed)
    sample_ids = pd.concat([base, extra]).sample(frac=1, random_state=seed)
    sample_ids = np.unique(sample_ids['id'])

    return d[d.id.isin(sample_ids)]

In [6]:
class DataCollector:
    def __init__(self, drt:str, filelist:list=['founta'], mainvars:list = ['id', 'name','text_all', 'text_std', 'label_bin', 'label_orig', 'label_target'], 
                savevars:list = ['id', 'name','text_std',], 
                label_mode = 'bin', test_ratio=2000, num_pos=1000, pos_ratio=1, val_size=5, max_domain_size=5000):
        self.directory = drt + '/' if len(drt) > 0 else ''
        self.filelist = list(set(filelist)) #dedup
        self.test_ratio = test_ratio
        self.num_pos = num_pos
        self.pos_ratio = pos_ratio
        self.val_size = val_size
        self.max_domain_size = max_domain_size

        self.mainvars = mainvars
        self.savevars = deepcopy(savevars)
        print("CHECK 0", self.savevars)
        self.label_mode = label_mode
        self.label_config = dict()
        self.report_table = pd.DataFrame()
        if self.label_mode == 'bin':
            self.savevars += ['label_bin']
            self.label_var = 'label_bin'
        else:
            self.savevars += ['label_orig','label_target']
            print("CHECK 1", self.savevars)
            if self.label_mode == 'orig':
                self.label_var = 'label_orig'
            elif self.label_mode == 'target':
                self.label_var = 'label_target'
            
        assert type(self.label_var) is str 

        self.train_df = pd.DataFrame(columns=self.mainvars)
        self.val_df   = pd.DataFrame(columns=self.mainvars)
        self.test_df = pd.DataFrame(columns=self.mainvars)
        self.test_ids = dict()
        self.val_ids  = dict()
        
        for name in self.filelist: 
            print("Processing ", name)
            train,val,test = self.__prepareData__(name)            
            # print("Shape", train.shape, val.shape, test.shape)
            self.__appendData__(train, 'train')
            self.__appendData__(val, 'val')
            self.__appendData__(test, 'test')
        
        # Keep track of ids in val and test data
        for k in self.val_df.name.unique():
            ids = self.val_df[self.val_df.name == k].id.array.tolist()
            self.val_ids[k] = ids 
        
        for k in self.test_df.name.unique():
            ids = self.test_df[self.test_df.name == k].id.array.tolist()
            self.test_ids[k] = ids     

   
    def standardize_data(self, segmenter=None):
        # standardize data
        
        if segmenter is not None:
            # tokenizer cannot work with NAN data
            self.train_df['text_all'].fillna(' ', inplace=True)
            self.test_df['text_all'].fillna(' ', inplace=True)

            self.train_df['text_std'] = self.train_df.text_all.apply(lambda x: process_tweet_bert(x, dict(), segmenter, verbose=False))
            self.test_df['text_std'] = self.test_df.text_all.apply(lambda x: process_tweet_bert(x, dict(), segmenter, verbose=False))
        
        self.train_df['text_std'].fillna(' ', inplace=True)
        self.val_df['text_std'].fillna(' ', inplace=True)
        self.test_df['text_std'].fillna(' ', inplace=True)
        
    
    def __appendData__(self, base:pd.DataFrame, to_which='train'):
        if to_which == 'train': 
            self.train_df = pd.concat([base, self.train_df], ignore_index=True)
            self.train_df.reset_index(drop=True, inplace=True)
        elif to_which == 'val':
            self.val_df = pd.concat([base, self.val_df], ignore_index=True)
            self.val_df.reset_index(drop=True, inplace=True)
        else:
            self.test_df = pd.concat([base, self.test_df], ignore_index=True)
            self.test_df.reset_index(drop=True, inplace=True)
    
    
    def __get_problem_config__(self, df: pd.DataFrame):
        """
        Create a dictionary of problem tyes for each variable that contains 'label' in name
        
        Return: 
            problem_config: nested dict, {domain: {'label_var': {problem_type, num_classes}}}
        """
        problem_config = dict()
        label_vars = [l for l in df.columns if 'label' in l ]
        
        for label_var in label_vars: 
            if df[label_var].dtypes == int or df[label_var].dtypes == float:
                problem_type = 'single_label_classification'
                num_classes = df[label_var].nunique()
            elif df[label_var].dtypes == object:
                if max(df[label_var].str.len()) == 1:
                    problem_type = 'single_label_classification'
                    num_classes = df[label_var].nunique()
                else: 
                    problem_type = 'multi_label_classification'
                    num_classes = max(df[label_var].str[1:-1].apply(lambda x: len(x.split(','))))

            problem_config[label_var] = {'problem_type': problem_type,  'num_classes': num_classes}
        
        return problem_config 
            
        
    def __prepareData__(self, name, sep=','): 
        """
        Split dataframe into processed train and original test 
        @param fname: name of directory that contains dataset
        @param test_ratio: float, ratio of test size on origial shape 
        @param num_pos: int, OVERLOADED if label_mode = bin, number of positive samples to be included in train set
                        else, it is the number of samples to be included in train set 
        @param pos_ratio: float, ratio of pos vs neg samples
        """ 
        fname = name[0].lower() + name[1:]
        dt = pd.read_csv(self.directory + '/' + fname + '.csv', delimiter=sep)
        dt['name'] = fname
        dt = dt.sample(frac=1, random_state=seed) # shuffle first
        
        problem_config =  self.__get_problem_config__(dt)
        self.label_config[fname] = problem_config
        
        dt = dt[[var for var in self.mainvars if var in dt.columns]]
        if  self.label_mode == 'bin' and  len(dt.label_bin.unique()) > 3: 
            print(name, '\n', dt.label_bin.unique()) 
            print("Before\n",train.label_bin.unique(), test.label_bin.unique(), '\n')
        
        # val and test set are not artificially balanced 
        train, test = train_test_split(dt, test_size=self.test_ratio, shuffle=False)
        train, val  = train_test_split(train, test_size=self.val_size, shuffle=False)
        val = val.sample(frac=1, random_state=seed)
        
        # equal samples of the chosen variable if possible
        if self.label_mode in ['bin', 'orig', 'target']:
            label_var = self.label_var
            if self.label_var not in dt.columns and self.label_mode != 'bin':
                label_var = 'label_orig'
            if problem_config[label_var]['problem_type'] == 'multi_label_classification':
                train = equal_sample_multilabel(train, label_var, self.num_pos, seed)
            else:
                K_required = self.num_pos * dt[label_var].nunique()
                print("K_required: {}".format(K_required))
                train = equal_sample(train, label_var, K_required, seed)
            
            # enforce maximum limit per set
            if train.shape[0] > self.max_domain_size: 
                train = train.sample(self.max_domain_size, random_state=seed)
        else: 
            train = train.sample(self.num_pos, random_state=seed)
        return train, val, test 
    
    
    def report(self, print_dir = None ):
        label_var = 'label_orig' if self.label_mode == 'target' else self.label_var
        report_vars = ['name'] if self.label_mode == 'bin' else ['name', label_var]
        train = self.train_df.groupby(report_vars)[label_var].count()
        test = self.test_df.groupby(report_vars)[label_var].count()
        report = pd.concat([train, test], axis=1)
        report.columns = ['Train','Test']
        self.report_table = report
        print("Total number of datasets:", report.shape[0],"\n", report)
        print("TEXT LENGTHS\nTrain set:\n",pd.DataFrame(self.train_df.text_all.str.len()).describe())
        print("Test set:\n",pd.DataFrame(self.test_df.text_std.str.len()).describe())

        
    def saveFiles(self, save_dir:str, fname_prefix:str, save_test = True, sep=','):
        # save label_config
        label_config_dir = save_dir + '/' + fname_prefix + '_' + str(len(self.filelist)) 
        with open(label_config_dir + '_label_config.json','w') as l :
                json.dump(self.label_config, l)
        l.close()
        
        train_dir = save_dir + '/' + fname_prefix + '_' + str(len(self.filelist)) + '_' + str(self.num_pos) +  '_' + \
                 str(self.test_ratio)  + '_' + self.label_mode
        test_dir  = save_dir + '/' + fname_prefix + '_' + str(len(self.filelist)) + '_' + str(self.test_ratio) + '_' + self.label_mode
        self.train_df[self.savevars].to_csv(train_dir + '_train.csv', index=None, sep=sep)
        print("CHECK", self.savevars )
        
        if save_test: 
            self.test_df[self.savevars].to_csv(test_dir + '_test.csv', index=None, sep=sep)
            with open(test_dir + '_testids.json','w') as f :
                json.dump(self.test_ids, f)
            f.close()
            
            self.val_df[self.savevars].to_csv(test_dir + '_val.csv', index=None, sep=sep)
            with open(test_dir + '_valids.json','w') as g :
                json.dump(self.val_ids, g)
            g.close()
                
        self.report()
        if self.label_mode != 'bin':
            self.report_table.to_csv(test_dir + '_report.csv', sep=sep)


In [7]:
filelist = ['Waseem', 'Davidson', 'Trac', 'Hateval','Jigsaw', 'Olid', 'Founta','Goldbeck', 'Gab_kennedy', 'Sab']
# filelist = ['sab']
test_ratio=2000
num_poses = [2000]
label_mode = 'bin'

for i, num_pos in enumerate(num_poses):
    b = DataCollector(drt='Meta_data/Raw', test_ratio=test_ratio, filelist=filelist, num_pos=num_pos, label_mode=label_mode, 
                      savevars= ['id', 'name','text_std'])
    b.standardize_data()
#     print(o.train_df.label_bin.unique())
#     print(o.test_df.label_bin.unique())
    if i == 0: 
        b.saveFiles('Meta_data', 'meta')
    else:
        b.saveFiles('Meta_data', 'meta', save_test = False)
    del b


CHECK 0 ['id', 'name', 'text_std']
Processing  Gab_kennedy
K_required: 4000
Processing  Jigsaw
K_required: 4000
Processing  Olid
K_required: 4000
Processing  Waseem
K_required: 4000
Processing  Sab
K_required: 4000
Processing  Goldbeck
K_required: 4000
Processing  Davidson
K_required: 4000
Processing  Hateval
K_required: 4000
Processing  Trac
K_required: 4000
Processing  Founta
K_required: 4000
CHECK ['id', 'name', 'text_std', 'label_bin']
Total number of datasets: 10 
              Train  Test
name                    
davidson      4000  2000
founta        4000  2000
gab_kennedy   4000  2000
goldbeck      4000  2000
hateval       4000  2000
jigsaw        4000  2000
olid          4000  2000
sab           4000  2000
trac          4000  2000
waseem        4000  2000
TEXT LENGTHS
Train set:
            text_all
count  39997.000000
mean     141.077381
std      217.176073
min        2.000000
25%       68.000000
50%      110.000000
75%      141.000000
max     5000.000000
Test set:
          

In [8]:
test_ratio=2000
num_poses = [1000] # for each class of the specified label
label_mode = 'orig'
for i, num_pos in enumerate(num_poses):
    o = DataCollector(drt='Meta_data/Raw', test_ratio=test_ratio, filelist=filelist, num_pos=num_pos, label_mode=label_mode)
    o.standardize_data()
    if i == 0: 
        o.saveFiles('Meta_data', 'meta')
    else:
        o.saveFiles('Meta_data', 'meta', save_test = False)
    del o

CHECK 0 ['id', 'name', 'text_std']
CHECK 1 ['id', 'name', 'text_std', 'label_orig', 'label_target']
Processing  Gab_kennedy
Processing  Jigsaw
Processing  Olid
K_required: 3000
Processing  Waseem
K_required: 3000
Processing  Sab
K_required: 2000
Processing  Goldbeck
K_required: 2000
Processing  Davidson
K_required: 3000
Processing  Hateval
K_required: 3000
Processing  Trac
K_required: 3000
Processing  Founta
K_required: 4000
CHECK ['id', 'name', 'text_std', 'label_orig', 'label_target']
Total number of datasets: 72 
                      Train    Test
name     label_orig               
davidson 0            1000   127.0
         1            1000  1517.0
         2            1000   356.0
founta   0            1000   306.0
         1            1000  1053.0
...                    ...     ...
trac     1            1000   662.0
         2            1000   452.0
waseem   0            1496  1466.0
         1            1495   531.0
         2               9     3.0

[72 rows x 2 columns]

In [None]:
########## END OF OFFICIAL CODE ################

In [None]:
###### DO NOT DELETE ##############
## TEST EQUAL_SAMPLE_MULTILABLE #####3
name = 'gab_kennedy'
mainvars = ['id', 'name','text_all', 'text_std', 'label_bin', 'label_orig', 'label_target']
# os.listdir('Meta_data/Raw/')

In [None]:
d = pd.read_csv('Meta_data/Raw/'+name+'.csv')
d['name'] = name
label_var = 'label_orig' if 'label_orig' in d.columns else 'label_target'
d = d.sample(frac=1, random_state=seed) # shuffle first
d = d[[var for var in mainvars if var in d.columns]]

In [None]:
# test split
x, y = split_list_label(d, label_var)
x.hist()
plt.show()

In [None]:
train  = d[:5000]
a = equal_sample_multilabel(train, 'label_orig', 100, 123)
select_ids = a.id.sample(5)

In [None]:
# labels values do not change
assert all(a[a.id.isin(select_ids)].label_orig.values == d[d.id.isin(select_ids)].label_orig.values)

In [None]:
b, _ = split_list_label(a, label_var)
b.hist()
plt.show()

In [None]:
# def equal_sampling(df, k, var, seed=123):
#     counter_dict = dict(Counter(df[var]))
#     print(counter_dict)
#     num_samples = int(k / len(counter_dict))
#     stratas = []
    
#     for c, count in counter_dict.items():
#         strata = df[df[var] == c].sample(min(num_samples,count), random_state=123)
#         stratas.append(strata)
    
#     sample = pd.concat(stratas)
#     print("Sample shape ", sample.shape)
#     # in case there is not enough samples to meet quota, backfill 
#     if sample.shape[0] < k:
#         complement = df.iloc[~df.index.isin(sample.index)]
        
#     return complement

In [None]:
# temp = pd.read_csv("Meta_data/Raw/founta.csv")

In [None]:
# sample = equal_sampling(temp, 500, 'label_orig')