In [1]:
import torch
import torch.nn.functional as F

from datasets import load_dataset,concatenate_datasets,load_from_disk, Dataset, Value, ClassLabel,features

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm, trange
import csv


# Integer class ids shared by every cleaned dataset (neutral=0, female=1, male=2).
NEUTRAL, FEMALE, MALE = range(3)

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_gender_ratio(train_set):
    """Return the share of each gender class in a dataset.

    Args:
        train_set: A dataset-like object supporting ``len()`` and column
            access via ``train_set['labels']``, where the column yields the
            integer class id of every row.

    Returns:
        A list of three floats: the fraction of rows labelled 0, 1 and 2
        (in that order). Rows with any other label count towards the total
        but towards none of the three classes. An empty dataset yields
        ``[0.0, 0.0, 0.0]`` instead of raising ``ZeroDivisionError``.
    """
    total = len(train_set)
    if total == 0:
        # Nothing to count; avoid dividing by zero below.
        return [0.0, 0.0, 0.0]

    # Single pass over the label column instead of one full filter pass per class.
    counts = {label_id: 0 for label_id in range(0, 3)}
    for label in train_set['labels']:
        if label in counts:
            counts[label] += 1

    return [counts[label_id] / total for label_id in range(0, 3)]

In [None]:
# Load every md_gender_bias configuration used in this notebook, in the same
# order as the names on the left-hand side.
(
    funpedia,
    wizard,
    image_chat,
    yelp,
    convai,
    open_subtitles,
    light,
) = (
    load_dataset("md_gender_bias", config_name)
    for config_name in (
        "funpedia",
        "wizard",
        "image_chat",
        "yelp_inferred",
        "convai2_inferred",
        "opensubtitles_inferred",
        "light_inferred",
    )
)

### Let us process every dataset into one clean format: `text`, `labels` (0 = neutral, 1 = female, 2 = male)

All datasets share the feature schema defined below.

## Funpedia

In [3]:
# Drop the Funpedia-only columns, then expose `gender` under the shared `labels` name.
# NOTE(review): `funpedia` is rebound here, so re-running this cell operates on the
# already-cleaned dataset rather than on the raw download.
funpedia = funpedia.remove_columns(['persona', 'title'])
funpedia = funpedia.rename_column('gender', 'labels')

funpedia_train, funpedia_val, funpedia_test = (
    funpedia[split_name] for split_name in ('train', 'validation', 'test')
)

Reuse Funpedia's feature schema (`text`, `labels`) for every dataset built below.

In [None]:
# Shared schema for the datasets built below: it is passed as `features=` to
# Dataset.from_dict in each clean_* helper.
# NOTE: this rebinds `features`, shadowing the `features` module imported from
# `datasets` in the first cell.
features = funpedia['train'].features

## Wizard

In [None]:
# Drop the Wizard-only column, then expose `gender` under the shared `labels` name.
# NOTE(review): `wizard` is rebound here, so re-running this cell operates on the
# already-cleaned dataset rather than on the raw download.
wizard = wizard.remove_columns(['chosen_topic'])
wizard = wizard.rename_column('gender', 'labels')

wizard_train, wizard_val, wizard_test = (
    wizard[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
# Class shares [neutral, female, male] of the Funpedia training split.
# NOTE(review): this cell sits under the Wizard heading but inspects
# funpedia_train — confirm whether wizard_train was intended.
get_gender_ratio(funpedia_train)

## Image chat

Image Chat stores gender as two boolean columns, `male` and `female`. Here we convert them to our label format (0 = neutral, 1 = female, 2 = male).

In [None]:
def clean_text(text):
    """Strip the `<start>` / `<eos>` markers (and a leading article) from a caption.

    Removes every occurrence of '<start> a ', '<start> an ', '<start> ' and
    ' <eos>' from `text` and returns the result.
    """
    # Order matters: the article-bearing prefixes must be removed before the
    # bare '<start> ' marker, otherwise the article would be left behind.
    for marker in ('<start> a ', '<start> an ', '<start> ', ' <eos>'):
        text = text.replace(marker, '')
    return text

def clean_image_chat_data(split:str, seed=None):
    """Rebuild one Image Chat split in the shared (text, labels) format.

    Image Chat encodes gender as two boolean columns, which are mapped as:

        male=False female=False -> NEUTRAL (0)
        male=False female=True  -> FEMALE  (1)
        male=True  female=False -> MALE    (2)

    Rows where both flags are True match none of the three filters and are
    therefore dropped.

    Args:
        split: Name of the `image_chat` split to convert
            ('train', 'validation' or 'test').
        seed: Optional seed forwarded to every `shuffle` call. Leave as None
            for the previous non-deterministic behaviour; pass an int to make
            the row order (and the sharded validation subset) reproducible.

    Returns:
        A shuffled Dataset with cleaned captions in `text` and class ids in
        `labels`, using the shared `features` schema.
    """
    source = image_chat[split]

    img_male = source.filter(lambda item: item['male'] == True and item['female'] == False)
    img_female = source.filter(lambda item: item['male'] == False and item['female'] == True)
    img_neutral = source.filter(lambda item: item['male'] == False and item['female'] == False)

    # Labels and texts are concatenated in the same male/female/neutral order
    # so that every caption stays aligned with its label.
    labels = [MALE]*img_male.num_rows + [FEMALE]*img_female.num_rows + [NEUTRAL]*img_neutral.num_rows
    texts = [
        clean_text(caption)
        for subset in (img_male, img_female, img_neutral)
        for caption in subset['caption']
    ]

    final = Dataset.from_dict({'text': texts, 'labels' : labels},features=features)

    # The validation split is too large; keep 1 shard out of 150 for faster processing.
    # Shuffle first so the retained shard is a random sample.
    if split == 'validation':
        final = final.shuffle(seed=seed)
        final = final.shard(num_shards=150, index=0)

    return final.shuffle(seed=seed)

In [None]:
# Convert the three Image Chat splits into the shared (text, labels) format.
image_train, image_val, image_test = (
    clean_image_chat_data(split_name)
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Class shares [neutral, female, male] of the cleaned Image Chat training split.
get_gender_ratio(image_train)

## Yelp

In [None]:
def clean_yelp_data(split:str,certainity:float, seed=None):
    """Rebuild one Yelp split in the shared (text, labels) format.

    Only rows whose `binary_score` exceeds `certainity` are considered.
    `binary_label == 1` becomes MALE and `binary_label == 0` becomes FEMALE;
    male rows must additionally clear a stricter `certainity + 0.15` score.
    Texts containing '<UNK>' are discarded. No NEUTRAL rows are produced.

    Args:
        split: Name of the `yelp` split to convert.
        certainity: Minimum classifier score (a float such as 0.7) a row
            needs in order to be kept.
        seed: Optional seed forwarded to every `shuffle` call. Leave as None
            for the previous non-deterministic behaviour; pass an int to make
            the row order (and the sharded test subset) reproducible.

    Returns:
        A shuffled Dataset with `text` and `labels` columns, using the shared
        `features` schema.
    """
    data = yelp[split].filter(lambda x : x['binary_score'] > certainity)
    yelp_male = data.filter(lambda item: item['binary_label'] == 1 and item['binary_score'] > certainity+0.15)
    yelp_female = data.filter(lambda item: item['binary_label'] == 0)

    # Drop reviews that contain the unknown-token placeholder.
    male_texts = [text for text in yelp_male['text'] if '<UNK>' not in text]
    female_texts = [text for text in yelp_female['text'] if '<UNK>' not in text]

    # Relabel binary_label 0 -> FEMALE (1) and 1 -> MALE (2).
    labels = [MALE]*len(male_texts) + [FEMALE]*len(female_texts)
    # Report how many male / female examples survived the filtering.
    print(len(male_texts))
    print(len(female_texts))

    dictionary = {'text': male_texts+female_texts, 'labels' : labels}
    final = Dataset.from_dict(dictionary,features=features)

    # The test split is too large; keep 1 shard out of 150 for faster processing.
    # Shuffle first so the retained shard is a random sample.
    if split == 'test':
        final = final.shuffle(seed=seed)
        final = final.shard(num_shards=150, index=0)

    return final.shuffle(seed=seed)

In [None]:
# Convert the three Yelp splits, keeping rows scored above 0.7.
yelp_train, yelp_val, yelp_test = (
    clean_yelp_data(split_name, 0.7)
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Class shares [neutral, female, male] of the cleaned Yelp training split.
get_gender_ratio(yelp_train)

## Convai

In [None]:
def clean_convai_data(split:str,certainity:float, seed=None):
    """Rebuild one ConvAI2 split in the shared (text, labels) format.

    `ternary_label` is remapped as 1 -> MALE, 0 -> FEMALE, 2 -> NEUTRAL.
    Male and female rows need `ternary_score > certainity`; neutral rows are
    accepted at the lower threshold `certainity - 0.2`.

    Args:
        split: Name of the `convai` split to convert.
        certainity: Minimum classifier score (a float such as 0.7) for
            gendered rows.
        seed: Optional seed forwarded to `shuffle`. Leave as None for the
            previous non-deterministic behaviour; pass an int for a
            reproducible row order.

    Returns:
        A shuffled Dataset with `text` and `labels` columns, using the shared
        `features` schema.
    """
    data = convai[split]

    male = data.filter(lambda item: item['ternary_label'] == 1 and item['ternary_score'] > certainity)
    female = data.filter(lambda item: item['ternary_label'] == 0 and item['ternary_score'] > certainity)
    neutral = data.filter(lambda item: item['ternary_label'] == 2 and item['ternary_score'] > certainity-0.2)

    # Texts and labels are concatenated in the same male/female/neutral order
    # so that every text stays aligned with its label.
    texts = male['text'] + female['text'] + neutral['text']
    labels = [MALE]*male.num_rows + [FEMALE]*female.num_rows + [NEUTRAL]*neutral.num_rows

    dictionary = {'text': texts, 'labels' : labels}
    final = Dataset.from_dict(dictionary,features=features)

    return final.shuffle(seed=seed)

In [None]:
# Convert the three ConvAI2 splits with a 0.7 score threshold.
convai_train, convai_val, convai_test = (
    clean_convai_data(split_name, 0.7)
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Class shares [neutral, female, male] of the cleaned ConvAI2 training split.
get_gender_ratio(convai_train)

## Opensubtitles

In [None]:
def clean_subtitles_data(split:str,certainity:float, seed=None):
    """Rebuild one OpenSubtitles split in the shared (text, labels) format.

    `ternary_label` is remapped as 1 -> MALE, 0 -> FEMALE, 2 -> NEUTRAL.
    Each class uses its own score threshold: male rows need
    `certainity + 0.05`, female rows `certainity`, and neutral rows
    `certainity - 0.2`.

    Args:
        split: Name of the `open_subtitles` split to convert.
        certainity: Base classifier score threshold (a float such as 0.7).
        seed: Optional seed forwarded to `shuffle`. Leave as None for the
            previous non-deterministic behaviour; pass an int for a
            reproducible row order.

    Returns:
        A shuffled Dataset with `text` and `labels` columns, using the shared
        `features` schema.
    """
    data = open_subtitles[split]

    male = data.filter(lambda item: item['ternary_label'] == 1 and item['ternary_score'] > (certainity+0.05))
    female = data.filter(lambda item: item['ternary_label'] == 0 and item['ternary_score'] > certainity)
    neutral = data.filter(lambda item: item['ternary_label'] == 2 and item['ternary_score'] > certainity - 0.2)

    # Texts and labels are concatenated in the same male/female/neutral order
    # so that every text stays aligned with its label.
    texts = male['text'] + female['text'] + neutral['text']
    labels = [MALE]*male.num_rows + [FEMALE]*female.num_rows + [NEUTRAL]*neutral.num_rows

    dictionary = {'text': texts, 'labels' : labels}
    final = Dataset.from_dict(dictionary,features=features)

    return final.shuffle(seed=seed)

In [None]:
# Convert the three OpenSubtitles splits with a 0.7 base score threshold.
open_subs_train, open_subs_val, open_subs_test = (
    clean_subtitles_data(split_name, 0.7)
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Class shares [neutral, female, male] of the cleaned OpenSubtitles training split.
get_gender_ratio(open_subs_train)

## Light

In [None]:
# Inspect the column schema of the raw LIGHT training split before cleaning it.
light['train'].features

In [None]:
def clean_light_data(split:str,level:str,certainity:float, seed=None):
    """Rebuild one LIGHT split in the shared (text, labels) format.

    `ternary_label` is remapped as 1 -> MALE, 0 -> FEMALE, 2 -> NEUTRAL.
    Male rows need `ternary_score > certainity + 0.05`; female and neutral
    rows need `ternary_score > certainity`.

    Args:
        split: Name of the `light` split to convert.
        level: Currently unused; only the ternary columns are read. Kept so
            existing callers that pass it keep working.
        certainity: Base classifier score threshold (a float such as 0.7).
        seed: Optional seed forwarded to `shuffle`. Leave as None for the
            previous non-deterministic behaviour; pass an int for a
            reproducible row order.

    Returns:
        A shuffled Dataset with `text` and `labels` columns, using the shared
        `features` schema.
    """
    data = light[split]

    male = data.filter(lambda item: item['ternary_label'] == 1 and item['ternary_score'] > (certainity+0.05))
    female = data.filter(lambda item: item['ternary_label'] == 0 and item['ternary_score'] > certainity)
    neutral = data.filter(lambda item: item['ternary_label'] == 2 and item['ternary_score'] > certainity)

    # Texts and labels are concatenated in the same male/female/neutral order
    # so that every text stays aligned with its label.
    texts = male['text'] + female['text'] + neutral['text']
    labels = [MALE]*male.num_rows + [FEMALE]*female.num_rows + [NEUTRAL]*neutral.num_rows

    dictionary = {'text': texts, 'labels' : labels}
    final = Dataset.from_dict(dictionary,features=features)

    return final.shuffle(seed=seed)

In [None]:
# Convert the three LIGHT splits using the ternary labels and a 0.7 base threshold.
light_train, light_val, light_test = (
    clean_light_data(split_name, 'ternary', 0.7)
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Class shares [neutral, female, male] of the cleaned LIGHT training split.
get_gender_ratio(light_train)

In [None]:
# Merge the per-source splits into one shuffled dataset per split.
train_data = concatenate_datasets([
    funpedia_train,
    wizard_train,
    yelp_train,
    convai_train,
    open_subs_train,
    image_train,
    light_train,
]).shuffle()

valid_data = concatenate_datasets([
    funpedia_val,
    wizard_val,
    yelp_val,
    convai_val,
    open_subs_val,
    image_val,
    light_val,
]).shuffle()

test_data = concatenate_datasets([
    funpedia_test,
    wizard_test,
    yelp_test,
    convai_test,
    open_subs_test,
    image_test,
    light_test,
]).shuffle()

In [204]:
# Display the combined training set summary (column names and row count).
train_data

Dataset({
    features: ['text', 'labels'],
    num_rows: 457483
})

In [206]:
# Persist the three combined splits as CSV files in the working directory.
# Only the value returned by the last call is echoed as the cell output.
train_data.to_csv('./train.csv')
valid_data.to_csv('./valid.csv')
test_data.to_csv('./test.csv')

1814904

In [None]:
# Peek at the first example of the shuffled training set.
train_data[0]

In [None]:
# Read the exported validation CSV back in as a sanity check.
# NOTE(review): `data` is a generic name that the clean_* helpers also use as a
# local; presumably the reloaded columns get inferred types rather than the
# shared `features` schema — confirm before reusing this object downstream.
data = load_dataset('csv',data_files='./valid.csv')