<a href="https://colab.research.google.com/github/kruttikajain/ACL2019-Reducing-Gender-Bias-in-Word-Level-Language-Models-Using-A-Gender-Equalizing-Loss-Function/blob/master/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd /content/gdrive/My Drive/Colab Notebooks/MSProject

/content/gdrive/My Drive/Colab Notebooks/MSProject


In [3]:
pip install unidecode



In [4]:
pip install jupyter_argparser


[31mERROR: Could not find a version that satisfies the requirement jupyter_argparser (from versions: none)[0m
[31mERROR: No matching distribution found for jupyter_argparser[0m


In [5]:
# coding: utf-8
import os
import multiprocessing as mp
import re
import ctypes
import argparse
import struct
import pickle
import gzip
import spacy
from unidecode import unidecode
from io import BytesIO
from keras.preprocessing.text import Tokenizer
import gc 

en = spacy.load('en')


def is_valid_token(w):
    """
    Returns True if a token is valid
    """
    return bool(re.search('[a-zA-Z0-9,.!?<>\']+', w))
    #    !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~)


def transform_token(w):
    """
    Transforms a token by making lowercase, and for numeric tokens replaces
    digits with placeholders
    """
    #替换 非 A-Za-z <>$. - ‘，包括space
    return re.sub(r'[^A-Za-z,.!?\'<>]', '', 
            re.sub(r'\d+', '<NUM>',
                unidecode(w).lower()))



# def is_valid_token(w):
#     """
#     Returns True if a token is valid
#     """
#     return bool(re.search('[a-zA-Z0-9,.!?]+', w))


# def transform_token(w):
#     """
#     Transforms a token by making lowercase, and for numeric tokens replaces
#     digits with placeholders
#     """
#     return re.sub(r'[.\-\']+$', '',
#       re.sub(r'[.\-\']+', '',
#         re.sub(r'[^A-Za-z<>$.\-\']', '',
#             re.sub(r'\d+', '<NUM>',
#                 unidecode(w).lower()))))


def preprocess_file(filepath):
    """
    Preprocesses a file by splitting it into sentences and tokenizing it
    """
    # Open file
    try:
        with open(filepath, 'r') as f:
            text = f.read()
    except UnicodeDecodeError as e:
        try:
            # Account for some files that may be encoded with ISO-8859-1
            with open(filepath, 'r', encoding='iso-8859-1') as f:
                text = f.read()
        except UnicodeDecodeError as e:
            msg = "Could not open {}: {}".format(filepath, str(e))
            raise Exception(msg)


    # Remove any additional information e.g. "@highlights"
    main_text_body = text.split('\n@')[0]

    # Split up lines, and then break up lines into sentences
    sentences = []
    for line in main_text_body.split('\n\n'):
        sentences += list(en(line.strip('\n')).sents)

    # Get tokens for each sentence
    tokens = set()
    sentence_tokens = []
    for sent in sentences:
        sent_tokens = []
        for w in sent:
            if not is_valid_token(w.text):
                continue
            w = transform_token(w.text)
            sent_tokens.append(w)
            tokens.add(w)
        if len(sent_tokens) > 1:
            sentence_tokens.append(sent_tokens)

    return sentence_tokens, tokens


def write_preprocessed_file(encoded_sentences, output_path):
    """
    Write encoded sentences to a binary file
    """
    num_sentences = len(encoded_sentences)
    offset = 0
    uint_size = ctypes.sizeof(ctypes.c_uint)

    # Get total file size
    total_size = sum([(len(sent)+1) * uint_size for sent in encoded_sentences]) - uint_size

    buf = ctypes.create_string_buffer(total_size)
    for sent_idx, sent in enumerate(encoded_sentences):
        # Encode words as unsigned ints
        for idx in sent:
            struct.pack_into('I', buf, offset, idx + 1)
            offset += uint_size

        # Add a sentence delimter
        if sent_idx != (num_sentences - 1):
            struct.pack_into('I', buf, offset, 0)
            offset += uint_size

    # Gzip and save
    with gzip.open(output_path, 'wb') as f:
        f.write(buf)


def encode_sentences(sentences, word_to_idx):
    """
    Encode tokens in sentences by vocab indices
    """
    tokenized = []
    for sent in sentences:
        sentence2idx = []
        for w in sent:
            try:
                sentence2idx.append(word_to_idx[w])
            except:
                sentence2idx.append(word_to_idx['_unk_'])
        tokenized.append(sentence2idx)

    return tokenized


def read_preprocessed_file(filepath, vocab):
    """
    Reads a preprocessed text file. Returns a list of sentences, where
    each sentence is a list of tokens.
    """
    # Get binary string
    with gzip.open(filepath, 'rb') as f:
        buf = f.read()

    sentences = []
    sent = []
    for (val,) in struct.iter_unpack('I', buf):
        if val > 0:
            # Get words for the current sentence
            sent.append(vocab[val-1])
        else:
            # We've reached the end of the sentence
            sentences.append(sent)
            sent = []

    return sentences


def read_preprocessed_file_as_str(filepath, vocab, sent_delim='<eos>'):  #version2: no <eos> 
    """
    Reads a preprocessed text file. Returns a list of sentences, where
    each sentence is a list of tokens.
    """
    # Get binary string
    with gzip.open(filepath, 'rb') as f:
        buf = f.read()

    res = ""
    first_word = True
    for (val,) in struct.iter_unpack('I', buf):
        if not first_word:
            res += ' '
        else:
            first_word = False

        if val > 0:

            # Get words for the current sentence
            res += vocab[val-1]
        else:
            # We've reached the end of the sentence
            res += sent_delim
    res += ' ' + sent_delim

    return res


def load_preprocesed_dataset(pp_dataset_dir, sent_delim='<eos>', vocab_path=None): #version2: no <eos> 
    if not vocab_path:
        vocab_path = os.path.join(pp_dataset_dir, 'VOCAB_t.txt')

    vocab = read_vocab(vocab_path)

    data_dir = os.path.join(pp_dataset_dir, 'Data/sample_stories')

    res = ""
    for idx, fname in enumerate(os.listdir(data_dir)):
        if idx != 0:
            res += ' '
        filepath = os.path.join(data_dir, fname)
        res += read_preprocessed_file_as_str(filepath, vocab, sent_delim='<eos>')

    return res


def read_vocab(vocab_path):
    """
    Read a vocabulary file. Returns a list of words
    """
    vocab = []
    with open(vocab_path, 'r') as f:
        for line in f:
            vocab.append(line.strip('\n'))

    return vocab


def preprocess_worker(args):
    """
    Multiprocessing worker for preprocessing a text file
    """
    txt_path, dataset_dir, output_data_dir = args
    basename, ext = os.path.splitext(os.path.basename(txt_path))
    out_prefix = os.path.dirname(txt_path).replace(dataset_dir, '').replace('/', '_')
    if out_prefix:
        out_prefix += '_'
    out_path = os.path.join(output_data_dir, '{}{}.bin'.format(out_prefix, basename))

    sentences, tokens = preprocess_file(txt_path)

    return out_path, sentences, tokens

def save_worker(args):
    """
    Multiprocessing worker for saving a preprocessed file
    """
    output_path, sentences, word_to_idx = args

    sentences = encode_sentences(sentences, word_to_idx)
    write_preprocessed_file(sentences, output_path)


def preprocess_dataset(dataset_dir, output_dir, vocabsize=50000, target_ext='.txt', num_workers=1):
    """
    Preprocesses a dataset by splitting each file into sentences, tokenizing
    each sentence, encoding the files, and saving them.
    """
    dataset_dir = os.path.abspath(dataset_dir)
    output_dir = os.path.abspath(output_dir)
    output_data_dir = os.path.join(output_dir, 'data')
    
    print("voc_size is", vocabsize)
    if not os.path.isdir(dataset_dir):
        raise ValueError('Dataset directory {} does not exist'.format(dataset_dir))

    if not os.path.isdir(output_data_dir):
        os.makedirs(output_data_dir)

    worker_args = []

    print("Getting list of files...")
    # Get list of txt files
    for root, dirs, files in os.walk(dataset_dir):
        root = os.path.abspath(root)
        for fname in files:
            basename, ext = os.path.splitext(fname)

            # if ext.lower() != target_ext.lower():
            #     continue

            # if basename.lower() == 'readme':
            #     continue

            txt_path = os.path.join(root, fname)

            worker_args.append((txt_path, dataset_dir, output_data_dir))

    pool = mp.Pool(num_workers)

    print("Preprocessing files...")
    output_paths = []
    articles = []
    wholetokens = []
    num_files = len(worker_args)
    # Preprocess each file and get the tokens in each file
    for idx, (out_path, sentences, tokens) in enumerate(pool.imap_unordered(preprocess_worker, worker_args)):
        print("idx", idx)
        output_paths.append(out_path)
        articles.append(sentences)
        wholetokens += sentences

        if ((idx+1) % 1000) == 0:
            print("Preprocessed {}/{} files".format(idx+1, num_files))

    pool.close()
    pool.join()

    # create the tokenizer
    #voc_size = vocabsize #in keras,it builds a voc from 1-45000  original 47885
    voc_size =50000
    t = Tokenizer(num_words=voc_size,oov_token = '_unk_',filters='!"$,-./:;<>?\t\n') 
    # fit the tokenizer on the documents
    t.fit_on_texts(wholetokens)
    # summarize what was learned
    vocab = []
    #in keras,t.word_index.items() is ordered dic, i.e {'work': 1, 'well': 2, 'done': 3, 'good': 4}
    #print
    for key, value in t.word_index.items():
      try:
        if value<voc_size:
            vocab.append(key)
        else:
            break
      except:
        print("key:",key, "value",value,"voc_size:",voc_size )
    vocab.insert(0, '_pad_')
    print(len(vocab))

    # Sort vocab and make into a list
    print("Saving vocab...")
    vocab = list(sorted(vocab))
    word_to_idx = {w: idx for (idx, w) in enumerate(vocab)}

    # Write vocab to disk
    vocab_path = os.path.join(output_dir, 'VOCAB.txt')
    with open(vocab_path, 'w') as f:
        f.write('\n'.join(vocab))

    # Encode preprocessed files and write them to disk
    worker_args = [(output_path, sentences, word_to_idx)
                   for output_path, sentences in zip(output_paths, articles)]

    print("Saving files...")
    pool = mp.Pool(num_workers)
    for idx, _ in enumerate(pool.imap_unordered(save_worker, worker_args)):
        if ((idx+1) % 1000) == 0:
            print("Saved {}/{} files".format(idx+1, num_files))
    pool.close()
    pool.join()

    print("Done.")





    

In [6]:
'''def parse_arguments():
    """
    Get command line arguments
    """
    parser = argparse.ArgumentParser(description='Preprocess text data into lists of tokens')
    parser.add_argument('dataset_dir', help='Path to directory containing text files', type=str)
    parser.add_argument('output_dir', help='Path to output directory', type=str)
    parser.add_argument('target_ext', help='Extension of relevant text files', type=str)
    parser.add_argument('vocabsize', help='Vocabulary size', type=int, default=50000)
    parser.add_argument('-n', '--num-workers', dest='num_workers', type=int, default=1, help='Number of workers')
    return vars( parser.parse_args(args=[]))
    #args, unknown = parser.parse_known_args()

    #args = parser.parse_args(args=[])'''


'def parse_arguments():\n    """\n    Get command line arguments\n    """\n    parser = argparse.ArgumentParser(description=\'Preprocess text data into lists of tokens\')\n    parser.add_argument(\'dataset_dir\', help=\'Path to directory containing text files\', type=str)\n    parser.add_argument(\'output_dir\', help=\'Path to output directory\', type=str)\n    parser.add_argument(\'target_ext\', help=\'Extension of relevant text files\', type=str)\n    parser.add_argument(\'vocabsize\', help=\'Vocabulary size\', type=int, default=50000)\n    parser.add_argument(\'-n\', \'--num-workers\', dest=\'num_workers\', type=int, default=1, help=\'Number of workers\')\n    return vars( parser.parse_args(args=[]))\n    #args, unknown = parser.parse_known_args()\n\n    #args = parser.parse_args(args=[])'

In [7]:
class Args:
  dataset_dir =  '/content/gdrive/My Drive/Colab Notebooks/MSProject/Data/sample_stories'
  output_dir =  '/content/gdrive/My Drive/Colab Notebooks/MSProject/Output'
  target_ext =  '.txt'
  vocabsize = 50000
  num_workers = 1

#args=Args()

In [9]:
a=Args()

In [10]:
print(a.dataset_dir)

/content/gdrive/My Drive/Colab Notebooks/MSProject/Data/sample_stories


In [11]:
#import sys
#sys.argv = ['/content/gdrive/My Drive/Colab Notebooks/MSProject', '/content/gdrive/My Drive/Colab Notebooks/MSProject', 'txt', 50000]

In [12]:
if __name__ == '__main__':
    preprocess_dataset(a.dataset_dir, a.output_dir, a.target_ext, a.vocabsize, a.num_workers)

voc_size is .txt
Getting list of files...
Preprocessing files...
idx 0
idx 1
idx 2
idx 3
idx 4
idx 5
idx 6
idx 7
idx 8
idx 9
idx 10
idx 11
idx 12
idx 13
idx 14
idx 15
idx 16
idx 17
idx 18
idx 19
idx 20
idx 21
idx 22
idx 23
idx 24
idx 25
idx 26
idx 27
idx 28
idx 29
idx 30
idx 31
idx 32
idx 33
idx 34
idx 35
idx 36
idx 37
idx 38
idx 39
idx 40
idx 41
idx 42
idx 43
idx 44
idx 45
idx 46
idx 47
idx 48
idx 49
idx 50
idx 51
idx 52
idx 53
idx 54
idx 55
idx 56
idx 57
idx 58
idx 59
idx 60
idx 61
idx 62
idx 63
idx 64
idx 65
idx 66
idx 67
idx 68
idx 69
idx 70
idx 71
idx 72
idx 73
idx 74
idx 75
idx 76
idx 77
idx 78
idx 79
idx 80
idx 81
idx 82
idx 83
idx 84
idx 85
idx 86
idx 87
idx 88
idx 89
idx 90
idx 91
idx 92
idx 93
idx 94
idx 95
idx 96
idx 97
idx 98
idx 99
idx 100
idx 101
idx 102
idx 103
idx 104
idx 105
idx 106
idx 107
idx 108
idx 109
idx 110
idx 111
idx 112
idx 113
idx 114
idx 115
idx 116
idx 117
idx 118
idx 119
idx 120
idx 121
idx 122
idx 123
idx 124
idx 125
idx 126
idx 127
idx 128
idx 129
idx 1

In [None]:
#!python3 preprocess.ipynb dataset_dir /content/gdrive/My Drive/Colab Notebooks/MSProject/sample_stories

In [None]:
%tb

In [None]:
#python -u preprocess.py ../data/sample_stories_cda ../data/preprocessed_cda .txt 50000 -n 32