 # Data Pre-processing 

In [1]:
import re
import string as st

### Define global parameters

Text files are named as '{gender}_{number}.txt', gender taking values of 'h' for men, 's' for women and 't' for non-binary gendered people. For example, s_6.txt is the story generated for a woman CEO.
- PATH: Location of the files.
- GENDER: 'h', 's', 't' defined as above.
- FROM_WEB: If text files were collected from web.
- NUM_TO_LOAD: Number of STORIES to load.
- MAIN_PRONOUN: A dictionary of pronouns for the main character - CEO. This is used for changing CEO names into pronouns.
- OTHER_PRONOUN: A dictionary of original pronouns and target pronouns for non-CEO. Since all the texts are trained together, it is necessary to differentiate between CEO pronouns and non-CEO pronouns.


If text files were collected from web:
- START_NUM: File number to start with.

If text files were collected from terminal/shell:
- START_FILE: File number to start with.
- NUM_FILES: Number of FILES to load.
- TITLE: Used for title deletion.




In [2]:
# --- Input selection ---------------------------------------------------------
PATH = ''             # prefix prepended to every input path

FROM_WEB = 0          # 1: read 'raw_from_web/' files, 0: read 'raw_from_wrapper/' files
GENDER = 't'          # 'h', 's' or 't'; also selects the pronoun tables below
START_NUM = 1         # first file number when FROM_WEB == 1
NUM_TO_LOAD = 100     # number of stories expected in the final output

START_FILE = 5        # first file number when FROM_WEB == 0
NUM_FILES = 2         # how many 'raw_from_wrapper/new_*' files to load

SAVE_NAME = 't301-400'    # output file name without the '.txt' extension

# --- Pronoun tables ----------------------------------------------------------
# Pronoun pair as it appears in the story prompt, keyed by gender code.
TITLE = dict(s='she/her', h='he/him', t='they/them')

# [possessive, subject] pronoun that replaces the CEO's name.
MAIN_PRONOUN = {
    's': ['her', 'she'],
    'h': ['his', 'he'],
    't': ['their', 'they'],
}


def _marked(words):
    """Pair *words* with their marked forms (last letter doubled: 'he' -> 'hee')."""
    return [list(words), [word + word[-1] for word in words]]


_THIRD_PLURAL = ('they', 'them', 'their', 'theirs')

# [original pronouns, marked replacements] for characters other than the CEO.
OTHER_PRONOUN = {
    's': _marked(('he', 'him', 'his') + _THIRD_PLURAL),
    'h': _marked(('she', 'her', 'hers') + _THIRD_PLURAL),
    't': _marked(('he', 'him', 'his', 'she', 'her', 'hers')),
}


### Functions for preprocessing

#### Below are three functions for preprocessing data from the web.
- **del_prompt_end()**: Delete the prompt and "The end.". All of the stories have the prompt in the first line; some of them end with "The end."
- **get_ceo_name()**: Extract CEO names to prepare for replacement in the next function.
- **replace_name_pronoun()**: replace CEO name with related pronouns and replace non-CEO pronouns with irregular ones. For example, if a man, Jack, is the CEO, then this function will let 'Jack' -> 'he'; 'Jack's' -> 'his'. And the other pronouns she/her... and  they/their/... are replaced with shee/herr/... and theyy/theirr/...


In [3]:
def del_prompt_end(story):   # when FROM_WEB = 1
    """Strip the prompt, the closing "The end." and unwanted characters.

    Keeps the text that follows the first dashed separator line (up to the
    next separator, if any) and precedes the first "The end.", turns
    newlines into spaces, then deletes digits and every ASCII punctuation
    character except '.' and '-'.

    Args:
        story: Raw text of one web-collected file (prompt, separator, story).

    Returns:
        The cleaned, single-line story.

    Raises:
        IndexError: If ``story`` does not contain the dashed separator.
    """
    # The backslash is spelled '\\' explicitly: a bare backslash before '('
    # is an invalid escape sequence and triggers a SyntaxWarning on newer
    # Python versions.
    unwanted = '!"#$%&\\()*+,/:;<=>\'?@[]^_`{|}~\n0123456789'
    story = story.split("--------------------------")[1]
    story = story.split("The end.")[0].replace('\n', ' ').strip()
    return story.translate(str.maketrans('', '', unwanted))
    
def get_ceo_name(story):
    """Return the CEO's name found in the first sentence of ``story``.

    A candidate is any word after the first one that starts with an
    uppercase letter immediately followed by a lowercase letter, so
    all-caps words such as "CEO" and one-letter words such as "I" are
    skipped. The first word is ignored because it is capitalised anyway.

    Args:
        story: Story text; the first sentence is everything before the
            first '.'.

    Returns:
        The single candidate word, or the sentinel string 'NAN' when the
        first sentence holds no candidate or more than one.
    """
    first_sentence = story.split(".")[0].split()
    # Slices instead of indexes: a one-letter capital ("I", "A") has no
    # second character, and word[1] would raise IndexError on it.
    candidates = [word for word in first_sentence[1:]
                  if word[:1].isupper() and word[1:2].islower()]
    return candidates[0] if len(candidates) == 1 else 'NAN'
    
def replace_name_pronoun(story, rep_name=True, rep_pronoun=True,\
                         main=MAIN_PRONOUN[GENDER], other=OTHER_PRONOUN[GENDER]):
    """Replace the CEO's name and mark other characters' pronouns in one story.

    The story is lower-cased and its first sentence (everything up to and
    including the first '.') is dropped before any replacement happens.

    Args:
        story: Cleaned story text whose first sentence names the CEO.
        rep_name: When true, replace the CEO's name with ``main`` pronouns.
        rep_pronoun: When true, replace each pronoun in ``other[0]`` with
            the entry at the same position in ``other[1]``.
        main: ``[possessive, subject]`` pronouns for the CEO.
        other: ``[originals, replacements]`` pronoun lists for non-CEO
            characters.

    Returns:
        The processed story text.

    Raises:
        IndexError: If ``story`` contains no '.'.

    Note:
        If ``get_ceo_name`` cannot identify a name it returns 'NAN', and
        the word "nan" would then be replaced; callers are expected to
        check for that sentinel first.
    """
    name = get_ceo_name(story).lower()
    story = story.lower().split(".", 1)[1]
    if rep_pronoun:
        for plain, marked in zip(other[0], other[1]):
            # Same match rule as before (space before, space or '.' after),
            # but lookarounds do not consume the surrounding characters, so
            # two adjacent pronouns (" he he ") are both replaced.
            pattern = r'(?<= )' + re.escape(plain) + r'(?=[ .])'
            story = re.sub(pattern, lambda _m: marked, story)
    if rep_name:
        # Match whole words only, so a name embedded in a longer word
        # (e.g. "tom" inside "customer") is left untouched.
        whole_word = r'(?<!\w)' + re.escape(name) + r'{}(?!\w)'
        # Possessive first: it is the name followed by "s" with no apostrophe.
        story = re.sub(whole_word.format('s'), lambda _m: main[0], story)
        story = re.sub(whole_word.format(''), lambda _m: main[1], story)

    return story

#### Below are two functions for preprocessing data from the chatgpt wrapper.
- **del_extra()**: Delete irrelevant content, including prompts, the command for starting a conversation, etc. Then remove numbers and punctuation.
- **replace_names_pronouns()**: replace CEO name with related pronouns and replace non-CEO pronouns with irregular ones. For example, if a man, Jack, is the CEO, then this function will let 'Jack' -> 'he'; 'Jack's' -> 'his'. And the other pronouns she/her... and  they/their/... are replaced with shee/herr/... and theyy/theirr/...


In [4]:
def del_extra(story, title=TITLE[GENDER], min_length=1000):  # when FROM_WEB = 0
    """Split one wrapper transcript into a list of cleaned stories.

    The transcript is split on the story prompt; pieces that are not longer
    than ``min_length`` characters are discarded. From each remaining piece
    the '!new' command, the '• New conversation started.' banner and
    'The end.' are removed, newlines become spaces, and digits and every
    ASCII punctuation character except '.' and '-' are deleted.

    Args:
        story: Full text of one wrapper output file.
        title: Pronoun pair used in the prompt, e.g. 'they/them'.
        min_length: Pieces of this many characters or fewer are dropped.

    Returns:
        A list with one cleaned string per story.
    """
    # Built once for all pieces. The backslash is spelled '\\' explicitly:
    # a bare backslash before '(' is an invalid escape sequence.
    unwanted = str.maketrans('', '', '!"#$%&\\()*+,/:;<=>\'?@[]^_`{|}~0123456789')
    prompt = f'write a story about a CEO with {title} pronouns'

    stories = []
    for piece in story.split(prompt):
        if len(piece) <= min_length:
            continue
        # Markers go first: they contain '!' and '.', which must still be
        # intact for the literal match.
        for marker in ('!new', '• New conversation started.', 'The end.'):
            piece = piece.replace(marker, '')
        # Newlines become spaces (as in del_prompt_end) so that words and
        # paragraphs on either side of a line break are not glued together.
        stories.append(piece.replace('\n', ' ').translate(unwanted).strip())
    return stories    # list of stories

def replace_names_pronouns(story, rep_name=True, rep_pronoun=True,\
                         main=MAIN_PRONOUN[GENDER], other=OTHER_PRONOUN[GENDER]):
    """Replace CEO names and mark other characters' pronouns in several stories.

    Every story is lower-cased. The CEO name of each story is collected
    with ``get_ceo_name`` and every collected name is replaced in every
    story.

    Args:
        story: List of cleaned story strings.
        rep_name: When true, replace CEO names with ``main`` pronouns.
        rep_pronoun: When true, replace each pronoun in ``other[0]`` with
            the entry at the same position in ``other[1]``.
        main: ``[possessive, subject]`` pronouns for the CEO.
        other: ``[originals, replacements]`` pronoun lists for non-CEO
            characters.

    Returns:
        A new list of processed story strings, in the input order.

    Note:
        If ``get_ceo_name`` fails for a story it returns 'NAN', and the
        word "nan" would then be replaced everywhere; callers are expected
        to check for that sentinel first.
    """
    # Names are taken from the argument, not from a notebook-level variable,
    # so the function works no matter what the caller's list is called.
    # Longest names first, in a fixed order, so a name that is another name
    # plus "s" is handled before the shorter one and results are repeatable.
    # NOTE(review): names found in one story are replaced in all stories;
    # confirm this cross-story replacement is intended.
    names = sorted({get_ceo_name(text).lower() for text in story},
                   key=lambda n: (-len(n), n))
    story = [text.lower() for text in story]
    if rep_name:
        for name in names:
            # Whole-word matches only, so a name embedded in a longer word
            # is left untouched. The possessive is the name followed by
            # "s" with no apostrophe.
            possessive = re.compile(r'(?<!\w)' + re.escape(name) + r's(?!\w)')
            bare = re.compile(r'(?<!\w)' + re.escape(name) + r'(?!\w)')
            story = [possessive.sub(lambda _m: main[0], text) for text in story]
            story = [bare.sub(lambda _m: main[1], text) for text in story]
    if rep_pronoun:
        for plain, marked in zip(other[0], other[1]):
            # Same match rule as before (space before, space or '.' after),
            # but lookarounds do not consume the surrounding characters, so
            # two adjacent pronouns (" he he ") are both replaced.
            pattern = re.compile(r'(?<= )' + re.escape(plain) + r'(?=[ .])')
            story = [pattern.sub(lambda _m: marked, text) for text in story]

    return story

### Read and process text files 
Here are two chunks of code that apply the functions above to read and process the stories.


In [5]:
if FROM_WEB == 1:
    # One story per file: raw_from_web/<GENDER><number>.txt
    l = []
    for i in range(START_NUM, START_NUM + NUM_TO_LOAD):
        with open(f'{PATH}raw_from_web/{GENDER}{i}.txt') as f:
            contents = del_prompt_end(f.read())

        # Stop at the first story whose CEO name cannot be identified.
        if get_ceo_name(contents) == 'NAN':
            print('ERROR! WRONG CEO NAME')
            break

        # Hyphens become spaces only after the name/pronoun replacement.
        contents = replace_name_pronoun(contents)
        l.append(contents.replace('-', ' ').strip())
    

In [6]:
if FROM_WEB == 0:
    # Several stories per file: raw_from_wrapper/new_<GENDER><number>.txt
    l = []
    for i in range(START_FILE, START_FILE + NUM_FILES):
        with open(f'{PATH}raw_from_wrapper/new_{GENDER}{i}.txt') as f:
            contents = del_extra(f.read())

        # Stop at the first file containing a story with no identifiable CEO name.
        names = {get_ceo_name(text) for text in contents}
        if 'NAN' in names:
            print('ERROR! WRONG CEO NAME')
            break

        contents = replace_names_pronouns(contents)
        # Hyphens become spaces and the first sentence of each story is dropped.
        contents = [text.replace('-', ' ').split(".", 1)[1].strip() for text in contents]
        l.extend(contents)
    

### Export  processed stories into a new file
#### The processed file contains NUM_TO_LOAD stories in the following format:
- Lowercased
- Punctuationless except '.'
- CEO names are replaced with related pronouns
- Pronouns of other characters are replaced with irregular ones 
- Two adjacent stories are separated by '\n'

In [7]:
# Write the output only when exactly NUM_TO_LOAD stories were collected;
# otherwise report how many there are and write nothing.
if len(l) != NUM_TO_LOAD:
    print(f'ONLY {len(l)} IN HERE')
else:
    with open(f'{SAVE_NAME}.txt', 'w') as ff:
        # One story per line.
        ff.write('\n'.join(l))
        print('SAVED!')

SAVED!
