# Check transcript and convert to CSV
Standardize the transcript formatting rules, check for mistakes, and create a clean CSV.

In [1]:
import pandas as pd
import os
import re
# Episode number to process; it is formatted (zero-padded to two digits) into
# the data/clean_episodeNN.txt and data/episodeNN.csv file names used below.
EPISODE = 1 # Change as necessary

## Run preliminary tests

In [4]:
def checkFile(file):
    '''Print every line of a transcript file that breaks the format rules.

    A line is accepted when it starts with '[' or '?', or when its first
    space-delimited token is upper case. Any other line is reported with
    its 0-based line number, followed by the line itself.

    file: path of the transcript text file to scan.
    Returns None; the report goes to standard output.
    '''
    with open(file) as handle:
        rows = handle.readlines()

    for number, row in enumerate(rows):
        # Lines that open with a bracket or a question mark are accepted.
        if row.startswith(('[', '?')):
            continue
        # Otherwise the leading token has to be upper case.
        if row.split(' ')[0].isupper():
            continue
        print('Line #{} is a problem'.format(number))
        print(row)

# Run the format check on the cleaned transcript of the selected episode.
transcript_to_check = 'data/clean_episode{:02d}.txt'.format(EPISODE)
checkFile(transcript_to_check)

## Create helper functions

In [5]:
def inbrackets(text):
    '''Return the bracketed annotations found in text, joined by ', '.

    Every non-empty "[...]" group contributes its inner text, without the
    brackets. A string with no such group gives the empty string.

    text: the string to search.
    Raises AssertionError if text contains '[' but no ']'.
    '''
    # Raised explicitly instead of through `assert` so the check still runs
    # under `python -O`; the exception type and message are unchanged.
    if '[' in text and ']' not in text:
        raise AssertionError('Problem! ' + text)
    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    return ', '.join(re.findall(r'\[([^\]]+)\]', text))
def outbrackets(text):
    '''Return text with every "[...]" annotation removed and spacing tidied.

    Non-empty bracketed groups are deleted, runs of spaces left behind are
    collapsed to a single space, and leading/trailing whitespace is stripped.

    text: the string to clean.
    Raises AssertionError if text contains '[' but no ']'.
    '''
    # Raised explicitly instead of through `assert` so the check still runs
    # under `python -O`; the exception type and message are unchanged.
    if '[' in text and ']' not in text:
        raise AssertionError('Problem! ' + text)
    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    without_cues = re.sub(r'\[([^\]]+)\]', '', text)
    # Collapse whole runs of spaces. Replacing only exact pairs left a double
    # space behind when two cues sat next to each other ("a [x] [y] b").
    return re.sub(r' {2,}', ' ', without_cues).strip()

## Parse transcript into dataframe

Dataframe structure:
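* episode: episode number
* line: 0-based line number within the transcript file
* talking: person talking
* talking_to: person being spoken to ('aside' for testimony, 'self' for exclamations)
* words: non-bracketed material
* context: bracketed material
* type: dialogue / sound / testimony / exclamation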

In [20]:
# Turn each transcript line into a record (dict), then build the dataframe.
allData = []
transcript_path = 'data/clean_episode{:02d}.txt'.format(EPISODE)
with open(transcript_path) as transcript:
    for idx, raw in enumerate(transcript):
        record = {'episode': EPISODE, 'line': idx}

        if raw.startswith('['):
            # A line that opens with '[' must be one bracketed cue and nothing
            # else: the cue text plus its two brackets spans the stripped line.
            cue = inbrackets(raw)
            assert len(cue) + 2 == len(raw.strip()), 'Line #{} is a problem\n{} {}'.format(idx, raw, cue)
            record.update({
                'talking': None,
                'talking_to': None,
                'words': None,
                'context': cue,
                'type': 'sound',
            })
            allData.append(record)
            continue

        # Spoken line: "<speaker part>: <content>" with exactly one ': '.
        parts = raw.split(': ')
        assert len(parts) == 2, 'Line #{} is a problem\n{}'.format(idx, raw)
        speaker, content = parts

        if '[' in speaker:
            # The speaker part carries a bracketed tag after the name.
            record['talking'] = speaker.split(' ')[0].lower()
            tag = inbrackets(speaker).lower()
            if tag == 'in exclamation':
                record['talking_to'] = 'self'
                record['type'] = 'exclamation'
            else:
                record['talking_to'] = tag[3:]  # drop 'to'
                record['type'] = 'dialogue'
        else:
            # No bracketed tag on the speaker part: recorded as testimony.
            record['talking'] = speaker.lower()
            record['talking_to'] = 'aside'
            record['type'] = 'testimony'

        # Split the content into spoken words and bracketed context.
        record['words'] = outbrackets(content)
        record['context'] = inbrackets(content)
        allData.append(record)

df = pd.DataFrame(allData)

## Last quality spot check and reorder columns

In [21]:
# Spot check: list each distinct speaker so typos or unexpected names stand out.
speakers = [name for name in df.talking.unique() if name is not None]
print(sorted(speakers))

['???', 'alexb', 'alexd', 'angelique', 'annie', 'bri', 'caelynn', 'caitlin', 'cassie', 'catherine', 'chris', 'colton', 'courtney', 'demi', 'devin', 'elyse', 'erika', 'erin', 'hannahb', 'hannahg', 'heather', 'jane', 'katie', 'kirpa', 'laura', 'nicole', 'nina', 'onyeka', 'revian', 'sydney', 'tahzjuan', 'tayshia', 'tracy', 'women']


In [22]:
# Spot check: list each distinct addressee so typos or unexpected names stand out.
addressees = [name for name in df.talking_to.unique() if name is not None]
print(sorted(addressees))

['???', 'alexb', 'alexd', 'all', 'angelique', 'annie', 'aside', 'bri', 'caelynn', 'caitlin', 'cassie', 'catherine', 'chris', 'colton', 'courtney', 'demi', 'devin', 'dog', 'elyse', 'erika', 'erin', 'hannahb', 'hannahg', 'heather', 'jane', 'katie', 'kirpa', 'laura', 'nicole', 'nina', 'onyeka', 'producer', 'revian', 'self', 'sydney', 'tahzjuan', 'tayshia', 'tracy', 'women']


In [23]:
# Arrange the columns in their final CSV order.
column_order = ['line', 'type', 'context', 'talking', 'talking_to', 'words', 'episode']
df = df[column_order]
# Show up to 50 random rows; capping at len(df) keeps the spot check from
# raising ValueError on a transcript with fewer than 50 lines.
df.sample(min(50, len(df)))

Unnamed: 0,line,type,context,talking,talking_to,words,episode
713,713,dialogue,,catherine,colton,"Yeah, absolutely.",1
342,342,dialogue,,colton,laura,You look fantastic.,1
303,303,testimony,,nicole,aside,"Like, I had no idea, I didn't get the debutant...",1
718,718,sound,laughs,,,,1
46,46,dialogue,,colton,chris,I remember my walkup still.,1
416,416,dialogue,,colton,erin,You know how to make an entrance.,1
105,105,dialogue,,caelynn,colton,So I am miss North Carolina.,1
231,231,dialogue,,onyeka,colton,You are going to kill it. You are going to kil...,1
6,6,testimony,,katie,aside,"Hi, my name is katie, 26 years old and I am fr...",1
152,152,dialogue,,cassie,colton,"I have so many butterflies. No, I literally ha...",1


## Convert dataframe to CSV

In [24]:
# Write this episode's dataframe to its CSV file, leaving out the index column.
episode_csv_path = 'data/episode{:02d}.csv'.format(EPISODE)
df.to_csv(episode_csv_path, index=False)

## Combine all CSVs

In [27]:
# Gather every per-episode CSV in data/ (in sorted file-name order), print
# each file name as it is read, and write the stacked rows to one combined CSV.
episode_frames = []
for filename in sorted(os.listdir('data')):
    # Only files named episode*.csv are per-episode outputs.
    if not (filename.startswith('episode') and filename.endswith('.csv')):
        continue
    print(filename)
    episode_frames.append(pd.read_csv('data/' + filename, index_col=None, header=0))
df_big = pd.concat(episode_frames, axis=0, ignore_index=True)
df_big.to_csv('data/allData.csv', index=False)

episode01.csv
episode02.csv
episode03.csv
episode04.csv
