## Counts or clips data specified
These functions count how many times a regular expression occurs in the data. They also clip the audio and TextGrid files down to a window around each occurrence.

### Imports

In [3]:
import re
import pandas as pd
from praatio import textgrid
from pydub import AudioSegment
from pathlib import Path

### count

**count** looks at a CSV file and counts all instances of the specified regular expression in its `text` column. If a row contains multiple instances, each one is counted.

**Inputs**:
- string&nbsp;&nbsp;&nbsp;*filename*
- string&nbsp;&nbsp;&nbsp;*string*

**Outputs**: (int) the count of instances

In [1]:
def count(filename, string):
    """Count every match of a regular expression in a CSV's ``text`` column.

    Args:
        filename: Path (or file-like object) of a CSV file readable by
            ``pandas.read_csv``. It must contain a ``text`` column.
        string: Regular expression pattern to search for.

    Returns:
        int: Total number of non-overlapping matches over all rows. A row
        containing several matches contributes one count per match.
    """
    pattern = re.compile(string)
    # pandas reads empty cells as NaN (a float), which ``re`` cannot search;
    # drop them so a single blank row does not abort the whole count.
    texts = pd.read_csv(filename)['text'].dropna()
    return sum(len(pattern.findall(text)) for text in texts)

### count_diff

**count_diff** looks at two CSV files and finds the difference between the two representations. For the Seoul Corpus data, it can be used to find the differences between the orthographic and pronounced data. 

**Inputs**:
- string&nbsp;&nbsp;&nbsp;*filename1*
- string&nbsp;&nbsp;&nbsp;*filename2*
- string&nbsp;&nbsp;&nbsp;*string1*
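- string&nbsp;&nbsp;&nbsp;*string2*
- int&nbsp;&nbsp;&nbsp;*i_start* (optional, default 0) offset from the start of each match at which the compared substring begins
- int&nbsp;&nbsp;&nbsp;*i_end* (optional, default 0) offset from the start of each match at which the compared substring ends; with both offsets left at 0 the compared substrings are empty, so no differences are reported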

**Outputs**: (int, int) tuple of number of instances and number of differences. 

In [None]:
def _match_window(text, match, i_start, i_end):
    """Return the slice of *text* at the given offsets from the match start."""
    anchor = match.start()
    # Clamp at 0: a negative bound would otherwise be interpreted by Python
    # slicing as "count from the end of the string" and compare the wrong text.
    lo = max(0, anchor + i_start)
    hi = max(0, anchor + i_end)
    return text[lo:hi]


def count_diff(filename1, filename2, string1, string2, i_start=0, i_end=0):
    """Count regex matches in one CSV and how many differ from a second CSV.

    Rows of the two files are paired in order (extra rows in the longer file
    are ignored). Within a row pair, the k-th match of ``string1`` in the
    first text is paired with the k-th match of ``string2`` in the second
    text, and the substrings ``[match_start + i_start, match_start + i_end)``
    of each text are compared.

    Args:
        filename1: CSV file (path or file-like) with a ``text`` column.
        filename2: CSV file (path or file-like) with a ``text`` column.
        string1: Regular expression searched in the first file's text.
        string2: Regular expression searched in the second file's text.
        i_start: Offset, relative to each match start, where the compared
            substring begins. May be negative to include preceding context.
        i_end: Offset, relative to each match start, where the compared
            substring ends. With the defaults (0, 0) the compared substrings
            are empty, so the difference count is always 0.

    Returns:
        tuple[int, int]: ``(matches of string1 in file 1, number of paired
        matches whose compared substrings differ)``.
    """
    pattern1 = re.compile(string1)
    pattern2 = re.compile(string2)
    texts1 = pd.read_csv(filename1)['text']
    texts2 = pd.read_csv(filename2)['text']

    n_matches = 0
    n_diffs = 0
    for text1, text2 in zip(texts1, texts2):
        # Empty cells come back from pandas as NaN; treat them as empty text.
        text1 = "" if pd.isna(text1) else text1
        text2 = "" if pd.isna(text2) else text2

        # Materialise once so the same scan serves both the total count and
        # the pairwise comparison.
        matches1 = list(pattern1.finditer(text1))
        n_matches += len(matches1)

        for m, n in zip(matches1, pattern2.finditer(text2)):
            sub1 = _match_window(text1, m, i_start, i_end)
            sub2 = _match_window(text2, n, i_start, i_end)
            if sub1 != sub2:
                n_diffs += 1

    return n_matches, n_diffs

### clip_times

**clip_times** creates a list of (filename, start, end) tuples, one for every row whose text matches the regex pattern; each tuple describes an interval we want to clip.

**Inputs**:
- string&nbsp;&nbsp;&nbsp;*filename* containing the CSV file containing all our data
- string&nbsp;&nbsp;&nbsp;*string* containing the regex pattern

**Outputs**: array of (str, int, int) tuple with filename, start time, and end time

In [None]:
def clip_times(filename, string):
    """List the file name and time span of every row whose text matches.

    Args:
        filename: CSV file (path or file-like) readable by
            ``pandas.read_csv``; it must contain the columns ``InputFile``,
            ``start``, ``end`` and ``text``.
        string: Regular expression pattern searched in each ``text`` value.

    Returns:
        list[tuple]: One ``(InputFile, start, end)`` tuple per matching row,
        in file order. A row is listed once even if it matches several times.
    """
    df = pd.read_csv(filename)
    pattern = re.compile(string)
    rows = zip(df['InputFile'], df['start'], df['end'], df['text'])
    return [
        (name, start, end)
        for name, start, end, text in rows
        # Empty cells are read as NaN, which ``re`` cannot search; skip them.
        if not pd.isna(text) and pattern.search(text) is not None
    ]

### clip

**clip** clips the audio and TextGrid files and exports it to specified folder. 

**Inputs**:
- array of (str, int, int) tuples&nbsp;&nbsp;&nbsp;*times* containing name, start time, and end time
- int&nbsp;&nbsp;&nbsp;*window* containing the amount of padding to add before the start time and after the end time
- string&nbsp;&nbsp;&nbsp;*folder* containing the path of the destination folder

**Outputs**: None

In [17]:
def clip(times, window, folder):
    """Cut padded audio/TextGrid excerpts and write them to *folder*.

    For each ``(name, start, end)`` entry, the recording
    ``data/sound/<name>.flac`` and its annotation
    ``data/label/<name>.TextGrid`` are loaded, cropped to
    ``[start - window, end + window]`` (clamped to the recording), and saved
    as ``<folder>/<start>_<name>.flac`` and ``<folder>/<start>_<name>.TextGrid``.

    Args:
        times: Iterable of ``(name, start, end)`` tuples. ``start`` and
            ``end`` are in seconds (they are scaled by 1000 before being used
            as pydub millisecond slice positions).
        window: Padding, in seconds, added on both sides of each interval.
        folder: Destination directory (string or ``Path``); created,
            including missing parents, if it does not exist.

    Returns:
        None
    """
    out_dir = Path(folder)
    # ``exist_ok`` lets the cell be re-run against an existing output folder.
    out_dir.mkdir(parents=True, exist_ok=True)

    for name, start, end in times:
        filename_flac = "data/sound/" + name + ".flac"
        filename_textgrid = "data/label/" + name + ".TextGrid"

        # Output files are named after the original (unpadded) start time.
        output_stem = str(start) + "_" + name
        output_flac = str(out_dir / (output_stem + ".flac"))
        output_textgrid = str(out_dir / (output_stem + ".TextGrid"))

        tg = textgrid.openTextgrid(filename_textgrid, includeEmptyIntervals=True, duplicateNamesMode='rename')
        audio = AudioSegment.from_file(filename_flac, format="flac")

        # ``len(audio)`` is the duration in milliseconds, whereas the clip
        # bounds are in seconds, so convert before clamping the upper bound.
        clip_start = max(0, start - window)
        clip_end = min(len(audio) / 1000, end + window)

        tg_cut = tg.crop(clip_start, clip_end, mode="truncated", rebaseToZero=True)
        tg_cut.save(output_textgrid, format="short_textgrid", includeBlankSpaces=True)

        # pydub slices by millisecond.
        segment = audio[clip_start * 1000:clip_end * 1000]
        segment.export(output_flac, format="flac")


s0 13922
s0ya 8
s0EE 1757
s0vv 2038
s0oo 700
s0yu 19
s0wi 72
s0yo 15
s0we 1
s0uu 948
s0aa 4370
s0ee 328
s0yv 26
s0xx 620
s0ii 3020


ss 2649
ssya 0
ssEE 51
ssvv 210
ssoo 83
ssyu 0
sswi 1
ssyo 21
sswe 0
ssuu 936
ssaa 529
ssee 74
ssyv 1
ssxx 453
ssii 289


s0 15435
s0ya 7
s0EE 1785
s0vv 2005
s0oo 771
s0yu 12
s0wi 122
s0yo 36
s0we 0
s0uu 1887
s0aa 4624
s0ee 393
s0yv 22
s0xx 571
s0ii 3197


ss 966
ssya 0
ssEE 2
ssvv 147
ssoo 18
ssyu 0
sswi 0
ssyo 0
sswe 0
ssuu 6
ssaa 285
ssee 5
ssyv 0
ssxx 438
ssii 46


s0 565
ss 71


s0 15677
ss 5912


aas0 3491
ees0 2762
iis0 1410
oos0 930
uus0 916
vvs0 1235
xxs0 597
aass 876
eess 323
iiss 2347
ooss 97
uuss 31
vvss 824
xxss 62


t0-s0 (2, 0)
