### Welcome!
This notebook generates various sound frequency counts from the word frequency data in HKCanCor (Luke & Wong, 2015), which are presented in Li, Alderete, & Badrulhisham (2020). To run the code blocks below, it is recommended that you download the parent folder of this file and run it on [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) (default, recommended) or on your local installation of [Jupyter Notebooks](https://jupyter.org/).

If you're using Colab, go ahead and run the code blocks. If you're using a local installation, please ensure that the required packages are installed on your local system. This notebook can also be downloaded as a .py file and run on your local system.

If you're running this notebook on your local installation, the two `try`/`except` blocks on lines 2-11 of the setup cell below (the `pip` installs and the Google Drive mount) can be deleted.

In [None]:
#package setup
try:
  !pip install pycantonese
  !pip install xlsxwriter
except:
  pass
try:
  from google.colab import drive
  drive.mount('/content/gdrive')
except:
  pass
import pycantonese
import re
from pandas import read_csv
from collections import Counter
import xlsxwriter


Please adjust the location of the output file in line 2 of the cell below (the path passed to `xlsxwriter.Workbook`) accordingly.


In [None]:
#workbook setup: one output workbook with one worksheet per analysis
workbook = xlsxwriter.Workbook("hkcancor_phono.xlsx")
# The worksheets are added in the order listed; each name below is used by a later cell.
(wordsize_sheet, syllshape_sheet, OvRtoken_sheet, OvRtype_sheet,
 CvTall_sheet, CvTdisyll_sheet, phonotactics_sheet) = [
    workbook.add_worksheet(sheet_name)
    for sheet_name in ('wordsize', 'syllshape', 'tokenOvR', 'typeOvR',
                       'CvTall', 'CvTdisyll', 'phonotactics')
]

Please adjust the location of the input file in line 23 of the cell below (the path passed to `read_csv`) accordingly.

In [None]:
#loading word frequency .csv
def sylldiv(word):
    """Split a Jyutping word into syllables, each ending in its tone digit.
    E.g. 'nei5hou2' -> ['nei5', 'hou2']; text after the last digit is dropped."""
    units = re.split(r'(\d)', word)  # raw string; the group keeps the tone digits
    return [''.join(pair) for pair in zip(units[::2], units[1::2])]

def parsesyll(syll):
    """Parse one Jyutping syllable into (onset, rime, coda class, tone); the rime is
    nucleus + coda and the coda class is one of 'none', 'nasal', 'dip', 'stop'."""
    jp = pycantonese.parse_jyutping(syll)[0]  # indexed below as onset, nucleus, coda, tone
    coda_segment = jp[2]
    if coda_segment == '':
        coda = 'none'
    elif coda_segment in ('m', 'n', 'ng'):  # exact match, so e.g. 'mn' or 'nn' is not nasal
        coda = 'nasal'
    elif coda_segment in ('a', 'i', 'e', 'o', 'u'):  # a single vowel letter in coda position
        coda = 'dip'
    else:
        coda = 'stop'
    return (jp[0], ''.join(jp[1:3]), coda, jp[3])

word_data = read_csv("hkcancor_word.csv", names = ['orthos', 'phonos', 'counts', 'probability'], header = 0)
# Each word becomes a list of parsed syllables: (onset, rime, coda class, tone).
phonos = [
    [parsesyll(syllable) for syllable in sylldiv(word)]
    for word in word_data.phonos.tolist()
]
# Frequency count of each word, aligned index-by-index with `phonos`.
tokens = [int(count) for count in word_data.counts.tolist()]

In [None]:
#wordsize: token and type counts per word length (in syllables)
MAX_WORDSIZE = 6  # words with this many syllables or more share the last row

for column, title in enumerate(['syllables', 'tokens', 'types']):
  wordsize_sheet.write(0, column, title)

wordsize_list = [len(i) for i in phonos]
wordsize_tokens = [0] * MAX_WORDSIZE
wordsize_types = [0] * MAX_WORDSIZE

for size_num, token_count in zip(wordsize_list, tokens):
  if size_num < 1:
    # A word with no parsed syllables has no row of its own. Without this guard,
    # index -1 would silently add it to the last (6-or-more) bucket.
    continue
  bucket = min(size_num, MAX_WORDSIZE) - 1
  wordsize_types[bucket] += 1
  wordsize_tokens[bucket] += int(token_count)

# Rows 1..MAX_WORDSIZE: syllable count, token total, type total.
for size_num in range(1, MAX_WORDSIZE + 1):
  wordsize_sheet.write(size_num, 0, size_num)
  wordsize_sheet.write(size_num, 1, wordsize_tokens[size_num-1])
  wordsize_sheet.write(size_num, 2, wordsize_types[size_num-1])

In [None]:
#syllshape
def syll_shape(syll):
    """Return the CV-skeleton label of a parsed syllable.

    Parameters:
        syll: an (onset, rime, coda class, tone) sequence; only the first three
            items are read. The coda class is 'none', 'dip', 'nasal' or 'stop'.

    Returns:
        'C' for a syllabic nasal (rime 'm' or 'ng' with no coda), whatever the
        onset; otherwise 'V', 'VV', 'VN' or 'VS', prefixed with 'C' when the
        onset is non-empty. Returns None for an unrecognised coda class.
    """
    onset, rime, coda = syll[0], syll[1], syll[2]
    prefix = 'C' if onset != '' else ''
    if coda == 'none':
        # Exact match: a substring test against 'mng' would also accept 'n', 'g' and ''.
        if rime in ('m', 'ng'):
            return 'C'
        return prefix + 'V'
    suffix = {'dip': 'VV', 'nasal': 'VN', 'stop': 'VS'}.get(coda)
    if suffix is None:
        return None
    return prefix + suffix

# Header row of the syllable-shape sheet.
for column, title in enumerate(('syllable shape', 'token count', 'type count')):
  syllshape_sheet.write(0, column, title)

syllshape_list = ['C', 'V', 'VV', 'VN', 'VS', 'CV', 'CVV', 'CVN', 'CVS']
syllshape_tokens = [0] * len(syllshape_list)
syllshape_types = [0] * len(syllshape_list)

# Tally every syllable of every word: once as a type, and weighted by the
# word's frequency count as tokens.
for word_index, word in enumerate(phonos):
  for syllable in word:
    slot = syllshape_list.index(syll_shape(syllable))
    syllshape_tokens[slot] += tokens[word_index]
    syllshape_types[slot] += 1

# One row per shape, in the order of syllshape_list.
shape_rows = zip(syllshape_list, syllshape_tokens, syllshape_types)
for row, (shape, token_total, type_total) in enumerate(shape_rows, start=1):
  syllshape_sheet.write(row, 0, shape)
  syllshape_sheet.write(row, 1, token_total)
  syllshape_sheet.write(row, 2, type_total)

In [None]:
#setting up types and token list for sub-syllabic analysis
# *_types hold each item once per word; *_tokens repeat it by the word's count.
subsyll_tokens, subsyll_types = [], []
disyll_tokens, disyll_types = [], []
for word_index, word_check in enumerate(phonos):
  repeats = tokens[word_index]
  # Two-syllable words are additionally kept whole for the disyllabic tables.
  if len(word_check) == 2:
    disyll_types.append(word_check)
    disyll_tokens.extend([word_check] * repeats)
  for syllable in word_check:
    subsyll_types.append(syllable)
    subsyll_tokens.extend([syllable] * repeats)

In [None]:
#onset versus rime
# Row labels of the onset-by-rime tables written below; '' is the empty (no) onset.
onset_list = ['', 'b', 'p', 'd', 't', 'g', 'k', 'gw', 'kw', 'f', 's', 'h', 'z', 'c', 'm', 'n', 'ng', 'w', 'l', 'j']
# Column labels of the onset-by-rime tables; rimes absent from this list get no column.
rime_list = ['i', 'e', 'yu', 'oe', 'u', 'o', 'aa', 'ei', 'eoi', 'ui', 'oi', 'ai', 'aai', 'iu', 'eu', 'ou', 'au', 'aau', 'im', 'ip', 'em', 'ep', 'am', 'ap', 'aam', 'aap', 'in', 'it', 'en', 'et', 'yun', 'yut', 'eon', 'eot', 'an', 'at', 'aan', 'aat', 'on', 'ot', 'un', 'ut', 'ing', 'ik', 'eng', 'ek', 'oeng', 'oek', 'ang', 'ak', 'aang', 'aak', 'ong', 'ok', 'ung', 'uk', 'm', 'ng']
def OvR(onset, big_list):
    """Count the rimes that occur with a given onset.

    Each item of big_list is indexed as [0] = onset and [1] = rime. The result
    is a Counter mapping rime -> number of items whose onset equals `onset`.
    """
    return Counter(entry[1] for entry in big_list if entry[0] == onset)

def _write_OvR_table(sheet, syllables):
    """Write an onset-by-rime frequency table to `sheet`.

    Row 0 holds the rimes of rime_list (from column 1), column 0 holds the
    onsets of onset_list (from row 1), and each inner cell holds how many
    items of `syllables` have that onset and that rime (0 when none do).
    """
    for column, rime in enumerate(rime_list, start=1):
        sheet.write(0, column, rime)
    for row, onset in enumerate(onset_list, start=1):
        sheet.write(row, 0, onset)
        rime_counts = OvR(onset, syllables)
        for column, rime in enumerate(rime_list, start=1):
            sheet.write(row, column, rime_counts[rime])

#token onset vs. rime
_write_OvR_table(OvRtoken_sheet, subsyll_tokens)

#type onset vs. rime
_write_OvR_table(OvRtype_sheet, subsyll_types)

In [None]:
#coda versus tone
coda_list = ['none', 'nasal', 'stop']
def CvT(coda, big_list):
    """Count tones among the syllables that have a given coda class.

    Each item of big_list is indexed as [2] = coda class and [3] = tone.
    Asking for 'none' also includes items whose coda class is 'dip'; any
    other value matches that coda class only. Returns a Counter keyed by tone.
    """
    wanted = ('none', 'dip') if coda == 'none' else (coda,)
    return Counter(entry[3] for entry in big_list if entry[2] in wanted)

#all coda versus tone
tone_numbers = range(1, 7)
# Header row: the tone numbers 1-6 in columns 1-6.
for tone in tone_numbers:
    CvTall_sheet.write(0, tone, tone)
CvTall_sheet.write(1, 0, 'token frequency')
CvTall_sheet.write(6, 0, 'type frequency')
# Token rows start at row 2; the matching type rows sit five rows lower.
for token_row, coda in enumerate(coda_list, start=2):
    type_row = token_row + 5
    CvTall_sheet.write(token_row, 0, coda)
    CvTall_sheet.write(type_row, 0, coda)
    CvTtoken = CvT(coda, subsyll_tokens)
    CvTtype = CvT(coda, subsyll_types)
    for tone in tone_numbers:
        # The counters are looked up with the tone as a string key.
        CvTall_sheet.write(token_row, tone, CvTtoken[str(tone)])
        CvTall_sheet.write(type_row, tone, CvTtype[str(tone)])

#disyll coda versus tone
# Header row: tones 1-6 for the first syllable (columns 1-6) and again for the
# second syllable (columns 8-13).
for tone in range(1,7):
    CvTdisyll_sheet.write(0, tone, tone)
    CvTdisyll_sheet.write(0, tone+7, tone)
CvTdisyll_sheet.write(1, 0, 'token frequency')
CvTdisyll_sheet.write(6, 0, 'type frequency')

# Split the two-syllable words into first and second syllables once, up front,
# rather than once per coda class.
first_syll_tokens = [word[0] for word in disyll_tokens]
second_syll_tokens = [word[1] for word in disyll_tokens]
first_syll_types = [word[0] for word in disyll_types]
second_syll_types = [word[1] for word in disyll_types]

for row, coda in enumerate(coda_list, start=2):
    # Row labels belong on this sheet (they were previously written to CvTall_sheet).
    CvTdisyll_sheet.write(row, 0, coda)
    CvTdisyll_sheet.write(row+5, 0, coda)
    CvT1token = CvT(coda, first_syll_tokens)
    CvT2token = CvT(coda, second_syll_tokens)
    CvT1type = CvT(coda, first_syll_types)
    CvT2type = CvT(coda, second_syll_types)
    for tone in range(1,7):
        CvTdisyll_sheet.write(row, tone, CvT1token[str(tone)])
        CvTdisyll_sheet.write(row, tone+7, CvT2token[str(tone)])
        CvTdisyll_sheet.write(row+5, tone, CvT1type[str(tone)])
        CvTdisyll_sheet.write(row+5, tone+7, CvT2type[str(tone)])

In [None]:
#phonotactic violations
# Header row: one column each for the category label, token count and type count.
for column, title in enumerate(('category', 'tokens', 'types')):
    phonotactics_sheet.write(0, column, title)

#lab_lab
phonotactics_sheet.write(1, 0, '*lab...lab')
cat1_search = ['b', 'p', 'm', 'f', 'gw', 'kw']

def _is_lab_lab(syll):
    """True when the onset is in cat1_search and the rime contains any cat1_search item."""
    # any() counts a syllable at most once, however many search items its rime contains.
    return syll[0] in cat1_search and any(lab in syll[1] for lab in cat1_search)

cat1_type_count = sum(1 for syll in subsyll_types if _is_lab_lab(syll))
phonotactics_sheet.write(1, 2, cat1_type_count)

# The token list is only scanned when at least one type matched.
cat1_token_count = 0
if cat1_type_count != 0:
    cat1_token_count = sum(1 for syll in subsyll_tokens if _is_lab_lab(syll))
phonotactics_sheet.write(1, 1, cat1_token_count)

#round_lab
phonotactics_sheet.write(2, 0, '*round...lab')
cat2_search = ['up', 'um', 'op', 'om', 'yp', 'ym', 'yup', 'yum']
# Types: syllables whose whole rime is one of the listed rimes.
cat2_type_count = sum(1 for syll in subsyll_types if syll[1] in cat2_search)
phonotactics_sheet.write(2, 2, cat2_type_count)

# The token list is only scanned when at least one type matched.
cat2_token_count = 0
if cat2_type_count != 0:
    cat2_token_count = sum(1 for syll in subsyll_tokens if syll[1] in cat2_search)
phonotactics_sheet.write(2, 1, cat2_token_count)

#lab_backround
phonotactics_sheet.write(3, 0, '*lab...backround')
cat3_search = ['y', 'eo', 'oe']

def _is_lab_backround(syll):
    """True when the onset is in cat1_search, the rime starts with a cat3_search
    item (its first one or two letters), and the coda class is not 'dip'."""
    return (
        syll[0] in cat1_search
        and (syll[1][0] in cat3_search or syll[1][:2] in cat3_search)
        and syll[2] != 'dip'
    )

cat3_type_count = sum(1 for syll in subsyll_types if _is_lab_backround(syll))
phonotactics_sheet.write(3, 2, cat3_type_count)

# The token list is only scanned when at least one type matched.
cat3_token_count = 0
if cat3_type_count != 0:
    cat3_token_count = sum(1 for syll in subsyll_tokens if _is_lab_backround(syll))
phonotactics_sheet.write(3, 1, cat3_token_count)

#cor_backround_cor
phonotactics_sheet.write(4, 0, '*cor...backround...cor')
cat4_search = ['d', 't', 's', 'n', 'l']
cat4_search_vowel = ['o', 'u']

def _is_cor_backround_cor(syll):
    """True when the rime is one cat4_search_vowel letter followed by a
    cat4_search item, and the onset is also in cat4_search."""
    rime = syll[1]
    return (
        rime[1:] in cat4_search
        and rime[0] in cat4_search_vowel
        and syll[0] in cat4_search
    )

cat4_type_count = sum(1 for syll in subsyll_types if _is_cor_backround_cor(syll))
phonotactics_sheet.write(4, 2, cat4_type_count)

# The token list is only scanned when at least one type matched.
cat4_token_count = 0
if cat4_type_count != 0:
    cat4_token_count = sum(1 for syll in subsyll_tokens if _is_cor_backround_cor(syll))
phonotactics_sheet.write(4, 1, cat4_token_count)

#e_lab/cor
phonotactics_sheet.write(5, 0, '*e...lab/cor')
cat5_search = ['em', 'en', 'ep', 'et']
# Types: syllables whose whole rime is one of the listed rimes.
cat5_type_count = sum(1 for syll in subsyll_types if syll[1] in cat5_search)
phonotactics_sheet.write(5, 2, cat5_type_count)

# The token list is only scanned when at least one type matched.
cat5_token_count = 0
if cat5_type_count != 0:
    cat5_token_count = sum(1 for syll in subsyll_tokens if syll[1] in cat5_search)
phonotactics_sheet.write(5, 1, cat5_token_count)

#cor_u
phonotactics_sheet.write(6, 0, '*cor...u')

def _is_cor_u(syll):
    """True when the rime starts with 'u' but is neither 'ung' nor 'uk',
    and the onset is in cat4_search."""
    rime = syll[1]
    return rime[0] == 'u' and rime not in ['ung', 'uk'] and syll[0] in cat4_search

cat6_type_count = sum(1 for syll in subsyll_types if _is_cor_u(syll))
phonotactics_sheet.write(6, 2, cat6_type_count)

# The token list is only scanned when at least one type matched.
cat6_token_count = 0
if cat6_type_count != 0:
    cat6_token_count = sum(1 for syll in subsyll_tokens if _is_cor_u(syll))
phonotactics_sheet.write(6, 1, cat6_token_count)

#high_dorsal
phonotactics_sheet.write(7, 0, '*high...dorsal')
cat7_search = ['yuk', 'yung']
# Types: syllables whose whole rime is one of the listed rimes.
cat7_type_count = sum(1 for syll in subsyll_types if syll[1] in cat7_search)
phonotactics_sheet.write(7, 2, cat7_type_count)

# The token list is only scanned when at least one type matched.
cat7_token_count = 0
if cat7_type_count != 0:
    cat7_token_count = sum(1 for syll in subsyll_tokens if syll[1] in cat7_search)
phonotactics_sheet.write(7, 1, cat7_token_count)

In [None]:
# Close the workbook once every sheet has been filled in; presumably this is the
# step that saves hkcancor_phono.xlsx to disk (confirm against the XlsxWriter docs).
workbook.close()