# Processing of the raw data

## Basic settings

In [7]:
import pandas as pd

# Directory layout: inputs are read from ORIG_DIR, derived files are written to PROC_DIR.
ORIG_DIR = "../data/00_original"
PROC_DIR = "../data/01_process"

# Tab-separated input tables.
DATA_FILE = ORIG_DIR + "/data_table.tsv"
PT_PHON_FILE = ORIG_DIR + "/pt_words_phon.tsv"

# Tab-separated output holding the assembled dataframe.
DATAFRAME = PROC_DIR + "/dataframe.tsv"

## Load files and build dataframe 

In [9]:
# Number of leading rows used to build the word numbering below.
# NOTE(review): presumably the first N_WORDS rows list every stimulus word once
# (one full pass by the first student) — confirm against data_table.tsv.
N_WORDS = 50

# Column names are supplied here, so the first line of the file is read as data.
data = pd.read_csv(
    DATA_FILE,
    sep="\t",
    names=["Student", "Orth", "Hankul", "Yale", "KO"],
)

# Lookup from the orthographic form (Orth) to its PT transcription.
pt_phon = pd.read_csv(
    PT_PHON_FILE,
    sep="\t",
    names=["Orth", "PT"],
    index_col="Orth",
)["PT"].to_dict()

# Element-wise lookup on the single column instead of a row-wise apply.
# Indexing the dict (rather than passing it to `map`) keeps the KeyError for a
# spelling that is missing from PT_PHON_FILE instead of silently yielding NaN.
data["PT"] = data["Orth"].map(lambda orth: pt_phon[orth])

# Number each transcription by its position within the first N_WORDS rows;
# a transcription absent from those rows raises KeyError in the lookup.
word_number = {pt: position for position, pt in enumerate(data["PT"].iloc[:N_WORDS])}
data["Word"] = data["PT"].map(lambda pt: word_number[pt])

# Final column order.
data = data[["Student", "Word", "Orth", "Hankul", "Yale", "PT", "KO"]]

In [10]:
# Show the assembled table; the rendered output elides the middle rows.
data

Unnamed: 0,Student,Word,Orth,Hankul,Yale,PT,KO
0,1,0,pais,바이스,pa.i.su,ˈpajs,pa.i.sɯ
1,1,1,país,바이쓰,pa.i.ssu,pa.ˈis,pa.i.s͈ɯ
2,1,2,vela,으벨라,u.peyl.la,ˈvɛ.lɐ,ɯ.pɛl.la
3,1,3,bela,벨라,peyl.la,ˈbɛ.lɐ,pɛl.la
4,1,4,ele,이블리,i.pul.li,ˈe.lɪ,i.pɯl.li
...,...,...,...,...,...,...,...
901,17,45,talento,달륀뚜,tal.lwin.ttwu,ta.ˈlẽ.tʊ,tal.lʏn.t͈u
902,17,46,daquilo,다끼일루,ta.kki.il.lwu,da.ˈki.lʊ,ta.k͈i.il.lu
903,17,47,naquilo,나끼일루,na.kki.il.lwu,na.ˈki.lʊ,na.k͈i.il.lu
904,17,48,carroça,가허싸,ka.he.ssa,ka.ˈhɔ.sɐ,ka.hʌ.s͈a


## Save dataframe

In [6]:
# Persist the processed table as TSV. `index` is left at its default, so the row
# index is written as a leading unnamed column.
data.to_csv(DATAFRAME, sep='\t')

In [14]:
# Column-oriented dict of the two transcription columns: {"PT": {row: value}, "KO": {row: value}}.
# NOTE(review): `d` is not referenced by any cell visible here — confirm it is still needed.
d = data[['PT', 'KO']].to_dict()

def contain(series, what):
    """Return a boolean Series marking the elements of `series` that contain `what`.

    Membership is tested with Python's `in` operator on every element, so the
    result keeps the index of `series`.
    """
    def _holds(element):
        return what in element

    return series.map(_holds)

# Rows whose PT transcription contains ʊ / ɪ respectively
# (presumably the realisations of unstressed o and e — confirm).
unstressed_o = data.loc[contain(data["PT"], "ʊ")]
unstressed_e = data.loc[contain(data["PT"], "ɪ")]

In [15]:
# Compare the PT transcription with the KO response for the rows containing ʊ.
unstressed_o[["PT", "KO"]]

Unnamed: 0,PT,KO
11,ˈze.lʊ,tɕil.lu
12,ˈʒe.lʊ,tɕɛl.lu
19,ˈbo.lʊ,pol.lo
21,ˈli.ki.dʊ,li.k͈i.pu
22,li.ˈki.dʊ,li.k͈i.pʰu
...,...,...
886,ˈsej.ʊ,sɛ.i.u
898,pa.ˈka.tʊ,pat.k͈a.t͈u
901,ta.ˈlẽ.tʊ,tal.lʏn.t͈u
902,da.ˈki.lʊ,ta.k͈i.il.lu


In [16]:
# One row per PT transcription and one column per student; several KO values
# falling into the same cell are joined with ", ".
table_pt_ko = data.pivot_table(
    values=["KO"],
    index="PT",
    columns="Student",
    aggfunc=lambda answers: ", ".join(answers),
)

In [17]:
# Export the pivoted PT-by-student table as a spreadsheet under PROC_DIR.
table_pt_ko.to_excel(f"{PROC_DIR}/table_pt_ko.xlsx")