In [76]:
# [Author]: Jun Yao
# [Date]: 2021-12-03

# [Description] 
# this file transformed decoded text(transcript) from model 2, it will be used as test data in model 1
# as a comparison to text created from the official transcript.

# [Instructions] 
# decoded text is stored in the log files of tri3 model from kaldi.
# so log files from model 2 is needed to do this transformation.
# the author copied the log files into the same folder where this script sits

# input of this script:
# ./log/decode.*.log
# tedlium3_topic_labels.csv

# output of this script:
# test_text_from_model2.csv

In [77]:
import pandas as pd

#log files come from:
#kaldi-trunk/egs/tedlium/s5_r3/exp/tri3/decode_test/log/decode.*.log
#they will be copied to ./log/ after kaldi training and decoding process

tri3_decode_path = "./log/" 

# read gold labels file
df = pd.read_csv("tedlium3_topic_labels.csv")

# add a new column to the dataframe
df = df.assign(TRANSCRIPT="")

print(len(df))
df.head()

2351


Unnamed: 0,ID,FILENAME,URI,TITLE,AUTHOR,UPLOAD_DATE,DURATION,TOPIC1,TOPIC2,TOPIC3,TRANSCRIPT
0,1,911Mothers_2010W.stm,/talks/aicha_el_wafi_phyllis_rodriguez_the_mot...,"The mothers who found forgiveness, friendship",Aicha el-Wafi + Phyllis Rodriguez,2011-05-02,9M54S,culture,global issues,parenting,
1,2,AalaElKhani_2016X.stm,/talks/aala_el_khani_what_it_s_like_to_be_a_pa...,What it's like to be a parent in a war zone,Aala El-Khani,2017-02-10,14M16S,communication,community,family,
2,3,AaronHuey_2010X.stm,/talks/aaron_huey_america_s_native_prisoners_o...,America's native prisoners of war,Aaron Huey,2010-11-10,15M27S,TEDx,culture,history,
3,4,AaronKoblin_2011.stm,/talks/aaron_koblin_visualizing_ourselves_with...,Visualizing ourselves ... with crowd-sourced data,Aaron Koblin,2011-05-23,18M18S,collaboration,data,design,
4,5,AaronOConnell_2011.stm,/talks/aaron_o_connell_making_sense_of_a_visib...,Making sense of a visible quantum object,Aaron O'Connell,2011-06-02,7M51S,philosophy,physics,science,


In [78]:
import re
import os

def read_decoded_text_from_kaldi(file_name):
    #print(file_name)
    full_path = tri3_decode_path + file_name
    #print(f"file path: {full_path}")
    trans_name = None
    trans = []
    with open(full_path) as file:
        for line in file:
            if line.startswith("LOG"):
                continue
            m = re.findall(r"\d{7}-\d{7}", line)
            if m:
                #print(line)
                #print(f"find segement: {m[0]}")
                splitstrs = line.split("-"+m[0])
                trans_name = splitstrs[0]
                trans.append(splitstrs[1].replace("<unk>","").replace("'","").replace("\n","").strip())
    return [trans_name,". ".join(trans)]

list_of_files = []

for root, dirs, files in os.walk(tri3_decode_path):
    for file in files:
        if file.startswith("decode."):
            list_of_files.append(file)
print(f"====>find {len(list_of_files)} log files:")
for name in list_of_files:
    print(name)

df_len = len(df)

# read all decoded texts
for log_file_name in list_of_files:
    print(f"====>finding decoded segements in {log_file_name}....")
    with open(tri3_decode_path+log_file_name) as file:
        tmp = read_decoded_text_from_kaldi(log_file_name)
        print(f"====>decoded text from {log_file_name}:")
        #print(tmp)
        if tmp[0] is None:
            continue
            
        # add transcript to df
        fname = tmp[0]+".stm"
        print(f"stm file name is {fname}")
        year = tmp[0].split("_")[1]
        if len(year)>4:
            year = year[0:4]
        file_name_to_match = tmp[0].split("_")[0]+"_"+str(year)
        print(file_name_to_match)
        for i in range(df_len):
            #print(df.iloc[i,1])
            if df.iloc[i,1].find(tmp[0].split("_")[0])>-1:
                print("found")
                df.iloc[i,10] = df.iloc[i,10] + ". " + tmp[1]
                break
        #break
    
print("success")

filtered_df = df[df['TRANSCRIPT'] != ""]
print(f"=====>found {len(filtered_df)} files, final dataframe is:")
print(filtered_df)


====>find 38 log files:
decode.2.log
decode.3.log
decode.1.log
decode.4.log
decode.5.log
decode.7.log
decode.6.log
decode.25.log
decode.31.log
decode.19.log
decode.18.log
decode.30.log
decode.24.log
decode.32.log
decode.26.log
decode.27.log
decode.33.log
decode.37.log
decode.23.log
decode.22.log
decode.36.log
decode.20.log
decode.34.log
decode.35.log
decode.21.log
decode.10.log
decode.38.log
decode.11.log
decode.13.log
decode.12.log
decode.16.log
decode.17.log
decode.29.log
decode.15.log
decode.14.log
decode.28.log
decode.8.log
decode.9.log
====>finding decoded segements in decode.2.log....
====>decoded text from decode.2.log:
stm file name is AimeeMullins_2009P.stm
AimeeMullins_2009
found
====>finding decoded segements in decode.3.log....
====>decoded text from decode.3.log:
stm file name is AimeeMullins_2009P.stm
AimeeMullins_2009
found
====>finding decoded segements in decode.1.log....
====>decoded text from decode.1.log:
stm file name is AimeeMullins_2009P.stm
AimeeMullins_2009
fou

In [85]:
# prepare the data for comparison in model 1
column_names = ["titles", "summaries", "terms"]
df_md_1 = pd.DataFrame(columns = column_names,index = list(range(len(filtered_df))))
for i in range(len(filtered_df)):
    df_md_1.iloc[i,0] = filtered_df.iloc[i,1]
    df_md_1.iloc[i,1] = filtered_df.iloc[i,10]
    if len(str(df.iloc[i,8]))<1:
        df_md_1.iloc[i,2] = f"['{filtered_df.iloc[i,7]}']"
    elif len(str(df.iloc[i,9]))<1:
        df_md_1.iloc[i,2] = f"['{filtered_df.iloc[i,7]}','{filtered_df.iloc[i,8]}']"
    else:
        df_md_1.iloc[i,2] = f"['{filtered_df.iloc[i,7]}','{filtered_df.iloc[i,8]}','{filtered_df.iloc[i,9]}']"
print(df_md_1[0:])

                    titles                                          summaries  \
0    AimeeMullins_1998.stm  . but in the previous instances of those the s...   
1       BillGates_2009.stm  . the temperature will continue to rise and so...   
2      DanBarber_2008P.stm  . sid what percentage. of your feed his chicke...   
3  JaneMcGonigal_2012G.stm  . recently scientists have suggested that her ...   
4    RobertGupta_2012P.stm  . and i played the first movement of the beeth...   
5       TomWujec_2009G.stm  . several years ago here at ted peter skillman...   

                                        terms  
0         ['beauty','body language','design']  
1           ['business','education','health']  
2    ['entertainment','food','global issues']  
3  ['body language','entertainment','gaming']  
4       ['TED Fellows','activism','medicine']  
5        ['astronomy','history','innovation']  


In [86]:
# save dataframe to csv file
df_md_1.to_csv('test_text_from_model2.csv',index=False)