In [None]:
# [Author]: Jun Yao
# [Date]: 2021-12-03

# [Description] 
# this file prepares the train and test dataset for model 1

# [Instructions] 
# The official transcript files (2351 .stm files) are copied into the ./stm/ subfolder
# next to this script, so this folder is self-contained and can be placed anywhere.

# input of this script:
# ./stm/*
# tedlium3_topic_labels.csv

# output of this script:
# stm_transcript_labels.csv

In [21]:
import os

import pandas as pd

# folder that holds the official .stm transcript files
stm_path = "./stm/"

# load the gold labels and, in the same chain, append an empty TRANSCRIPT
# column that a later cell fills in
df = pd.read_csv("tedlium3_topic_labels.csv").assign(TRANSCRIPT="")

# sanity check: number of labelled talks, then a preview of the frame
print(len(df))
df.head()

2351


Unnamed: 0,ID,FILENAME,URI,TITLE,AUTHOR,UPLOAD_DATE,DURATION,TOPIC1,TOPIC2,TOPIC3,TRANSCRIPT
0,1,911Mothers_2010W.stm,/talks/aicha_el_wafi_phyllis_rodriguez_the_mot...,"The mothers who found forgiveness, friendship",Aicha el-Wafi + Phyllis Rodriguez,2011-05-02,9M54S,culture,global issues,parenting,
1,2,AalaElKhani_2016X.stm,/talks/aala_el_khani_what_it_s_like_to_be_a_pa...,What it's like to be a parent in a war zone,Aala El-Khani,2017-02-10,14M16S,communication,community,family,
2,3,AaronHuey_2010X.stm,/talks/aaron_huey_america_s_native_prisoners_o...,America's native prisoners of war,Aaron Huey,2010-11-10,15M27S,TEDx,culture,history,
3,4,AaronKoblin_2011.stm,/talks/aaron_koblin_visualizing_ourselves_with...,Visualizing ourselves ... with crowd-sourced data,Aaron Koblin,2011-05-23,18M18S,collaboration,data,design,
4,5,AaronOConnell_2011.stm,/talks/aaron_o_connell_making_sense_of_a_visib...,Making sense of a visible quantum object,Aaron O'Connell,2011-06-02,7M51S,philosophy,physics,science,


In [40]:
def read_stm_transcript(file_name, stm_dir=None):
    """Read one .stm file and return its transcript as a single string.

    Every line is expected to contain segment metadata, the ``<NA>`` label
    field, and then the segment text. The text after the first ``<NA>`` is
    cleaned (``<unk>`` tokens, apostrophes and newlines removed, surrounding
    whitespace stripped) and the non-empty segments are joined with ". ".

    Args:
        file_name: name of the .stm file, relative to ``stm_dir``.
        stm_dir: folder containing the .stm files. Defaults to the
            notebook-level ``stm_path``.

    Returns:
        The cleaned transcript, or an empty string when no line carries text.

    Raises:
        OSError: if the file cannot be opened.
    """
    if stm_dir is None:
        stm_dir = stm_path
    full_path = os.path.join(stm_dir, file_name)

    segments = []
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the .stm files are UTF-8/ASCII - confirm.
    with open(full_path, encoding="utf-8") as file:
        for line in file:
            # Lines without the "<NA>" field (e.g. blank lines) carry no
            # transcript text; indexing split(...)[1] on them raised IndexError.
            _, marker, text = line.partition("<NA>")
            if not marker:
                continue
            text = text.replace("<unk>", "").replace("'", "").replace("\n", "").strip()
            # A segment made only of "<unk>" is empty after cleaning; keeping
            # it would leave stray ". . " sequences in the joined transcript.
            if text:
                segments.append(text)
    return ". ".join(segments)

# read all transcripts
# df_len stays a notebook-level name because other cells may reference it.
df_len = len(df)

# FILENAME / TRANSCRIPT are addressed by label: the positional indices (1 and 10)
# would silently read or overwrite the wrong column if the CSV layout changed.
transcripts = []
for i, stm_name in enumerate(df["FILENAME"]):
    transcripts.append(read_stm_transcript(stm_name))
    if i % 100 == 0:
        print(f"========>processed {i} files")

# one column assignment instead of one .iloc write per row
df["TRANSCRIPT"] = transcripts
print("success")

df.head()

success


Unnamed: 0,ID,FILENAME,URI,TITLE,AUTHOR,UPLOAD_DATE,DURATION,TOPIC1,TOPIC2,TOPIC3,TRANSCRIPT
0,1,911Mothers_2010W.stm,/talks/aicha_el_wafi_phyllis_rodriguez_the_mot...,"The mothers who found forgiveness, friendship",Aicha el-Wafi + Phyllis Rodriguez,2011-05-02,9M54S,culture,global issues,parenting,because of. the fact that we have what most pe...
1,2,AalaElKhani_2016X.stm,/talks/aala_el_khani_what_it_s_like_to_be_a_pa...,What it's like to be a parent in a war zone,Aala El-Khani,2017-02-10,14M16S,communication,community,family,over one point five billion people experience ...
2,3,AaronHuey_2010X.stm,/talks/aaron_huey_america_s_native_prisoners_o...,America's native prisoners of war,Aaron Huey,2010-11-10,15M27S,TEDx,culture,history,m here today to show my photographs of the lak...
3,4,AaronKoblin_2011.stm,/talks/aaron_koblin_visualizing_ourselves_with...,Visualizing ourselves ... with crowd-sourced data,Aaron Koblin,2011-05-23,18M18S,collaboration,data,design,data can actually make us more human. we re co...
4,5,AaronOConnell_2011.stm,/talks/aaron_o_connell_making_sense_of_a_visib...,Making sense of a visible quantum object,Aaron O'Connell,2011-06-02,7M51S,philosophy,physics,science,this is a representation of your brain and yo...


In [55]:
# prepare the data for model 1
# Output columns: "titles" (the .stm file name), "summaries" (the transcript)
# and "terms" (the topic labels rendered as a list literal).
column_names = ["titles", "summaries", "terms"]
topic_columns = ["TOPIC1", "TOPIC2", "TOPIC3"]


def _format_terms(topics):
    """Render the non-missing topics as a list literal such as "['a','b']".

    pandas reads empty CSV cells as NaN and str(NaN) is "nan" (length 3), so a
    length check on the stringified value never detects a missing topic and
    'nan' ends up as a label. pd.isna() detects it; blank strings are treated
    as missing as well.
    """
    kept = [str(topic) for topic in topics if not pd.isna(topic) and str(topic).strip()]
    # repr() gives 'topic' for ordinary labels (same text as before) and
    # switches quoting/escaping when a label contains an apostrophe, so the
    # result stays a valid list literal.
    # NOTE(review): presumably parsed downstream with ast.literal_eval - confirm.
    return "[" + ",".join(repr(topic) for topic in kept) + "]"


# Columns are selected by label rather than by position so a change in the CSV
# layout cannot silently shift which fields are used.
df_md_1 = pd.DataFrame(
    {
        "titles": df["FILENAME"],
        "summaries": df["TRANSCRIPT"],
        "terms": [
            _format_terms(row)
            for row in df[topic_columns].itertuples(index=False, name=None)
        ],
    },
    columns=column_names,
)
df_md_1.head()

Unnamed: 0,titles,summaries,terms
0,911Mothers_2010W.stm,because of. the fact that we have what most pe...,"['culture','global issues','parenting']"
1,AalaElKhani_2016X.stm,over one point five billion people experience ...,"['communication','community','family']"
2,AaronHuey_2010X.stm,m here today to show my photographs of the lak...,"['TEDx','culture','history']"
3,AaronKoblin_2011.stm,data can actually make us more human. we re co...,"['collaboration','data','design']"
4,AaronOConnell_2011.stm,this is a representation of your brain and yo...,"['philosophy','physics','science']"


In [56]:
# write the model-1 dataset next to this notebook; the index is not written
df_md_1.to_csv(path_or_buf='stm_transcript_labels.csv', index=False)