In [3]:
import pandas as pd
import numpy as np
import os 
from typing import Mapping
import json

In [4]:
dataset_dir = './massive-dataset/1.1/data'
files = os.listdir(dataset_dir)
files

['de-DE.jsonl',
 'am-ET.jsonl',
 'sl-SL.jsonl',
 'ko-KR.jsonl',
 'sq-AL.jsonl',
 'fr-FR.jsonl',
 'is-IS.jsonl',
 'fi-FI.jsonl',
 'it-IT.jsonl',
 'sw-KE.jsonl',
 'fa-IR.jsonl',
 'es-ES.jsonl',
 'lv-LV.jsonl',
 'km-KH.jsonl',
 'ar-SA.jsonl',
 'th-TH.jsonl',
 'vi-VN.jsonl',
 'pl-PL.jsonl',
 'jv-ID.jsonl',
 'ta-IN.jsonl',
 'en-US.jsonl',
 'ru-RU.jsonl',
 'az-AZ.jsonl',
 'af-ZA.jsonl',
 'ms-MY.jsonl',
 'he-IL.jsonl',
 'te-IN.jsonl',
 'kn-IN.jsonl',
 'hu-HU.jsonl',
 'nl-NL.jsonl',
 'pt-PT.jsonl',
 'ml-IN.jsonl',
 'my-MM.jsonl',
 'zh-TW.jsonl',
 'ur-PK.jsonl',
 'nb-NO.jsonl',
 'hi-IN.jsonl',
 'ja-JP.jsonl',
 'ro-RO.jsonl',
 'tr-TR.jsonl',
 'cy-GB.jsonl',
 'mn-MN.jsonl',
 'sv-SE.jsonl',
 'bn-BD.jsonl',
 'ka-GE.jsonl',
 'da-DK.jsonl',
 'el-GR.jsonl',
 'tl-PH.jsonl',
 'ca-ES.jsonl',
 'hy-AM.jsonl',
 'id-ID.jsonl',
 'zh-CN.jsonl']

In [5]:
english_json = pd.read_json(f"{dataset_dir}/en-US.jsonl",lines=True)
english_json

Unnamed: 0,id,locale,partition,scenario,intent,utt,annot_utt,worker_id
0,0,en-US,test,alarm,alarm_set,wake me up at five am this week,wake me up at [time : five am] [date : this week],1
1,1,en-US,train,alarm,alarm_set,wake me up at nine am on friday,wake me up at [time : nine am] on [date : friday],1
2,2,en-US,train,alarm,alarm_set,set an alarm for two hours from now,set an alarm for [time : two hours from now],1
3,3,en-US,test,audio,audio_volume_mute,quiet,quiet,1
4,4,en-US,train,audio,audio_volume_mute,olly quiet,olly quiet,1
...,...,...,...,...,...,...,...,...
16516,17176,en-US,train,email,email_query,do i have emails,do i have emails,12
16517,17177,en-US,train,email,email_query,what emails are new,what emails are new,12
16518,17178,en-US,train,email,email_query,do i have new emails from john,do i have new emails from [person : john],12
16519,17179,en-US,test,email,email_query,has john sent me an email,has [person : john] sent me an email,12


In [6]:
def build_file_path(file_name: str)->str:
    return f"{dataset_dir}/{file_name}"

In [7]:
def language_df(file_name:str) -> pd.DataFrame:
    df = pd.read_json(build_file_path(file_name),lines=True)
    return df


In [8]:
def get_lang(file_name:str)->str:
    return file_name.split('-')[1].split('.')[0]

In [19]:
for file_path in files:
    df = language_df(file_path)
    df = df.merge(english_json, on="id")
    
    df = df.rename(
        {
            "utt_x": "utterance_translation",
            "annot_utt_x": "annotation_utterance_translation",
            "utt_y": "utterance",
            "annot_utt_y": "annot_utt",
            "partition_y": "partition"
        },axis=1
    )
    df = df[["utterance_translation", "annotation_utterance_translation", "utterance", "annot_utt","id","partition"]]
    
    df.to_excel(f'./processed-dataset/en-{get_lang(file_path)}.xlsx')



### Question 2

In [10]:
english_df = language_df('en-US.jsonl')
kiswahili_df = language_df('sw-KE.jsonl')
german_df = language_df('de-DE.jsonl')

In [11]:
def separate_dfs(df:pd.DataFrame,key:str) -> pd.DataFrame:
    return  df[df['partition']==key]


In [12]:
def export_as_jsonl(df:pd.DataFrame,name:str):
    df.to_json(f'./partitioned-datasets/{name}.jsonl',lines=True,orient='records')
 



In [13]:
partitions = ['dev','train','test']
dfs = [english_df,kiswahili_df,german_df]
dfs_names = ['english','kiswahili','german']

for df,name in zip(dfs,dfs_names):
    for partition in partitions:
        export_as_jsonl(separate_dfs(df,partition),f'{name}-{partition}')

In [23]:
processed_dir_path = './processed-dataset'
files_english = os.listdir(processed_dir_path)
print(files_english)

['en-SE.xlsx', 'en-HU.xlsx', 'en-IR.xlsx', 'en-MY.xlsx', 'en-ZA.xlsx', 'en-MN.xlsx', 'en-JP.xlsx', 'en-LV.xlsx', 'en-US.xlsx', 'en-RU.xlsx', 'en-BD.xlsx', 'en-KH.xlsx', 'en-FR.xlsx', 'en-ET.xlsx', 'en-SA.xlsx', 'en-DE.csv', 'en-TH.xlsx', 'en-IL.xlsx', 'en-TW.xlsx', 'en-RO.xlsx', 'en-NL.xlsx', 'en-KR.xlsx', 'en-GB.xlsx', 'en-GR.xlsx', 'en-PL.xlsx', 'en-ID.xlsx', 'en-MM.xlsx', 'DE.xlsx', 'en-AL.xlsx', 'en-IT.xlsx', 'en-ES.xlsx', 'en-VN.xlsx', 'en-KE.xlsx', 'en-AM.xlsx', 'en-SL.xlsx', 'en-NO.xlsx', 'en-IS.xlsx', 'en-FI.xlsx', 'en-CN.xlsx', 'en-IN.xlsx', 'en-GE.xlsx', 'en-DK.xlsx', 'en-AZ.xlsx', 'en-PK.xlsx', 'en-PT.xlsx', 'en-DE.xlsx', 'en-TR.xlsx', 'en-PH.xlsx']


In [None]:
def df_to_json(df:pd.DataFrame) -> Mapping:
    return df.to_dict()

In [25]:
def export_to_json(name:str,data:list[dict]):
    with open(f"{name}.json", "w") as outfile:
        json.dump(data, outfile)

In [30]:

result = []
for file in files_english:
    try:
        file_path = f"{processed_dir_path}/{file}"
        df = pd.read_excel(file_path)
        df = separate_dfs(df,'train')
        df = df[['id','utterance_translation',]]
        result.append(df.to_dict())
    except:
        pass



./processed-dataset/en-SE.xlsx
   id                  utterance_translation
1   1             väck mig vid nio på fredag
2   2  sätt ett alarm för två timmar från nu
4   4                              olly tyst
5   5                                  sluta
6   6              olly pausa i tio sekunder


In [27]:
export_to_json('combined',result)