In [2]:
# Import the required module for text to speech synthesis
from gtts import gTTS
import numpy as np
import pandas as pd
import random
import os

In [3]:
# Complete token inventory, grouped by the role each token plays
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Spoken word for each entry of LETTERS (looked up with the same index)
PHONETIC_LETTERS = [
    "Alfa", "Bravo", "Charlie", "Delta", "Echo", "Foxtrot", "Golf",
    "Hotel", "India", "Juliett", "Kilo", "Lima", "Mike", "November",
    "Oscar", "Papa", "Quebec", "Romeo", "Sierra", "Tango", "Uniform",
    "Victor", "Whiskey", "Xray", "Yankee", "Zulu",
]

# Spoken word for each digit; the list index is the digit value
PHONETIC_NUMBERS = [
    "Zero", "One", "Two", "Three", "Four",
    "Fife", "Six", "Seven", "Eight", "Niner",
]

# Spoken airline names and the codes used in labels (paired by index)
DESIGNATOR = ["Ryanair", "Speedbird", "Astraeus"]
CALLSIGN = ["RYA", "BAW", "AEU"]

# High-level instruction categories
ACTION_TYPE = [
    "Flight Level",
    "Heading",
    "Speed",
    "Incomm",
    "Outcomm",
    "Route",
]

# Optional verbs placed in front of a flight level
ACTION_CLIMB = ["climb", "descend"]

# Sub-categories for heading, speed and route instructions
SUB_ACTION_HEADING = ["Absolute", "Relative", "Continue"]
SUB_ACTION_SPEED = ["Knots", "Mach"]
SUB_ACTION_ROUTE = ["On Route", "Resume"]

# Sectors an aircraft can be handed over to
NEIGHBOUR_SECTORS = ["London"]

# Remaining filler words
MISC_VOCAB = ["hundred", "degrees", "left", "right", "roger"]

# Flatten every group except LETTERS and CALLSIGN into one list
vocab = [
    *PHONETIC_LETTERS,
    *PHONETIC_NUMBERS,
    *DESIGNATOR,
    *ACTION_TYPE,
    *ACTION_CLIMB,
    *SUB_ACTION_HEADING,
    *SUB_ACTION_SPEED,
    *SUB_ACTION_ROUTE,
    *NEIGHBOUR_SECTORS,
    *MISC_VOCAB,
]
print(len(vocab))

60


In [4]:
# Drop five category labels from the vocabulary, then report its new size
vocab = [
    word
    for word in vocab
    if word not in ["Incomm", "Outcomm", "On Route", "Absolute", "Relative"]
]
print(len(vocab))

55


In [5]:
# Configuration values used by the generators below

# Domain suffixes passed to gTTS as `tld`; one is drawn at random per clip
ACCENT = ["com.au", "co.uk", "com", "ca", "co.in", "ie", "co.za"]

# Number of letter/digit characters appended after the airline code
IDENTIFIER_LENGTH = 3

# Flight levels: from MIN_FL up to (not including) MAX_FL in steps of 10
MIN_FL = 10
MAX_FL = 600
FL = np.arange(MIN_FL, MAX_FL, 10)

# Heading angles: absolute 5..355 and relative 5..35, both in steps of 5
ANGLE_ABS = np.arange(5, 360, 5)
ANGLE_REL = np.arange(5, 40, 5)

# Speeds: Mach values 30..85 (step 5) and knots 150..290 (step 10)
SPEED_MACH = np.arange(30, 90, 5)
SPEED_KNTS = np.arange(150, 300, 10)

# Waypoint names are taken from the "fix" column of the sector file
df_fixes = pd.read_csv('data_sector/s25_waypoints.csv')
s25_fixes = df_fixes["fix"].tolist()

In [6]:
# Helper functions 

def gen_call_sign():
    """Build a random aircraft call sign.

    Returns:
        A ``(spoken, label)`` tuple. ``spoken`` is an airline designator
        followed by IDENTIFIER_LENGTH phonetic words, separated by single
        spaces. ``label`` is the matching airline code followed by one letter
        or digit per phonetic word.
    """
    # DESIGNATOR and CALLSIGN are indexed together, so one draw picks both
    airline = np.random.randint(0, len(DESIGNATOR))
    words = [DESIGNATOR[airline]]
    chars = [CALLSIGN[airline]]

    for _ in range(IDENTIFIER_LENGTH):
        if random.random() < 0.5:
            # Letter: phonetic word is spoken, the plain letter is written
            pick = np.random.randint(0, len(PHONETIC_LETTERS))
            words.append(PHONETIC_LETTERS[pick])
            chars.append(LETTERS[pick])
        else:
            # Digit: the list index doubles as the digit itself
            pick = np.random.randint(0, len(PHONETIC_NUMBERS))
            words.append(PHONETIC_NUMBERS[pick])
            chars.append(str(pick))

    return " ".join(words), "".join(chars)

# Build a flight-level instruction from a randomly drawn entry of FL
def gen_flight_level():
    """Create a random flight-level instruction.

    Returns:
        A ``(spoken, label)`` tuple. ``spoken`` starts with a space, optionally
        followed by a verb from ACTION_CLIMB, then ACTION_TYPE[0] and the level
        in words. ``label`` is ``"_FL_"`` plus the numeric level.
    """
    phrase = [ACTION_TYPE[0]]

    # Half of the time a climb/descend verb is placed in front
    if random.random() < 0.5:
        phrase.insert(0, random.choice(ACTION_CLIMB))

    level = random.choice(FL)

    if level % 100 == 0:
        # Whole hundreds are read as "<digit> hundred"
        phrase.extend([PHONETIC_NUMBERS[int(level / 100)], "hundred"])
    else:
        # Everything else is read digit by digit
        phrase.extend(PHONETIC_NUMBERS[int(digit)] for digit in str(level))

    return " " + " ".join(phrase), "_FL_" + str(level)

# Generate a random heading instruction (absolute, relative or continue)
def gen_heading():
    """Create a random heading instruction.

    A sub-action is drawn from SUB_ACTION_HEADING:

    * ``"Absolute"``: " fly heading <digits> degrees", label ``_Abs_<angle>``.
      The angle is zero-padded to three digits in both the spoken phrase and
      the label (5 becomes "Zero Zero Fife" and ``_Abs_005``).
    * ``"Relative"``: " turn left|right <digits> degrees", label
      ``_Rel_<direction>_<angle>``.
    * ``"Continue"``: " continue present heading", label ``_Cont``.

    Returns:
        A ``(spoken, label)`` tuple; both are empty strings if the drawn
        sub-action is none of the three above.
    """
    subaction_type = random.choice(SUB_ACTION_HEADING)

    if subaction_type == "Absolute":
        # str.rjust returns a new string; the padded value has to be kept,
        # otherwise headings below 100 lose their leading zeros.
        heading_angle = str(random.choice(ANGLE_ABS)).rjust(3, "0")
        digits = "".join(PHONETIC_NUMBERS[int(d)] + " " for d in heading_angle)
        return " fly heading " + digits + "degrees", "_Abs_" + heading_angle

    if subaction_type == "Relative":
        # Draw the angle first, then the direction
        heading_angle = str(random.choice(ANGLE_REL))
        direction = random.choice(["left", "right"])
        digits = "".join(PHONETIC_NUMBERS[int(d)] + " " for d in heading_angle)
        spoken = " turn " + direction + " " + digits + "degrees"
        return spoken, "_Rel_" + direction + "_" + heading_angle

    if subaction_type == "Continue":
        return " continue present heading", "_Cont"

    # Unrecognised sub-action: nothing is added to either string
    return "", ""

def gen_speed():
    """Create a random speed instruction.

    A sub-action is drawn from SUB_ACTION_SPEED:

    * ``"Mach"``: " Mach decimal <digits>", label ``_Mach_<value>`` with the
      value taken from SPEED_MACH.
    * ``"Knots"``: one of three lead-in phrases, the digits and "knots",
      label ``_Knots_<value>`` with the value taken from SPEED_KNTS.

    Returns:
        A ``(spoken, label)`` tuple; both are empty strings if the drawn
        sub-action is neither of the two above.
    """
    # Draw the kind of speed instruction
    subaction_type = random.choice(SUB_ACTION_SPEED)

    if subaction_type == "Mach":
        speed = str(random.choice(SPEED_MACH))
        digits = " ".join(PHONETIC_NUMBERS[int(d)] for d in speed)
        return " Mach decimal " + digits, "_Mach_" + speed

    if subaction_type == "Knots":
        # The lead-in phrase is drawn before the speed value
        lead_in = random.choice([" speed ", " fly speed ", " make your speed "])
        speed = str(random.choice(SPEED_KNTS))
        digits = " ".join(PHONETIC_NUMBERS[int(d)] for d in speed)
        return lead_in + digits + " knots", "_Knots_" + speed

    return "", ""

# Incoming communications
def gen_incomms():
    """Return the fixed ``(spoken, label)`` pair for an incoming call."""
    return " roger", "_InComms"

# Outgoing communications
def gen_outcomms():
    """Create a hand-over instruction to a random neighbouring sector.

    Returns:
        A ``(spoken, label)`` tuple: " contact <sector>" and
        ``_OutComms_<sector>``, with the sector drawn from NEIGHBOUR_SECTORS.
    """
    sector = random.choice(NEIGHBOUR_SECTORS)
    return " contact " + sector, "_OutComms_" + sector

# Navigation
def gen_route():
    """Create a random routing instruction towards a waypoint.

    A sub-action is drawn from SUB_ACTION_ROUTE and a destination from
    s25_fixes. The spoken phrase is a lead-in ("route ..." or
    "resume own navigation ...") followed by the destination spelled out with
    PHONETIC_LETTERS; the label is ``_Route_<destination>``.

    Returns:
        A ``(spoken, label)`` tuple.

    Raises:
        ValueError: if the destination contains a character that is not in
            LETTERS.
    """
    subaction_type = random.choice(SUB_ACTION_ROUTE)
    destination = random.choice(s25_fixes)
    label = "_Route_" + destination

    # Pick the lead-in wording for the drawn sub-action
    if subaction_type == "On Route":
        lead_in = random.choice(["route ", "route to ", "route direct ", "route direct to "])
    elif subaction_type == "Resume":
        lead_in = "resume own navigation " + random.choice(["to ", "direct to "])
    else:
        lead_in = ""

    # Spell the waypoint letter by letter; each word carries a trailing space
    spelled = "".join(PHONETIC_LETTERS[LETTERS.index(char)] + " " for char in destination)

    # Drop the final character, which is the trailing separator
    return (" " + lead_in + spelled)[:-1], label

    

In [7]:
def gen_data(n_samples):
    """Generate ``n_samples`` random instructions.

    Each sample is a call sign followed by one action. The action category is
    drawn from ACTION_TYPE with weights 5/5/5/1/3/5 and produced by the
    matching ``gen_*`` helper.

    Args:
        n_samples: number of samples to generate.

    Returns:
        A ``(speech_spoken, speech_str)`` tuple of equal-length lists: the
        spoken transcriptions and the compact labels used as file names.
    """
    # Transcriptions (labels for the tokenizer) and file-name stems
    speech_spoken = []
    speech_str = []

    # Action category -> generator producing its (spoken, label) pair
    generators = {
        "Flight Level": gen_flight_level,
        "Heading": gen_heading,
        "Speed": gen_speed,
        "Incomm": gen_incomms,
        "Outcomm": gen_outcomms,
        "Route": gen_route,
    }

    for _ in range(n_samples):
        # The call sign is drawn before the action category
        callsign_spoken, callsign_str = gen_call_sign()

        # Weights follow the order of ACTION_TYPE
        action_type = random.choices(ACTION_TYPE, weights=[5, 5, 5, 1, 3, 5], k=1)[0]
        gen_fn = generators.get(action_type)
        action_spoken, action_str = gen_fn()

        speech_spoken.append(callsign_spoken + action_spoken)
        speech_str.append(callsign_str + action_str)

    return speech_spoken, speech_str

In [8]:
# vocab = [x.lower() for x in vocab]
# word_tokens = " ".join(speech_spoken).lower().split(" ")

# counts = np.zeros(len(vocab), dtype=int)
# for token in word_tokens:
#     try:
#         idx = vocab.index(token)
#         counts[idx] += 1
#     except ValueError:
#         pass

# token_dist = {
#     vocab[i]: counts[i] for i in range(len(vocab))
# }
# token_dist

In [9]:
# Training data
speech_train, file_train = gen_data(n_samples=500)

# Create the output folder up front so saving the first clip cannot fail on a
# fresh checkout
os.makedirs("data_gtts/train", exist_ok=True)

# Save files: one mp3 per sample, with a random accent (tld) and speaking rate.
# NOTE(review): the file name is derived from the label only, so two samples
# with the same label write to the same path and the later one wins.
filenames_train = []
for transcript, stem in zip(speech_train, file_train):
    clip_path = "data_gtts/train/" + stem + ".mp3"
    # Convert str to mp3
    myobj = gTTS(text=transcript, lang='en', tld=random.choice(ACCENT), slow=random.choice([True, False]))
    myobj.save(clip_path)
    filenames_train.append(clip_path)

# Create metadata
metadata_train = {"file_name": filenames_train, "transcription": speech_train}
df_train = pd.DataFrame(metadata_train)
# Lower-case "train" matches the folder the clips are written to; the previous
# "Train" spelling pointed at a different directory on case-sensitive systems.
df_train.to_csv("./data_gtts/train/metadata.csv", index=False)

In [None]:
# Testing data
speech_test, file_test = gen_data(n_samples=150)

# Create the output folder up front so saving the first clip cannot fail on a
# fresh checkout
os.makedirs("data_gtts/test", exist_ok=True)

# Save files: one mp3 per sample, with a random accent (tld) and speaking rate.
# NOTE(review): the file name is derived from the label only, so two samples
# with the same label write to the same path and the later one wins.
filenames_test = []
for transcript, stem in zip(speech_test, file_test):
    clip_path = "data_gtts/test/" + stem + ".mp3"
    # Convert str to mp3
    myobj = gTTS(text=transcript, lang='en', tld=random.choice(ACCENT), slow=random.choice([True, False]))
    myobj.save(clip_path)
    filenames_test.append(clip_path)

# Create metadata
metadata_test = {"file_name": filenames_test, "transcription": speech_test}
df_test = pd.DataFrame(metadata_test)
df_test.to_csv("./data_gtts/test/metadata.csv", index=False)

In [None]:
# df_metadata = pd.concat([df_train, df_test], ignore_index=True)
# df_metadata.to_csv("./data_gtts/metadata.csv", index=False)

In [None]:
from datasets import load_dataset

# Resolve the data folder from the working directory instead of a
# machine-specific absolute path. The cells that write the clips already use
# "data_gtts/..." relative to the working directory, so this points at the
# same folder on any machine.
# NOTE(review): assumes the loader accepts a native-separator absolute path
# for data_dir — confirm on Windows.
dataset = load_dataset("data_gtts", data_dir=os.path.abspath("data_gtts"))

dataset

Using custom data configuration default-9b41ac5bfd8c70c2


Downloading and preparing dataset data_gtts/default to I:/Repos/HFdatasets/data_gtts/default-9b41ac5bfd8c70c2/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset data_gtts downloaded and prepared to I:/Repos/HFdatasets/data_gtts/default-9b41ac5bfd8c70c2/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 10
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 5
    })
})

In [None]:
# empty folders
import os

def empty_folders():
    """Delete every entry directly inside the train and test output folders.

    The train folder is emptied first, then the test folder. Entries are
    removed with ``os.remove``, so the folders themselves are kept.
    """
    for folder in ('./data_gtts/train/', './data_gtts/test/'):
        for entry in os.listdir(folder):
            os.remove(os.path.join(folder, entry))

# empty_folders()