# Component One: Voice Recorder

In [16]:
import os
import wave
import json

import numpy as np
import sounddevice
from scipy.io.wavfile import write
from vosk import Model, KaldiRecognizer, SetLogLevel

## Step 1: Record Voice

This component will record an audio sample, and save it to an audio file.

### Process
1. Run the cell below
2. Input a number for the amount of time you want the recorder to run
3. (If prompted) grant access to your device's microphone
4. Speak
5. The cell will automatically stop running after the time is completed

### Details
- The number of channels is specific to your device
- The function creates a new file or rewrites the existing one

In [17]:
# Record `second` seconds of microphone audio and save them to audio_file.wav.
fs = 44100  # value passed as the sample rate to both the recorder and the WAV writer
second = int(input("Enter time duration in seconds: "))
if second <= 0:
    # A zero or negative duration would request an empty/invalid recording buffer.
    raise ValueError("Recording duration must be a positive number of seconds.")

print("Recording.....\n")
# channels=1 and int16 samples match the mono 16-bit PCM check done by the
# transcription step; the channel count might be different depending on machine.
record_voice = sounddevice.rec(
    int(second * fs),
    samplerate=fs,
    channels=1,
    dtype=np.int16,
)
sounddevice.wait()  # block until the recording has finished
write("audio_file.wav", fs, record_voice)  # creates the file or overwrites an existing one
print("Finished.....\nPlease check your output file")

Recording.....n
Finished.....nPlease check your output file


## Step 2: Transcribe Audio

This component takes an audio file and writes its content to a text file

### Process
1. Download a Vosk model. The LinTO French model is recommended
2. Run the cell below

### Details
- the model must be downloaded and have the appropriate name
- the file names are provided at the beginning of the cell

In [18]:
# Transcribe the recorded WAV file with a Vosk model and write the text to disk.
model_name = "linto"
audio_file = './audio_file.wav'
text_file = 'transcription.txt'

model_path = "../../app/back/models/{}".format(model_name)

# Fail with an exception rather than exit(): the cell stops with a readable
# error and the notebook kernel is left alone.
if not os.path.exists(model_path):
    raise FileNotFoundError(
        "Please download the model from https://alphacephei.com/vosk/models "
        "and unpack it to '{}'.".format(model_path)
    )

result = []
# The context manager guarantees the wave file is closed, even on error.
with wave.open(audio_file, "rb") as wf:
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        raise ValueError("Audio file must be WAV format mono PCM.")

    model = Model(model_path)
    rec = KaldiRecognizer(model, wf.getframerate())
    # rec.SetMaxAlternatives(10)
    # rec.SetWords(True)

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        # AcceptWaveform returns True once an utterance has been finalised.
        if rec.AcceptWaveform(data):
            result.append(json.loads(rec.Result()))

    # Flush the recogniser: audio after the last finalised utterance is only
    # returned by FinalResult(), so without this the tail would be lost.
    result.append(json.loads(rec.FinalResult()))

# ret = [sentence["alternatives"][0]["text"] for sentence in result]
# Keep every non-empty utterance, not just the first one. Joined with spaces
# so that no punctuation is introduced into the transcription.
ret = " ".join(
    sentence["text"] for sentence in result if sentence.get("text")
)

with open(text_file, 'w') as file:
    file.write(ret)

# Component Two: Natural Language Processing

In [27]:
from langdetect import detect
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import spacy
from nltk import word_tokenize

This component takes a text file and extracts the departure and destination of a travel request.

### Process
1. Run the cell below

### Details
- the model must be downloaded and have the appropriate name

In [35]:
# File read by get_destination_and_departure().
text_file = "transcription.txt"

# spaCy pipeline used for entity extraction, and the sentence-embedding model
# used to score similarity against the example travel sentence.
nlp = spacy.load("fr_core_news_md")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Words that, when found immediately before a city, mark it as the departure.
words_before_departure = [
    "de",
    "depuis",
    "provenance",
]
# Words that, when found immediately before a city, mark it as the destination.
words_before_destination = [
    "à",
    "a",
    "en",
    "jusqu'a",
    "vers",
    "par",
]
# Reference sentence that candidate sentences are compared with.
example_travel_sentence = ["Je veux prendre un train de paris à lyon"]

def get_cities(sentence):
    """Extract the location entities found in a sentence.

    The sentence is run through the module-level spaCy pipeline ``nlp`` and
    every entity labelled ``"LOC"`` is kept, in document order.

    Args:
        sentence (str): any sentence

    Returns:
        list[str]: text of each ``LOC`` entity (empty if none were found)
    """
    parsed = nlp(sentence)
    return [ent.text for ent in parsed.ents if ent.label_ == "LOC"]

def check_for_travel_request(sentences, threshold=0.75):
    """Pick the sentence that most resembles a request to travel by train.

    Every sentence is embedded with the module-level ``model`` and compared,
    by cosine similarity, with the embedding of ``example_travel_sentence``.

    Args:
        sentences (list[str]): candidate sentences
        threshold (float): minimum similarity the best candidate must reach
            to be accepted. Defaults to 0.75.

    Returns:
        str | bool: the best-matching sentence, or False when ``sentences``
        is empty or no sentence reaches ``threshold``
    """
    # Nothing to compare: avoid encoding/scoring an empty batch.
    if len(sentences) == 0:
        return False

    sentence_embeddings = model.encode(sentences)
    reference_embedding = model.encode(example_travel_sentence)
    # One row of scores: reference sentence versus each candidate.
    scores = cosine_similarity(
        [reference_embedding[0]],
        sentence_embeddings
    )[0]

    # argmax returns the first index holding the highest score.
    best_index = int(np.argmax(scores))
    if scores[best_index] < threshold:
        return False
    return sentences[best_index]

def get_destination_and_departure():
    """Read the pre-defined text file and determine the departure and destination.

    The content of ``text_file`` is split into sentences on '.', the first
    sentence is checked to be French, and the sentence selected by
    ``check_for_travel_request`` is scanned for locations. A location is
    classified by the word immediately preceding it
    (``words_before_departure`` / ``words_before_destination``).

    Returns:
        dict: ``{"departure": str, "destination": str}`` using the first
        location found for each role
        OR False if the text is empty, not French, contains no travel
        request, or lacks either a departure or a destination
    """
    # read the text file
    with open(text_file, "r") as file:
        text = file.read()

    # str.split returns a new list; keep it and drop blank fragments
    # (e.g. the empty string after a trailing '.').
    sentences = [part.strip() for part in text.split('.') if part.strip()]
    if not sentences:
        return False

    if detect(sentences[0]) != 'fr':
        return False

    # check for travel request
    request = check_for_travel_request(sentences)
    if not request:
        return False

    # get destination and departure
    departure = []
    destination = []

    cities = get_cities(request)
    words = word_tokenize(request)
    for city in cities:
        # An entity may span several tokens; locate it by its first word and
        # skip it if the tokenizer split the text differently.
        city_tokens = city.split()
        if not city_tokens or city_tokens[0] not in words:
            continue
        index = words.index(city_tokens[0])
        if index == 0:
            continue
        previous_word = words[index - 1].lower()
        if previous_word in words_before_departure:
            departure.append(city)
        elif previous_word in words_before_destination:
            destination.append(city)

    # Both ends of the trip are required; otherwise the text is not valid.
    if not departure or not destination:
        return False

    return {
        "departure": departure[0],
        "destination": destination[0]
    }
    
# Run the extraction on the transcription file; as the cell's last expression,
# the returned value is displayed as the cell output.
get_destination_and_departure()

{'departure': 'paris', 'destination': 'marseille'}

# Component Three: Pathfinder

This component takes a departure and destination, and calculates the fastest path between the nearest train stations to these locations.

### Process
1. Run the cell below
...

### Details
- ...