In [24]:
import sys
import os

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

from nltk import word_tokenize
from nltk.corpus import stopwords
from langdetect import detect
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, LOWER
import spacy
from gensim.models import word2vec
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
nlp = spacy.load("fr_core_news_md")
# nlp_eng = spacy.load("en_core_web_sm")

# matcher = Matcher(nlp.vocab)

# Read Request & Determine Language

In [12]:
eng_text = 'Hi, I would like to travel this winter and go skiing. Normally I will go from paris to grenoble to ski at my favorite resort!'
fr_text = [
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lucelle depuis Paris pour arriver chez ma station préférée !',
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lucelle depuis Paris pour arriver chez ma station préférée !',
    'Bonjour, je m\'ppelle Ryan et j\'aimerais Voyager cet Hivers et Faire du Ski. Normalement je j\irai à lucelle Depuis paris pour Arriver chez ma Station préférée !',
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lyon depuis Marseille pour arriver chez ma station préférée !',
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Foix depuis Strasbourg pour arriver chez ma station préférée !'
]

In [6]:
"""
Detect if text is French
"""
def is_french(text):
    return 'fr' == detect(text)

In [7]:
print("English Text: ", is_french(eng_text))
print("French Text: ", is_french(fr_text))

English Text:  False
French Text:  True


In [5]:
"""
Detect if text is French
"""
def is_french(text):
    return 'fr' == detect(text)

# Extract Departure and Destination

In [18]:
# must download french package with :
# python -m spacy download fr_core_news_sm
for text in fr_text:
    doc = nlp(text)
    for entity in doc.ents:
        print(entity.label_, ' | ', entity.text, entity.start_char, entity.end_char)
    print("===========================")

PER  |  Ryan 22 26
LOC  |  Lucelle 101 108
LOC  |  Paris 116 121
PER  |  Ryan 22 26
LOC  |  Lucelle 101 108
LOC  |  Paris 116 121
PER  |  Ryan 21 25
MISC  |  Voyager cet Hivers 40 58
MISC  |  Faire du Ski 62 74
MISC  |  Arriver chez ma Station préférée ! 126 160
PER  |  Ryan 22 26
LOC  |  Lyon 101 105
LOC  |  Marseille 113 122
PER  |  Ryan 22 26
LOC  |  Foix 101 105
LOC  |  Strasbourg 113 123


In [22]:
words_before_departure = ['de', 'depuis', 'provence']
words_before_destination = ['à', 'a', 'en', 'jusqu\'a']

def get_cities(sentence):
    """ Take a sentence and return all cities within

    Args:
        sentence (str): any sentence

    Returns:
        Array: A list of cities
    """
    cities = []
    doc = nlp(sentence)
    for entity in doc.ents:
        if entity.label_ == "LOC":
            cities.append(entity.text)
    
    return cities

def determine_departure_destination(sentence):
    """ Take a travel request sentence and
        return the departure and destination

    Args:
        sentence (str): Travel request sentence

    Returns:
        dict: departure and destination as keys
    """
    departure = []
    destination = []
    cities = get_cities(sentence)
    words = word_tokenize(sentence)
    # print("SENTENCE ", sentence)
    # print("CITIES ", cities)
    # print("WORDS ", words)
    for city in cities:
        index = words.index(city)
        if index == 0: continue
        if words[index-1] in words_before_departure: departure.append(city)
        elif words[index-1] in words_before_destination: destination.append(city)
    
    return {
        "departure": departure,
        "destination": destination
    }

determine_departure_destination(fr_text[0])

{'departure': ['Paris'], 'destination': ['Lucelle']}

## spaCy Matcher trained with Geonames

In [3]:
# use Geonames file to train spaCy Matcher
fr_cities = pd.read_csv('data/FR_villes.txt', sep="\t", header=None)
fr_cities[1].array

FileNotFoundError: [Errno 2] No such file or directory: 'data/FR_villes.txt'

In [57]:
def skillPattern(skill):
    pattern = []
    for b in skill.split():
        pattern.append({'LOWER':b})  
    return pattern

def buildPatterns(skills):
    pattern = []
    for skill in skills:
        pattern.append(skillPattern(skill))
    return list(zip(skills, pattern))
def on_match(matcher, doc, id, matches):
    return matches

def buildMatcher(patterns):
    name = ""
    list_dict = []
    for pattern in patterns:
        name += pattern[0]
        list_dict.append(pattern[1])    
    matcher.add(name, list_dict)
    return matcher
    
def cityMatcher(matcher, text):
    skills = []
    doc = nlp(text.lower())
    matches = matcher(doc)
    for b in matches:
        match_id, start, end = b
        print(doc[start : end])

In [72]:
cities = [ 'paris',
'grenoble',
'kanpur',
'noida',
'ghaziabad',
'chennai',
'hydrabad',
'luckhnow',
'saharanpur',
'dehradun',
'bombay']

In [77]:
patterns = buildPatterns(fr_cities[1].array)

In [87]:
print(patterns[1])
print(len(patterns))


('Lucelle', [{'LOWER': 'Lucelle'}])
167884


In [79]:
city_matcher = buildMatcher(patterns)
len(city_matcher)

3

In [24]:
vocab = model.wv.key_to_index
vocab

{'et': 0, 'je': 1}

In [25]:
model.train(fr_text, total_examples=1,epochs=1)

(0, 161)

## Sentence Similarity

In [50]:
test_sentences = [
    'Voyager en train de lille à lyon',
    'Les trains sont mieux. J\'irai de Lille à Lyon',
    'A toulon et prendre un bus à marseille',
    'A toulon et prendre un avion à marseille',
    'A toulon et marcher à marseille',
    'Manger des fruits',
    'Nager a la plage'    
]
fr_text = ['Je veux prendre un train de paris à lyon']

test_sentences_embeddings = model.encode(test_sentences)
real_sentence_embedding = model.encode(fr_text)

In [28]:
cosine_similarity(
    [real_sentence_embedding[0]],
    test_sentences_embeddings[0:]
)

array([[0.7666555 , 0.79194474, 0.6420541 , 0.60242796, 0.49063894,
        0.34201652, 0.41831568]], dtype=float32)

In [61]:
print(cityMatcher(city_matcher, fr_text))

paris
None
