In [50]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize
from nltk.corpus import stopwords
from langdetect import detect
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, LOWER
import spacy

nlp = spacy.load("fr_core_news_sm")
matcher = Matcher(nlp.vocab)

# Read Request & Determine Language

In [85]:
eng_text = 'Hi, I would like to travel this winter and go skiing. Normally I will go from paris to grenoble to ski at my favorite resort!'
fr_text = 'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lucelle depuis Paris pour arriver chez ma station préférée !'

In [6]:
print("English Text: ", is_french(eng_text))
print("French Text: ", is_french(fr_text))

English Text:  False
French Text:  True


In [5]:
"""
Detect if text is French
"""
def is_french(text):
    return 'fr' == detect(text)

# Extract Departure and Destination

In [18]:
# must download french package with :
# python -m spacy download fr_core_news_sm
doc = nlp(fr_text)
for entity in doc.ents:
    print(entity.label_, ' | ', entity.text)

LOC  |  Bonjour
PER  |  Ryan
LOC  |  Paris


## spaCy Matcher trained with Geonames

In [56]:
# use Geonames file to train spaCy Matcher
fr_cities = pd.read_csv('data/FR_villes.txt', sep="\t", header=None)
fr_cities[1].array

  interactivity=interactivity, compiler=compiler, result=result)


<PandasArray>
[                                    'Col de Recon',
                                          'Lucelle',
                            'Les Cornettes de Bise',
                                        'Lertzbach',
                                  'Le Cheval Blanc',
                                         'Jougnena',
                                           'London',
                                       'Wolfesberg',
                                       'Saar River',
                                         'Rosselle',
 ...
               'Abbaye de Saint-Florent-lès-Saumur',
                           'Abbatiale Saint-Pierre',
           'Ancienne Abbaye Saint-Pierre de Corbie',
             "Site archéologique d'Alba-la-Romaine",
                    'Église Saint-Hilaire le Grand',
                               'Abbaye Saint-Winoc',
               'Église Saint-Mathias de Barbezieux',
 'Basilique Saint-Étienne de Neuvy-Saint-Sépulchre',
                           

In [57]:
def skillPattern(skill):
    pattern = []
    for b in skill.split():
        pattern.append({'LOWER':b})  
    return pattern

def buildPatterns(skills):
    pattern = []
    for skill in skills:
        pattern.append(skillPattern(skill))
    return list(zip(skills, pattern))
def on_match(matcher, doc, id, matches):
    return matches

def buildMatcher(patterns):
    name = ""
    list_dict = []
    for pattern in patterns:
        name += pattern[0]
        list_dict.append(pattern[1])    
    matcher.add(name, list_dict)
    return matcher
    
def cityMatcher(matcher, text):
    skills = []
    doc = nlp(text.lower())
    matches = matcher(doc)
    for b in matches:
        match_id, start, end = b
        print(doc[start : end])

In [72]:
cities = [ 'paris',
'grenoble',
'kanpur',
'noida',
'ghaziabad',
'chennai',
'hydrabad',
'luckhnow',
'saharanpur',
'dehradun',
'bombay']

In [77]:
patterns = buildPatterns(fr_cities[1].array)

In [87]:
print(patterns[1])
print(len(patterns))


('Lucelle', [{'LOWER': 'Lucelle'}])
167884


In [79]:
city_matcher = buildMatcher(patterns)
len(city_matcher)

3

In [86]:
print(cityMatcher(city_matcher, fr_text))

paris
None
