In [9]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from langdetect import detect
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, LOWER
import spacy
from gensim.models import word2vec

nlp = spacy.load("fr_core_news_sm")
matcher = Matcher(nlp.vocab)

# Read Request & Determine Language

In [10]:
eng_text = 'Hi, I would like to travel this winter and go skiing. Normally I will go from paris to grenoble to ski at my favorite resort!'
fr_text = 'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lucelle depuis Paris pour arriver chez ma station préférée !'

In [11]:
"""
Detect if text is French
"""
def is_french(text):
    return 'fr' == detect(text)

In [12]:
print("English Text: ", is_french(eng_text))
print("French Text: ", is_french(fr_text))

English Text:  False
French Text:  True


# Extract Departure and Destination

In [13]:
# must download french package with :
# python -m spacy download fr_core_news_sm
doc = nlp(fr_text)
for entity in doc.ents:
    print(entity.label_, ' | ', entity.text)

LOC  |  Bonjour
PER  |  Ryan
LOC  |  Lucelle
LOC  |  Paris


## spaCy Matcher trained with Geonames

In [53]:
# use Geonames file to train spaCy Matcher
fr_cities = pd.read_csv('data/FR_villes.txt', sep="\t", header=None)
fr_cities[1].array

<PandasArray>
[                                    'Col de Recon',
                                          'Lucelle',
                            'Les Cornettes de Bise',
                                        'Lertzbach',
                                  'Le Cheval Blanc',
                                         'Jougnena',
                                           'London',
                                       'Wolfesberg',
                                       'Saar River',
                                         'Rosselle',
 ...
               'Abbaye de Saint-Florent-lès-Saumur',
                           'Abbatiale Saint-Pierre',
           'Ancienne Abbaye Saint-Pierre de Corbie',
             "Site archéologique d'Alba-la-Romaine",
                    'Église Saint-Hilaire le Grand',
                               'Abbaye Saint-Winoc',
               'Église Saint-Mathias de Barbezieux',
 'Basilique Saint-Étienne de Neuvy-Saint-Sépulchre',
                           

In [15]:
def skillPattern(skill):
    pattern = []
    for b in skill.split():
        pattern.append({'LOWER':b})  
    return pattern

def buildPatterns(skills):
    pattern = []
    for skill in skills:
        pattern.append(skillPattern(skill))
    return list(zip(skills, pattern))
def on_match(matcher, doc, id, matches):
    return matches

def buildMatcher(patterns):
    name = ""
    list_dict = []
    for pattern in patterns:
        name += pattern[0]
        list_dict.append(pattern[1])    
    matcher.add(name, list_dict)
    return matcher
    
def cityMatcher(matcher, text):
    skills = []
    doc = nlp(text.lower())
    matches = matcher(doc)
    for b in matches:
        match_id, start, end = b
        print(doc[start : end])

In [16]:
cities = [ 'paris',
'grenoble',
'kanpur',
'noida',
'ghaziabad',
'chennai',
'hydrabad',
'luckhnow',
'saharanpur',
'dehradun',
'bombay']

In [17]:
patterns = buildPatterns(fr_cities[1].array)

In [18]:
print(patterns[1])
print(len(patterns))


('Lucelle', [{'LOWER': 'Lucelle'}])
167884


In [54]:
city_matcher = buildMatcher(patterns)

<spacy.matcher.matcher.Matcher object at 0x17f43d050>


In [62]:
len(city_matcher)
print(cityMatcher(city_matcher, fr_text[0]))

None


## Word2Vec

In [21]:
# tokenize 
text_tokenized = word_tokenize(fr_text)
text_tokenized

['Bonjour',
 ',',
 'je',
 "m'appelle",
 'Ryan',
 'et',
 "j'aimerais",
 'voyager',
 'cet',
 'hivers',
 'et',
 'faire',
 'du',
 'ski',
 '.',
 'Normalement',
 'je',
 'j\\irai',
 'à',
 'Lucelle',
 'depuis',
 'Paris',
 'pour',
 'arriver',
 'chez',
 'ma',
 'station',
 'préférée',
 '!']

In [22]:
porter = PorterStemmer()
text_stem = [porter.stem(word) for word in text_tokenized]
text_stem

['bonjour',
 ',',
 'je',
 "m'appel",
 'ryan',
 'et',
 "j'aimerai",
 'voyag',
 'cet',
 'hiver',
 'et',
 'fair',
 'du',
 'ski',
 '.',
 'normal',
 'je',
 'j\\irai',
 'à',
 'lucel',
 'depui',
 'pari',
 'pour',
 'arriv',
 'chez',
 'ma',
 'station',
 'préférée',
 '!']

In [23]:
model = word2vec.Word2Vec(
    [text_stem],
    window=20,
    min_count=2,
    workers=1
)
model.corpus_count

1

In [24]:
vocab = model.wv.key_to_index
vocab

{'et': 0, 'je': 1}

In [25]:
model.train(fr_text, total_examples=1,epochs=1)

(0, 161)

## Sentence Similarity

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


In [50]:
test_sentences = [
    'Voyager en train de lille à lyon',
    'Les trains sont mieux. J\'irai de Lille à Lyon',
    'A toulon et prendre un bus à marseille',
    'A toulon et prendre un avion à marseille',
    'A toulon et marcher à marseille',
    'Manger des fruits',
    'Nager a la plage'    
]
fr_text = ['Je veux prendre un train de paris à lyon']

test_sentences_embeddings = model.encode(test_sentences)
real_sentence_embedding = model.encode(fr_text)

In [28]:
cosine_similarity(
    [real_sentence_embedding[0]],
    test_sentences_embeddings[0:]
)

array([[0.7666555 , 0.79194474, 0.6420541 , 0.60242796, 0.49063894,
        0.34201652, 0.41831568]], dtype=float32)

In [61]:
'''
Prendre un paragraph et renvoyer les endroits
SI il existe une demande de transport
'''
def extract_cities(sentences):
    # model must already be loaded
    sentence_embeddings = model.encode(sentences)
    similarities = cosine_similarity(
        [real_sentence_embedding[0]],
        sentence_embeddings
    )
    biggest_number = max(similarities[0])
    if biggest_number < 0.75:
        return "SPAM"
    best_sentence_ind = np.where(similarities[0] == biggest_number)
    best_sentence = sentences[best_sentence_ind[0][0]]
    
    return cityMatcher(city_matcher, best_sentence)
    
print(extract_cities(test_sentences))

<class 'str'>
None
