In [3]:
import sys
import os

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

from nltk import word_tokenize
from nltk.corpus import stopwords
from langdetect import detect
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, LOWER
import spacy
from gensim.models import word2vec
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import mpu

model = SentenceTransformer('all-MiniLM-L6-v2')
nlp = spacy.load("fr_core_news_md")
# nlp_eng = spacy.load("en_core_web_sm")

# matcher = Matcher(nlp.vocab)

# Read Request & Determine Language

In [2]:
eng_text = 'Hi, I would like to travel this winter and go skiing. Normally I will go from paris to grenoble to ski at my favorite resort!'
fr_text = [
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lucelle depuis Paris pour arriver chez ma station préférée !',
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lucelle depuis Paris pour arriver chez ma station préférée !',
    'Bonjour, je m\'ppelle Ryan et j\'aimerais Voyager cet Hivers et Faire du Ski. Normalement je j\irai à lucelle Depuis paris pour Arriver chez ma Station préférée !',
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Lyon depuis Marseille pour arriver chez ma station préférée !',
    'Bonjour, je m\'appelle Ryan et j\'aimerais voyager cet hivers et faire du ski. Normalement je j\irai à Foix depuis Strasbourg pour arriver chez ma station préférée !'
]

In [3]:
"""
Detect if text is French
"""
def is_french(text):
    return 'fr' == detect(text)

In [5]:
print("English Text: ", is_french(eng_text))
print("French Text: ", is_french(fr_text[0]))

English Text:  False
French Text:  True


In [6]:
"""
Detect if text is French
"""
def is_french(text):
    return 'fr' == detect(text)

# Extract Departure and Destination

In [7]:
# must download french package with :
# python -m spacy download fr_core_news_sm
for text in fr_text:
    doc = nlp(text)
    for entity in doc.ents:
        print(entity.label_, ' | ', entity.text, entity.start_char, entity.end_char)
    print("===========================")

PER  |  Ryan 22 26
LOC  |  Lucelle 101 108
LOC  |  Paris 116 121
PER  |  Ryan 22 26
LOC  |  Lucelle 101 108
LOC  |  Paris 116 121
PER  |  Ryan 21 25
MISC  |  Voyager cet Hivers 40 58
MISC  |  Faire du Ski 62 74
MISC  |  Arriver chez ma Station préférée ! 126 160
PER  |  Ryan 22 26
LOC  |  Lyon 101 105
LOC  |  Marseille 113 122
PER  |  Ryan 22 26
LOC  |  Foix 101 105
LOC  |  Strasbourg 113 123


In [8]:
words_before_departure = ['de', 'depuis', 'provence']
words_before_destination = ['à', 'a', 'en', 'jusqu\'a']

def get_cities(sentence):
    """ Take a sentence and return all cities within

    Args:
        sentence (str): any sentence

    Returns:
        Array: A list of cities
    """
    cities = []
    doc = nlp(sentence)
    for entity in doc.ents:
        if entity.label_ == "LOC":
            cities.append(entity.text)
    
    return cities

def determine_departure_destination(sentence):
    """ Take a travel request sentence and
        return the departure and destination

    Args:
        sentence (str): Travel request sentence

    Returns:
        dict: departure and destination as keys
    """
    departure = []
    destination = []
    cities = get_cities(sentence)
    words = word_tokenize(sentence)
    # print("SENTENCE ", sentence)
    # print("CITIES ", cities)
    # print("WORDS ", words)
    for city in cities:
        index = words.index(city)
        if index == 0: continue
        if words[index-1] in words_before_departure: departure.append(city)
        elif words[index-1] in words_before_destination: destination.append(city)
    
    return {
        "departure": departure,
        "destination": destination
    }

determine_departure_destination(fr_text[0])

{'departure': ['Paris'], 'destination': ['Lucelle']}

## spaCy Matcher trained with Geonames

In [16]:
doc1 = nlp("lucelle")
doc2 = nlp("Lucelle")
doc1.similarity(doc2)

  "__main__", mod_spec)


0.0

In [11]:
# use Geonames file to train spaCy Matcher
# fr_cities = pd.read_csv('../../app/data/FR_villes.txt', sep="\t", header=None)
# fr_cities[1].array
fr_cities = ['Lucelle', 'Paris', 'Strasbourg', 'Marseille']
# token_cities = nlp(' '.join(fr_cities))
search = 'lucelle'
token_search = nlp(search)
for t in fr_cities:
    token = nlp(t)
    print("token  ", token.has_vector)
    print("search ", token_search.has_vector)
    print(token_search)
    print(token)
    print(token_search.similarity(token))

    """
    Decent idea but too problematic since word vectors are inexistant (sentences must be used)
    """

token   True
search  True
lucelle
Lucelle
0.0
token   True
search  True
lucelle
Paris
0.0
token   True
search  True
lucelle
Strasbourg
0.0
token   True
search  True
lucelle
Marseille
0.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


In [57]:
def skillPattern(skill):
    pattern = []
    for b in skill.split():
        pattern.append({'LOWER':b})  
    return pattern

def buildPatterns(skills):
    pattern = []
    for skill in skills:
        pattern.append(skillPattern(skill))
    return list(zip(skills, pattern))
def on_match(matcher, doc, id, matches):
    return matches

def buildMatcher(patterns):
    name = ""
    list_dict = []
    for pattern in patterns:
        name += pattern[0]
        list_dict.append(pattern[1])    
    matcher.add(name, list_dict)
    return matcher
    
def cityMatcher(matcher, text):
    skills = []
    doc = nlp(text.lower())
    matches = matcher(doc)
    for b in matches:
        match_id, start, end = b
        print(doc[start : end])

In [72]:
cities = [ 'paris',
'grenoble',
'kanpur',
'noida',
'ghaziabad',
'chennai',
'hydrabad',
'luckhnow',
'saharanpur',
'dehradun',
'bombay']

In [77]:
patterns = buildPatterns(fr_cities[1].array)

In [87]:
print(patterns[1])
print(len(patterns))


('Lucelle', [{'LOWER': 'Lucelle'}])
167884


In [79]:
city_matcher = buildMatcher(patterns)
len(city_matcher)

3

In [24]:
vocab = model.wv.key_to_index
vocab

{'et': 0, 'je': 1}

In [25]:
model.train(fr_text, total_examples=1,epochs=1)

(0, 161)

## GeoLocation

In [22]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="travel_request")
location = geolocator.geocode("paris")
location

Location(Paris, Île-de-France, France métropolitaine, France, (48.8588897, 2.3200410217200766, 0.0))

In [24]:
location[0]

'Paris, Île-de-France, France métropolitaine, France'

## Sentence Similarity

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


In [26]:
test_sentences = [
    'Voyager en train de lille à lyon',
    'Les trains sont mieux. J\'irai de Lille à Lyon',
    'A toulon et prendre un bus à marseille',
    'A toulon et prendre un avion à marseille',
    'A toulon et marcher à marseille',
    'Manger des fruits',
    'Nager a la plage' ,
    'je veux aller du rez de chaussée jusqu\'au quatrieme etage',
    'Je veux prendre un train de paris à lyon'   
]
fr_text = ['Je veux prendre un train de paris à lyon']

test_sentences_embeddings = model.encode(test_sentences)
real_sentence_embedding = model.encode(fr_text)

In [27]:
cosine_similarity(
    [real_sentence_embedding[0]],
    test_sentences_embeddings[0:]
)

array([[0.76665556, 0.7919448 , 0.64205396, 0.60242796, 0.49063888,
        0.34201646, 0.4183157 , 0.4673643 , 1.        ]], dtype=float32)

In [61]:
'''
Prendre un paragraph et renvoyer les endroits
SI il existe une demande de transport
'''
def extract_travel_request(sentences):
    # model must already be loaded
    sentence_embeddings = model.encode(sentences)
    similarities = cosine_similarity(
        [real_sentence_embedding[0]],
        sentence_embeddings
    )
    biggest_number = max(similarities[0])
    if biggest_number < 0.75:
        return "SPAM"
    best_sentence_ind = np.where(similarities[0] == biggest_number)
    best_sentence = sentences[best_sentence_ind[0][0]]
    
    return cityMatcher(city_matcher, best_sentence)
    
print(extract_cities(test_sentences))

<class 'str'>
None


# Epitech Dataset

In [50]:
# Get data paths
calendar_dates_path = "./epitech_data/data_sncf/calendar_dates.txt"
calendar_path = "./epitech_data/data_sncf/calendar.txt"
routes_path = "./epitech_data/data_sncf/routes.txt"
stop_times_path = "./epitech_data/data_sncf/stop_times.txt"
stops_path = "./epitech_data/data_sncf/stops.txt"
transfers_path = "./epitech_data/data_sncf/transfers.txt"
trips_path = "./epitech_data/data_sncf/trips.txt"
timetable_path = "./epitech_data/timetables.csv"

In [52]:
calendar_dates = pd.read_csv(calendar_dates_path)
calendar = pd.read_csv(calendar_path)
routes = pd.read_csv(routes_path)
stop_times = pd.read_csv(stop_times_path)
stops = pd.read_csv(stops_path)
transfers = pd.read_csv(transfers_path)
trips = pd.read_csv(trips_path)
timetables = pd.read_table(timetable_path)

## Table Types

### Calendar Dates

In [32]:
calendar_dates.dtypes
# dates -> DATE

service_id        int64
date              int64
exception_type    int64
dtype: object

### Calendar

In [33]:
calendar.dtypes
# start_date, end_date -> DATE

service_id    int64
monday        int64
tuesday       int64
wednesday     int64
thursday      int64
friday        int64
saturday      int64
sunday        int64
start_date    int64
end_date      int64
dtype: object

### Routes

In [58]:
routes.dtypes
# all floats are by default but should be string

route_id             object
agency_id            object
route_short_name    float64
route_long_name      object
route_desc          float64
route_type            int64
route_url           float64
route_color         float64
route_text_color    float64
dtype: object

### Trips

In [26]:
trips.dtypes

route_id          object
service_id         int64
trip_id           object
trip_headsign      int64
direction_id       int64
block_id         float64
shape_id         float64
dtype: object

### Stop Times

In [36]:
stop_times.dtypes
# arrival_time, departure_time -> TIME
# shape_dist_traveled ?

trip_id                 object
arrival_time            object
departure_time          object
stop_id                 object
stop_sequence            int64
stop_headsign          float64
pickup_type              int64
drop_off_type            int64
shape_dist_traveled    float64
dtype: object

### Stops

In [37]:
stops.dtypes
# zone_id, stop_url, stop_desc are NULL

stop_id            object
stop_name          object
stop_desc         float64
stop_lat          float64
stop_lon          float64
zone_id           float64
stop_url          float64
location_type       int64
parent_station     object
dtype: object

### Transfers

In [38]:
transfers.dtypes

from_stop_id         object
to_stop_id           object
transfer_type        object
min_transfer_time    object
dtype: object

### Timetables

In [53]:
timetables.dtypes

trip_id    object
trajet     object
duree       int64
dtype: object

## Table Length

In [57]:
# print(len(timetables["trip_id"].unique()))
timetables.shape

(1575, 3)

In [49]:
trips.shape

(24111, 7)

In [66]:
# print(stops.iloc[0])
stops.shape

(9176, 9)

In [41]:
calendar_dates.shape

(38160, 3)

In [42]:
calendar.shape

(5015, 10)

## Get closest station

In [128]:
def get_distance(lat1, lon1, lat2, lon2):
    """Calculate the distance between to geographical points

    Args:
        lat_1 (float): first latitude
        long_1 (float): first longitude
        lat_2 (float): second latitude
        long_2 (float): second longitude
    """
    return mpu.haversine_distance((lat1, lon1), (lat2, lon2))

## Get routes from station

In [115]:
def get_routes(stop_id):
    """Get all routes passing through the station 

    Args:
        stop_id (str): train station id in stops table
    """
    try:
        # sometimes stop_id are not in the stop_times
        stop_times_with_stop = stop_times[stop_times.stop_id == stop_id]
    except Exception as e:
        return e.message
    trips_with_stop_times = trips[trips.trip_id.isin(stop_times_with_stop.trip_id)]
    routes_with_trips = routes[routes.route_id.isin(trips_with_stop_times.route_id)]
    # print(routes_with_trips)
    
    return routes_with_trips
    
# get_routes("StopArea:OCE87381509")  NE MARCHE PAS  
get_routes("StopPoint:OCECar TER-87381509")

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,OCE1506035,OCESN,,Paris-Vernon-Rouen-Le Havre,,2,,,
172,OCE1526128,OCESN,,Paris Caen,,2,,,


## Get stations in route

In [127]:
def get_stations(route_id):
    """Get all station in a route

    Args:
        route_id (str): route id in routes tables
    """
    trips_with_route = trips[trips.route_id == route_id]
    stop_times_with_trips = stop_times[stop_times.trip_id.isin(trips_with_route.trip_id)]
    stops_with_stop_times = stops[stops.stop_id.isin(stop_times_with_trips.stop_id)]
    # print(stop_times_with_trips)
    
    return stops_with_stop_times

get_stations("OCE1506035")

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
3836,StopPoint:OCECar TER-87381509,Gare de Mantes-la-Jolie,,48.989687,1.703294,,,0,StopArea:OCE87381509
3837,StopPoint:OCECar TER-87415604,Gare de Vernon-Giverny,,49.091286,1.478363,,,0,StopArea:OCE87415604
3838,StopPoint:OCECar TER-87415620,Gare de Gaillon-Aubevoye,,49.174632,1.352518,,,0,StopArea:OCE87415620
3839,StopPoint:OCECar TER-87415877,Gare de Val-de-Reuil,,49.275399,1.224609,,,0,StopArea:OCE87415877
3840,StopPoint:OCECar TER-87411207,Gare de Oissel,,49.343042,1.101821,,,0,StopArea:OCE87411207
3841,StopPoint:OCECar TER-87411017,Gare de Rouen-Rive-Droite,,49.44903,1.094154,,,0,StopArea:OCE87411017
3842,StopPoint:OCETrain TER-87384008,Gare de Paris-St-Lazare,,48.877865,2.324433,,,0,StopArea:OCE87384008
3843,StopPoint:OCETrain TER-87411017,Gare de Rouen-Rive-Droite,,49.44903,1.094154,,,0,StopArea:OCE87411017
3844,StopPoint:OCETrain TER-87411801,Gare de Montville,,49.551033,1.069749,,,0,StopArea:OCE87411801
3845,StopPoint:OCETrain TER-87411819,Gare de Clères,,49.59908,1.106174,,,0,StopArea:OCE87411819
