In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from geopy.distance import vincenty

In [2]:
def health_note_to_num(note):
    switcher = {
        "A": 0,  # Everythin is OK
        "B": 1,  # Minor observations
        "C": 2,  # 
        "D": 2,
    }
    return switcher.get(note, 0)

In [3]:
reviews_file = "./data/base_DFG_1k_pp"
reviews = pd.read_csv(reviews_file + ".csv")

In [4]:
# Add coords column
reviews["coords"] = reviews["latitude"].map(str) + "," + reviews["longitude"].map(str)
reviews.head(5)

Unnamed: 0,Adresse,Code Postal,Commentaire,Date du commentaire,Note,Origine,Resto,Ville,Id,Date,latitude,longitude,coords
0,63 AV MOZART,75016,Un accueil hyper chaleureux! Les gérants sont ...,30/04/2016,5,TripAdvisor,macis cafe,Paris,0,20160430.0,48.854056,2.269267,"48.8540565,2.2692668"
1,63 AV MOZART,75016,"Nous cherchions à déjeuner, seul bémol, l'heur...",21/04/2016,4,TripAdvisor,macis cafe,Paris,1,20160421.0,48.854056,2.269267,"48.8540565,2.2692668"
2,63 AV MOZART,75016,Des plats réalisés à partir de produits frais ...,10/02/2016,3,TripAdvisor,macis cafe,Paris,2,20160210.0,48.854056,2.269267,"48.8540565,2.2692668"
3,90 Rue des Orteaux,75020,Restaurant Sushi plutot correct dans l ensembl...,27/02/2016,3,TripAdvisor,sushi tomi,Paris,3,20160227.0,48.855704,2.407683,"48.8557043,2.4076831"
4,90 Rue des Orteaux,75020,Déçue de ma dernière visite car impossible de ...,17/01/2016,3,TripAdvisor,sushi tomi,Paris,4,20160117.0,48.855704,2.407683,"48.8557043,2.4076831"


In [5]:
inspections_file =  "./data/inspections-restaurants_pp"
inspections = pd.read_csv(inspections_file + ".csv")

In [6]:
inspections.head(5)

Unnamed: 0,Nom,SIRET,Adresse,Code postal,Localité,Date inspection,Note globale,Fermeture,Note,Evaluation,Coordonnées géographiques,Date extraction
0,thesteackfrites,79157517800029,10 RUE GOMBOUST,75001,PARIS 1,2015-06-22,C,0,note D,20,"48.867387155, 2.332735426",2015-01-07
1,umami,80041038300023,7 RUE DU VINGT NEUF JUILLET,75001,PARIS 1,2015-04-28,C,0,note D,20,"48.863195893, 2.336133862",2015-01-07
2,s&h,80320458500019,9 RUE DES PETITS CHAMPS,75001,PARIS 1,2015-04-24,B,0,note B,10,"48.866272815, 2.338753853",2015-01-07
3,societe de gerance des francs bourgeoi,80497595100018,12 RUE COQUILLIERE,75001,PARIS,2015-04-07,C,0,note D,20,"48.863546941, 2.343246508",2015-01-07
4,berthelot jean marc,40845066600016,22 RUE DES CAPUCINES,75002,PARIS 2,2015-05-07,C,0,note D,20,"48.869444864, 2.328647621",2015-01-07


In [7]:
# Split coordinate sinto latitude and longitude
inspections = pd.concat([inspections, pd.DataFrame(inspections["Coordonnées géographiques"].str.split(',',1).tolist(),
                           columns = ['latitude','longitude'])], axis=1)
inspections['longitude'] = inspections['longitude'].astype(float)
inspections['latitude'] = inspections['latitude'].astype(float)

In [8]:
inspections.head(5)

Unnamed: 0,Nom,SIRET,Adresse,Code postal,Localité,Date inspection,Note globale,Fermeture,Note,Evaluation,Coordonnées géographiques,Date extraction,latitude,longitude
0,thesteackfrites,79157517800029,10 RUE GOMBOUST,75001,PARIS 1,2015-06-22,C,0,note D,20,"48.867387155, 2.332735426",2015-01-07,48.867387,2.332735
1,umami,80041038300023,7 RUE DU VINGT NEUF JUILLET,75001,PARIS 1,2015-04-28,C,0,note D,20,"48.863195893, 2.336133862",2015-01-07,48.863196,2.336134
2,s&h,80320458500019,9 RUE DES PETITS CHAMPS,75001,PARIS 1,2015-04-24,B,0,note B,10,"48.866272815, 2.338753853",2015-01-07,48.866273,2.338754
3,societe de gerance des francs bourgeoi,80497595100018,12 RUE COQUILLIERE,75001,PARIS,2015-04-07,C,0,note D,20,"48.863546941, 2.343246508",2015-01-07,48.863547,2.343247
4,berthelot jean marc,40845066600016,22 RUE DES CAPUCINES,75002,PARIS 2,2015-05-07,C,0,note D,20,"48.869444864, 2.328647621",2015-01-07,48.869445,2.328648


In [9]:
# Initialize merged DataFrame
merged = pd.DataFrame(columns=list(reviews.columns.values))

In [10]:
idx = 0
# Loop through inspected restaurant
for (nom, lat, lng), grp in inspections.groupby(["Nom", "latitude", "longitude"], as_index=False):
    if idx < 6:
        print(nom, lat, lng)
        print(grp["Note globale"])
    idx2 = 0
    # Loop thorugh reviews
    for index, row in reviews.iterrows():
        review_loc = (row["latitude"], row["longitude"])
        inspection_loc = (lat, lng)
        # Check name-location match
        if (fuzz.ratio(row["Resto"], nom) >= 90) & (vincenty(review_loc, inspection_loc).meters < 40):
            print(type(grp["Note globale"].values[0]))   # TODO This is a workaround, we need to support multiple notes
            healthy_note_num = health_note_to_num(grp["Note globale"].values[0])
            merged_entry = row.append(pd.Series(healthy_note_num))
            merged = merged.append(merged_entry, ignore_index=True)
            print("Match!!!\n", row["Resto"], nom)
#             print(merged_entry)
        idx2 += 1
    idx += 1
print(idx)

2 b c 48.862613813 2.27656728
626    C
Name: Note globale, dtype: object
2 dea 48.840816512 2.344770086
51    C
Name: Note globale, dtype: object
383 48.855692978 2.362831585
335    C
Name: Note globale, dtype: object
39 champs elysees 48.870173596 2.306268651
66    B
Name: Note globale, dtype: object
850 48.853680253 2.349358924
245    C
Name: Note globale, dtype: object
a frog at the opera 48.868804676 2.329548017
128    C
Name: Note globale, dtype: object
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez je



Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<class 'str'>
Match!!!
 chez jenny chez jenny
<c

In [11]:
# Merged data frame
merged_cols = list(reviews.columns)
merged_cols.append("healthy_note")
print(merged_cols)

merged.columns = merged_cols
merged

['Adresse', 'Code Postal', 'Commentaire', 'Date du commentaire', 'Note', 'Origine', 'Resto', 'Ville', 'Id', 'Date', 'latitude', 'longitude', 'coords', 'healthy_note']


Unnamed: 0,Adresse,Code Postal,Commentaire,Date du commentaire,Note,Origine,Resto,Ville,Id,Date,latitude,longitude,coords,healthy_note
0,"39,boulevard du Temple",75003,"Une visite Chez Jenny a Paris, s'impose. Pour ...",03/05/2016,5.0,TripAdvisor,chez jenny,Paris,542.0,20160503.0,48.866212,2.364562,"48.866212,2.3645617",1.0
1,"39,boulevard du Temple",75003,Accueil et vestiaire très aimable. Si les plat...,02/05/2016,3.0,TripAdvisor,chez jenny,Paris,543.0,20160502.0,48.866212,2.364562,"48.866212,2.3645617",1.0
2,"39,boulevard du Temple",75003,"La spécialité : la choucroute, bien sûr ! si v...",28/04/2016,5.0,TripAdvisor,chez jenny,Paris,544.0,20160428.0,48.866212,2.364562,"48.866212,2.3645617",1.0
3,"39,boulevard du Temple",75003,Nous étions partis pour faire un restaurant pl...,28/04/2016,5.0,TripAdvisor,chez jenny,Paris,545.0,20160428.0,48.866212,2.364562,"48.866212,2.3645617",1.0
4,"39,boulevard du Temple",75003,Bonnes huitres Pas de praires Escargots moyens...,25/04/2016,3.0,TripAdvisor,chez jenny,Paris,546.0,20160425.0,48.866212,2.364562,"48.866212,2.3645617",1.0
5,"39,boulevard du Temple",75003,Jai mangé très bien en plus les prix sont abor...,24/04/2016,5.0,TripAdvisor,chez jenny,Paris,547.0,20160424.0,48.866212,2.364562,"48.866212,2.3645617",1.0
6,"39,boulevard du Temple",75003,Accueil très pro ! Un peu d'attente mais cadre...,23/04/2016,4.0,TripAdvisor,chez jenny,Paris,548.0,20160423.0,48.866212,2.364562,"48.866212,2.3645617",1.0
7,"39,boulevard du Temple",75003,J'ai diné Chez Jenny juste avant d'aller au th...,22/04/2016,5.0,TripAdvisor,chez jenny,Paris,549.0,20160422.0,48.866212,2.364562,"48.866212,2.3645617",1.0
8,"39,boulevard du Temple",75003,Nous avions réservé dans cette brasserie afin ...,19/04/2016,4.0,TripAdvisor,chez jenny,Paris,550.0,20160419.0,48.866212,2.364562,"48.866212,2.3645617",1.0
9,"39,boulevard du Temple",75003,"C'est grand, il y a du monde, mais tout se pas...",17/04/2016,5.0,TripAdvisor,chez jenny,Paris,551.0,20160417.0,48.866212,2.364562,"48.866212,2.3645617",1.0


In [12]:
merged.shape

(72, 14)