In [1]:
import pandas as pd
import numpy as np
import googlemaps

# Restaurant inspections data set

In [2]:
inspections_file = "./data/inspections-restaurants"
inspections = pd.read_csv(inspections_file + ".csv", delimiter=";")

In [3]:
inspections.head(5)

Unnamed: 0,Nom,Enseigne,Raison sociale,SIRET,Classe atelier,Adresse,Code postal,Localité,Code commune,Libellé commune,...,Note globale,Fermeture,MED,Coordonnées géographiques,Libellé établissement,Date extraction,Note,Evaluation,departement,est_75_ou_84
0,THESTEACKFRITES,CAFE DES ABATTOIRS,THESTEACKFRITES,79157517800029,Restauration commerciale rapide - Cuisine,10 RUE GOMBOUST,75001,PARIS 1,75101,PARIS 1ER,...,C,0,0,"48.867387155, 2.332735426",SAS THESTEACKFRITES,2015-01-07,note D,20,75,oui
1,UMAMI,UMAMI,UMAMI,80041038300023,Restauration commerciale traditionnelle - Cuisine,7 RUE DU VINGT NEUF JUILLET,75001,PARIS 1,75101,PARIS 1ER,...,C,0,0,"48.863195893, 2.336133862",UMAMI SAS,2015-01-07,note D,20,75,oui
2,S&H,L'AVANT PREMIERE,S&H,80320458500019,Restauration commerciale traditionnelle - Cuisine,9 RUE DES PETITS CHAMPS,75001,PARIS 1,75101,PARIS 1ER,...,B,0,0,"48.866272815, 2.338753853",SAS S&H,2015-01-07,note B,10,75,oui
3,SOCIETE DE GERANCE DES FRANCS BOURGEOI,DONATO,SOCIETE DE GERANCE DES FRANCS BOURGEOI,80497595100018,Restauration commerciale traditionnelle - Cuisine,12 RUE COQUILLIERE,75001,PARIS,75101,PARIS 1ER,...,C,0,0,"48.863546941, 2.343246508",SGFB,2015-01-07,note D,20,75,oui
4,BERTHELOT JEAN MARC,BISTROT CAPUCINES,BERTHELOT JEAN MARC,40845066600016,Restauration commerciale traditionnelle - Cuisine,22 RUE DES CAPUCINES,75002,PARIS 2,75102,PARIS 2EME,...,C,0,0,"48.869444864, 2.328647621",BERTHELOT JEAN-MARC,2015-01-07,note D,20,75,oui


In [4]:
inspections.columns

Index(['Nom', 'Enseigne', 'Raison sociale', 'SIRET', 'Classe atelier',
       'Adresse', 'Code postal', 'Localité', 'Code commune', 'Libellé commune',
       'Date inspection', 'Note globale', 'Fermeture', 'MED',
       'Coordonnées géographiques', 'Libellé établissement', 'Date extraction',
       'Note', 'Evaluation', 'departement', 'est_75_ou_84'],
      dtype='object')

In [5]:
inspections["Localité"].unique()

array(['PARIS 1', 'PARIS', 'PARIS 2', 'PARIS 3', 'PARIS 4', 'PARIS 5',
       'PARIS 6', 'PARIS 7', 'PARIS 8', 'PARIS 9', 'PARIS 10', 'PARIS 11',
       'PARIS 12', 'PARIS 17', 'PARIS 13', 'PARIS 14', 'PARIS 15',
       'PARIS 18', 'PARIS 19', 'PARIS 20', 'AVIGNON', 'CAVAILLON',
       'MONTFAVET'], dtype=object)

In [6]:
# Only keeep Paris data
inspections = inspections[inspections["Localité"].str.contains("PARIS")]
inspections["Localité"].unique()

array(['PARIS 1', 'PARIS', 'PARIS 2', 'PARIS 3', 'PARIS 4', 'PARIS 5',
       'PARIS 6', 'PARIS 7', 'PARIS 8', 'PARIS 9', 'PARIS 10', 'PARIS 11',
       'PARIS 12', 'PARIS 17', 'PARIS 13', 'PARIS 14', 'PARIS 15',
       'PARIS 18', 'PARIS 19', 'PARIS 20'], dtype=object)

In [7]:
inspections.shape

(843, 21)

In [8]:
# Keep relevant data
keep_col = ['Nom', 'SIRET', 'Adresse', 'Code postal', 'Localité', 
            'Date inspection', 'Note globale', 'Fermeture',  'Note', 'Evaluation',
            'Coordonnées géographiques', 'Date extraction']
inspections = inspections[keep_col]

In [9]:
inspections.head(5)

Unnamed: 0,Nom,SIRET,Adresse,Code postal,Localité,Date inspection,Note globale,Fermeture,Note,Evaluation,Coordonnées géographiques,Date extraction
0,THESTEACKFRITES,79157517800029,10 RUE GOMBOUST,75001,PARIS 1,2015-06-22,C,0,note D,20,"48.867387155, 2.332735426",2015-01-07
1,UMAMI,80041038300023,7 RUE DU VINGT NEUF JUILLET,75001,PARIS 1,2015-04-28,C,0,note D,20,"48.863195893, 2.336133862",2015-01-07
2,S&H,80320458500019,9 RUE DES PETITS CHAMPS,75001,PARIS 1,2015-04-24,B,0,note B,10,"48.866272815, 2.338753853",2015-01-07
3,SOCIETE DE GERANCE DES FRANCS BOURGEOI,80497595100018,12 RUE COQUILLIERE,75001,PARIS,2015-04-07,C,0,note D,20,"48.863546941, 2.343246508",2015-01-07
4,BERTHELOT JEAN MARC,40845066600016,22 RUE DES CAPUCINES,75002,PARIS 2,2015-05-07,C,0,note D,20,"48.869444864, 2.328647621",2015-01-07


In [10]:
# Split coordinates into latitude and longitude
inspections = pd.concat([inspections, pd.DataFrame(inspections["Coordonnées géographiques"].str.split(',',1).tolist(),
                           columns = ['latitude','longitude'])], axis=1)
inspections['longitude'] = inspections['longitude'].astype(float)
inspections['latitude'] = inspections['latitude'].astype(float)
inspections.head(5)

Unnamed: 0,Nom,SIRET,Adresse,Code postal,Localité,Date inspection,Note globale,Fermeture,Note,Evaluation,Coordonnées géographiques,Date extraction,latitude,longitude
0,THESTEACKFRITES,79157520000000.0,10 RUE GOMBOUST,75001.0,PARIS 1,2015-06-22,C,0.0,note D,20.0,"48.867387155, 2.332735426",2015-01-07,48.867387,2.332735
1,UMAMI,80041040000000.0,7 RUE DU VINGT NEUF JUILLET,75001.0,PARIS 1,2015-04-28,C,0.0,note D,20.0,"48.863195893, 2.336133862",2015-01-07,48.863196,2.336134
2,S&H,80320460000000.0,9 RUE DES PETITS CHAMPS,75001.0,PARIS 1,2015-04-24,B,0.0,note B,10.0,"48.866272815, 2.338753853",2015-01-07,48.866273,2.338754
3,SOCIETE DE GERANCE DES FRANCS BOURGEOI,80497600000000.0,12 RUE COQUILLIERE,75001.0,PARIS,2015-04-07,C,0.0,note D,20.0,"48.863546941, 2.343246508",2015-01-07,48.863547,2.343247
4,BERTHELOT JEAN MARC,40845070000000.0,22 RUE DES CAPUCINES,75002.0,PARIS 2,2015-05-07,C,0.0,note D,20.0,"48.869444864, 2.328647621",2015-01-07,48.869445,2.328648


In [11]:
inspections.to_csv(inspections_file + "_pp.csv", index=False)

# Reviews data set

In [2]:
reviews_file = "./data/base_DFG_note_feat"
reviews = pd.read_csv(reviews_file + ".csv")

In [3]:
reviews.head(5)

Unnamed: 0,Adresse,Code Postal,Resto,Ville,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,...,sept,sourd,tomb,traversent,ventr,vom,écrev,épic,éton,rev_cnt
0,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,10.0
1,"128, rue du Faubourg Saint Martin",75010,0039 ristorante italiano,Paris,3.222222,4.846111,2.17284,0.020053,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
2,60 rue Albert,75013,015 gang nam,Paris,4.333333,4.844444,0.222222,0.010617,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,161 Avenue D'Italie,75013,1 pot,Paris,4.0,4.916667,0.666667,0.000278,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,10.0


In [4]:
reviews["Ville"].unique()

array(['Paris'], dtype=object)

In [5]:
# Unique restaurant names in the data set
unique_restaurants = reviews["Resto"].unique()
# Number of restaurant in the data set
np.size(unique_restaurants)

9022

In [6]:
# Add latitude and longitude columns
reviews["latitude"] = 0.0
reviews["longitude"] = 0.0

In [7]:
reviews.head(5)

Unnamed: 0,Adresse,Code Postal,Resto,Ville,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,...,tomb,traversent,ventr,vom,écrev,épic,éton,rev_cnt,latitude,longitude
0,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,1.0,10.0,0.0,0.0
1,"128, rue du Faubourg Saint Martin",75010,0039 ristorante italiano,Paris,3.222222,4.846111,2.17284,0.020053,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0
2,60 rue Albert,75013,015 gang nam,Paris,4.333333,4.844444,0.222222,0.010617,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
3,161 Avenue D'Italie,75013,1 pot,Paris,4.0,4.916667,0.666667,0.000278,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
4,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,1.0,10.0,0.0,0.0


In [34]:
idx = 0
gmaps = googlemaps.Client(key='AIzaSyBiM5WGsHE0DI4eJGluhWQYPgi5m68xAe8')
for (adresse, cp), grp in reviews.groupby(["Adresse", "Code Postal"]):
    query = adresse + ", " + cp + ", " + "Paris, France"
    if (idx >= 9213):
        #print(query)
        #print(reviews.loc[(reviews["Adresse"]==adresse) & (reviews["Code Postal"]==cp),["longitude"]])
        # Geocoding an address
        geocode_result = gmaps.geocode(query)
        reviews.loc[(reviews["Adresse"]==adresse) & (reviews["Code Postal"]==cp),["latitude"]] = geocode_result[0]['geometry']['location']['lat']
        reviews.loc[(reviews["Adresse"]==adresse) & (reviews["Code Postal"]==cp),["longitude"]] = geocode_result[0]['geometry']['location']['lng']
    idx += 1

In [35]:
print(idx)
# Add coords column
reviews["coords"] = reviews["latitude"].map(str) + "," + reviews["longitude"].map(str)
reviews.head(5)

9518


Unnamed: 0,Adresse,Code Postal,Resto,Ville,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,...,traversent,ventr,vom,écrev,épic,éton,rev_cnt,latitude,longitude,coords
0,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,1.0,10.0,48.837656,2.355498,"48.8376561,2.3554979"
1,"128, rue du Faubourg Saint Martin",75010,0039 ristorante italiano,Paris,3.222222,4.846111,2.17284,0.020053,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,48.874388,2.358907,"48.8743883,2.3589068"
2,60 rue Albert,75013,015 gang nam,Paris,4.333333,4.844444,0.222222,0.010617,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,48.826235,2.371878,"48.82623539999999,2.371877500000001"
3,161 Avenue D'Italie,75013,1 pot,Paris,4.0,4.916667,0.666667,0.000278,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,48.81953,2.359702,"48.8195302,2.3597021"
4,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,1.0,10.0,48.837656,2.355498,"48.8376561,2.3554979"


In [36]:
reviews.to_csv(reviews_file + "_pp_loc.csv", index=False, sep=";")