In [3]:
# Setting up the workplace
import pandas as pd

# Importing functions with updated names
from functions import (
    description_cleaner, 
    vocabulary_creator, 
    reverse_index_creator, 
    compute_TF, 
    compute_IDF, 
    compute_TF_IDF,
    drop_down_menu,
    extract_facilities
)
# Importing engines
from engine import non_ranked_engine, ranked_engine, upgraded_ranked_engine
from jupyter_ui_poll import ui_events
import time
# Load the restaurant dataset: a tab-separated file whose first row holds the column names
restaurants_df = pd.read_csv(
    "restaurants_i.tsv",
    sep="\t",
    header=0,
)

### ***Main preprocessing***

In [7]:
# Pull every restaurant description out of the DataFrame as a plain Python list
restournats_descriptions = restaurants_df["description"].tolist()

# Clean and preprocess the raw descriptions with the project's description_cleaner helper
parsed_descriptions = description_cleaner(restournats_descriptions)

# vocabulary_creator hands back two things: the descriptions re-encoded as lists of IDs,
# and the vocabulary dictionary used for that encoding
ID_descritpion, vocabulary = vocabulary_creator(parsed_descriptions)

# Build the reverse index from the ID-encoded descriptions
reverse_index = reverse_index_creator(ID_descritpion)

# Persist the vocabulary (word-to-ID mapping) as a header-less CSV for future use
pd.Series(vocabulary).to_csv(
    "vocabulary.csv",
    index=True,
    encoding="utf-8",
    header=False,
)


### ***Non-ranked search engine***

In [3]:
# Example query for testing the non-ranked search engine
sample_input = "modern seasonal cuisine"
top_k_to_print = 5

# `done` drives the loop below: the search repeats until the engine's return value is truthy.
# Looping is only useful for custom searches typed in by the user.
done = False
while not done:
    # Uncomment the next line to enable user input
    # sample_input = input("what do you want to eat?\n")

    # Run the non-ranked engine with the query, the restaurant DataFrame,
    # the vocabulary and the reverse index
    done = non_ranked_engine(
        sample_input, restaurants_df, vocabulary, reverse_index, top_k_to_print
    )

We found 41 matches!

╭─────────────────────────┬─────────────────────────┬───────────────────────────┬───────────────────────────╮
│ Restaurant Name         │ Address                 │ Description               │ Website                   │
├─────────────────────────┼─────────────────────────┼───────────────────────────┼───────────────────────────┤
│ Mima                    │ via Madonnelle 9        │ You’ll be won over by the │ http://www.domo20.com/res │
│                         │                         │ seasonal Mediterranea...  │ taurant                   │
├─────────────────────────┼─────────────────────────┼───────────────────────────┼───────────────────────────┤
│ Materia | Spazio Cucina │ via Teatro Massimo 29   │ The entrance to this      │ https://www.materiaspazio │
│                         │                         │ restaurant is typical of  │ cucina.it/                │
│                         │                         │ a...                      │                 

### ***Computing TF-IDF***

In [9]:
# Term-frequency data computed from the ID-encoded descriptions
TF_by_restournats = compute_TF(ID_descritpion)
# Number of descriptions in the collection, handed to compute_IDF as the document count
total_documents = len(ID_descritpion)
# IDF data computed from the reverse index and the document count
IDF_by_words = compute_IDF(reverse_index, total_documents)

# Combine the TF and IDF results; the ranked engines below receive this object as their index
reverse_index_tf_idf = compute_TF_IDF(TF_by_restournats, IDF_by_words)


### ***Ranked search engine***

In [10]:
# Example query for testing the ranked search engine
sample_input = "modern seasonal cuisine"
top_k_to_print = 3

# `done` controls the loop; looping is only useful for custom searches typed in by the user
done = False
while not done:
    # Uncomment the next line to enable user input
    # sample_input = input("what do you want to eat?\n")

    # Run the ranked engine against the TF-IDF index and keep what it returns
    top_rest = ranked_engine(
        sample_input, restaurants_df, vocabulary, reverse_index_tf_idf, IDF_by_words, top_k_to_print
    )
    # A single pass is enough for the fixed sample query
    done = True



We found 1396 matches!

╭─────────────────────┬──────────────────────┬───────────────────────────┬───────────────────────────┬──────────╮
│ Restaurant Name     │ Address              │ Description               │ Website                   │   Cosine │
├─────────────────────┼──────────────────────┼───────────────────────────┼───────────────────────────┼──────────┤
│ Fracia              │ località Fracia      │ Park the car and walk up  │ https://www.ristorantefra │ 0.824838 │
│                     │                      │ a short track (about 5... │ cia.it/                   │          │
├─────────────────────┼──────────────────────┼───────────────────────────┼───────────────────────────┼──────────┤
│ Amo Bistrot         │ Vicoletto Due Mori 5 │ Traditional features such │ http://www.amobistrot.it  │ 0.824838 │
│                     │                      │ as exposed stone wall...  │                           │          │
├─────────────────────┼──────────────────────┼──────────────────

### ***Verifying the cosine similarity computation***

To verify that the cosine similarity is computed correctly, we can query the engine with the full description of one restaurant: that restaurant should come back with a cosine similarity of 1, well ahead of the second-place result.

##### Description of the restaurant L'Acciuga:
Anchovies, king prawns, cuttlefish and freshly caught fish are just some of the options available in this restaurant specialising in fish and seafood, where guests will be delighted by the fresh flavours of the ingredients. The small wine list, which also includes a few non-Italian labels, offers good value for money. The maritime-style decor evokes the interior of an old ship, while the warm welcome is typical of the region. All in all, an excellent choice!

In [6]:
# Use a real restaurant description as the query for the ranked search engine.
# Two restaurants are named "L'Acciuga", so .iloc[0] explicitly picks the first one.
sample_input = restaurants_df.loc[
    restaurants_df["restaurantName"] == "L'Acciuga", "description"
].iloc[0]

# Run the ranked engine once (no loop is needed for a fixed query) with a top-3 limit
top_k_to_print = 3
done = ranked_engine(
    sample_input, restaurants_df, vocabulary, reverse_index_tf_idf, IDF_by_words, top_k_to_print
)

╭───────────────────┬───────────────────────────┬───────────────────────────┬───────────────────────────┬──────────╮
│ Restaurant Name   │ Address                   │ Description               │ Website                   │   Cosine │
├───────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┼──────────┤
│ L'Acciuga         │ viale Francesco Baracca   │ Anchovies, king prawns,   │ https://lacciugaosteria.e │ 1        │
│                   │ 74                        │ cuttlefish and freshly    │ atbu.com/?lang=it         │          │
│                   │                           │ ...                       │                           │          │
├───────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┼──────────┤
│ Cetaria           │ piazza della Repubblica 9 │ This beautiful restaurant │ https://www.cetariaristor │ 0.400065 │
│                   │                           │ inland from Sa

### ***Upgraded ranked engine***

In [7]:

# Run extract_facilities over the facilities/services column and the cuisine-type column
# (presumably these become the choices offered by the search form -- confirm in engine.py)
facilities = extract_facilities(restaurants_df["facilitiesServices"])
cusine_types = extract_facilities(restaurants_df["cuisineType"])

# Launch the upgraded engine; it returns two values, unpacked here
top_result, restoraunt_cost = upgraded_ranked_engine(
    facilities,
    cusine_types,
    vocabulary,
    IDF_by_words,
    reverse_index_tf_idf,
    restaurants_df,
)

# Show the second returned value as the cell output
restoraunt_cost

Text(value='modern seasonal cuisine', description='what do you want to eat?', placeholder='type something', st…

BoundedIntText(value=5, description='How many restourants to display?', layout=Layout(width='45%'), style=Desc…

SelectionRangeSlider(description='Choose a price range:', index=(0, 3), layout=Layout(width='45%'), options=((…

Dropdown(description='Choose the cusine specialty: ', options=("don't care", 'calabrian', 'country cooking', '…

VBox(children=(HBox(children=(Checkbox(value=False, description="don't care", layout=Layout(max_width='auto', …

Button(description='Serch for some restournats', layout=Layout(width='250px'), style=ButtonStyle())

╭───────────────────┬──────────────────────────┬───────────────────────────┬───────────────────────────┬──────────╮
│ Restaurant Name   │ Address                  │ Description               │ Website                   │   Cosine │
├───────────────────┼──────────────────────────┼───────────────────────────┼───────────────────────────┼──────────┤
│ Ca' Del Moro      │ località Erbin 31        │ Situated within the La    │ https://www.cadelmoro.win │  1.4     │
│                   │                          │ Collina dei Ciliegi       │ e/it                      │          │
│                   │                          │ wine...                   │                           │          │
├───────────────────┼──────────────────────────┼───────────────────────────┼───────────────────────────┼──────────┤
│ La Bandiera       │ contrada Pastini 4       │ Although it takes a while │ https://www.labandiera.it │  1.38629 │
│                   │                          │ to reach this restaur..

['€€€', '€€€', '€€', '€€€', '€€€', '€€€€', '€€€', '€€€€']

### ***4. Visualizing the Most Relevant Restaurants***


In [60]:
import pandas as pd
import plotly.express as px
from geocode_restaurants import get_region_and_coordinates

# Notebook-wide display settings: never hide columns, and render floats with three decimals
pd.options.display.max_columns = None
pd.options.display.float_format = "{:.3f}".format

In [61]:

# Reload the dataset and trim surrounding whitespace from the city names,
# which are used as the merge key further down
restaurants_df = (
    pd.read_csv("restaurants_i.tsv", sep="\t", header=0)
    .assign(city=lambda frame: frame["city"].str.strip())
)

# Keep only the columns needed for the map
data_restaurants = restaurants_df[["restaurantName", "city", "priceRange"]]


### ***Geocode Locations***
We used the Google Geocoding API to find the region, latitude and longitude of each city.
We wrote a Python file that sends requests to the API, each containing the name of an Italian city.

Structure of the query:
https://maps.googleapis.com/maps/api/geocode/json?address={city},Italy&key={api_key}

The API key is stored in a `.env` file, which must not be uploaded to GitHub (to save my money: the requests cost about $15, but I had free credits).

```bash
geocode_restaurants.py
```



In [43]:
def _geocode_city(city_name):
    """Return one record holding the city's name, region, latitude and longitude."""
    region, latitude, longitude = get_region_and_coordinates(city_name)
    return {
        "city": city_name,
        "region": region,
        "latitude": latitude,
        "longitude": longitude,
    }


# Distinct city names found in the restaurant data
cities = data_restaurants["city"].unique()

# One lookup per distinct city.
# NOTE(review): every run of this cell repeats the lookup for all cities -- confirm whether
# get_region_and_coordinates caches its results before re-running.
city_region_coords_list = [_geocode_city(city_name) for city_name in cities]
city_region_coords_df = pd.DataFrame(city_region_coords_list)

# Save to a tab-separated file (without the index column)
city_region_coords_df.to_csv("italy_cities_regions.tsv", index=False, encoding="utf-8", sep="\t")
print(city_region_coords_df)


                city     region  latitude  longitude
0        Ventimiglia    Liguria    43.791      7.608
1         Acqualagna     Marche    43.619     12.673
2             Teglio   Lombardy    46.172     10.067
3            Suzzara  Lombardia    44.991     10.745
4             Verona     Veneto    45.438     10.992
...              ...        ...       ...        ...
1201        L'Aquila    Abruzzo    42.351     13.400
1202        Altamura     Apulia    40.824     16.553
1203          Robbio   Lombardy    45.289      8.594
1204  Isola Dovarese   Lombardy    45.175     10.312
1205          Cecina    Tuscany    43.309     10.519

[1206 rows x 4 columns]


In [63]:
# Reload the geocoded cities from the tab-separated file and display them
cities_region = pd.read_csv(
    "italy_cities_regions.tsv",
    sep="\t",
    header=0,
)
cities_region

Unnamed: 0,city,region,latitude,longitude
0,Ventimiglia,Liguria,43.791,7.608
1,Acqualagna,Marche,43.619,12.673
2,Teglio,Lombardy,46.172,10.067
3,Suzzara,Lombardia,44.991,10.745
4,Verona,Veneto,45.438,10.992
...,...,...,...,...
1201,L'Aquila,Abruzzo,42.351,13.400
1202,Altamura,Apulia,40.824,16.553
1203,Robbio,Lombardy,45.289,8.594
1204,Isola Dovarese,Lombardy,45.175,10.312


In [62]:
# Display the restaurant name / city / price-range table selected above
data_restaurants

Unnamed: 0,restaurantName,city,priceRange
0,Il Giardino del Gusto,Ventimiglia,€€€
1,Anticofurlo,Acqualagna,€€
2,Fracia,Teglio,€
3,Mangiare Bere Uomo Donna,Suzzara,€
4,Amo Bistrot,Verona,€€
...,...,...,...
1977,uovodiseppia Milano,Milan,€€€
1978,Anto e Robi,Robbio,€€
1979,Caffè La Crepa,Isola Dovarese,€€
1980,Il Doretto,Cecina,€€


In [39]:

# Attach region and coordinates to each restaurant by city name (left join), then discard
# every row that still has a missing value in any column
merged_restaurant_city = (
    data_restaurants
    .merge(cities_region, on="city", how="left")
    .dropna()
)


### ***Encoding Price Ranges***


In [64]:
def _price_span(prices):
    """Summarise one city's price tiers as a single label.

    Tiers are ordered by string length, i.e. by the number of '€' symbols.
    Returns the lone tier (e.g. '€€') when every restaurant shares it, the
    span 'cheapest-priciest' (e.g. '€-€€€€') otherwise, and '' when the city
    has no known tier.
    """
    tiers = sorted(prices.dropna().astype(str).unique(), key=len)
    if not tiers:
        return ""
    cheapest, priciest = tiers[0], tiers[-1]
    return cheapest if cheapest == priciest else f"{cheapest}-{priciest}"


# Group by city to count the restaurants and summarise their price ranges.
# A city can host restaurants with different priceRange values, so the new
# price_ranges column is derived from priceRange as a min-max label. The merged
# frame built above has no 'price_range' column, so aggregating one raised a KeyError.
restaurant_count_by_city = (
    merged_restaurant_city
    .groupby("city")
    .agg(
        restaurant_count=("restaurantName", "count"),
        price_ranges=("priceRange", _price_span),
        latitude=("latitude", "first"),
        longitude=("longitude", "first"),
    )
    .reset_index()
    # Sort by descending number of restaurants
    .sort_values(by="restaurant_count", ascending=False)
    .reset_index(drop=True)
)

restaurant_count_by_city


Unnamed: 0,city,restaurant_count,price_ranges,latitude,longitude
0,Milan,87,€-€€€€,45.469,9.182
1,Rome,62,€-€€€€,41.897,12.482
2,Florence,32,€-€€€€,43.770,11.258
3,Turin,32,€-€€€€,45.070,7.687
4,Naples,27,€-€€€€,40.852,14.268
...,...,...,...,...,...
1175,sestiere Cannaregio 3628,1,€€,45.444,12.335
1176,sestiere Castello 3303,1,€€€€,45.437,12.341
1177,sestiere Castello 3499,1,€€€,45.437,12.341
1178,sestiere Castello 3886,1,€€€,45.437,12.341


### ***Map Setup***


In [68]:
# Scale the restaurant counts by 50 to obtain the bubble-size column used on the map
restaurant_count_by_city["adjusted_restaurant_count"] = (
    restaurant_count_by_city["restaurant_count"] * 50
)

# One bubble per city: size encodes the scaled restaurant count, colour the price range
fig = px.scatter_mapbox(
    data_frame=restaurant_count_by_city,
    lat="latitude",
    lon="longitude",
    size="adjusted_restaurant_count",
    size_max=100,
    color="price_ranges",
    hover_name="city",
    hover_data=dict(restaurant_count=True, price_ranges=True),
    title="Distribution of restaurants by city and their price ranges",
    zoom=5,
    height=600,
)

# OpenStreetMap tiles, with a top margin left for the title
fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(r=0, t=50, l=0, b=0),
)

fig.show()


### ***Plot Top-K Restaurants:***
