In [52]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np
import random
from transformers import BertTokenizer,TFBertModel,TFBertForSequenceClassification
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model

## Loading data to train the model for restaurent recommendation system.

In [4]:
df = pd.read_csv('data/restaurants_data_analysis.csv')
df = df.dropna(subset=['latitude', 'longitude', 'main_cuisine', 'budget'])
df.info()

  df = pd.read_csv('data/restaurants_data_analysis.csv')


<class 'pandas.core.frame.DataFrame'>
Index: 250329 entries, 8 to 267377
Data columns (total 23 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   budget                      250329 non-null  int64  
 1   is_new_until                248498 non-null  object 
 2   latitude                    250329 non-null  float64
 3   longitude                   250329 non-null  float64
 4   minimum_delivery_time       250329 non-null  int64  
 5   minimum_order_amount        250329 non-null  int64  
 6   minimum_pickup_time         250329 non-null  int64  
 7   name                        250329 non-null  object 
 8   post_code                   250317 non-null  object 
 9   rating                      250329 non-null  float64
 10  review_number               250329 non-null  int64  
 11  review_with_comment_number  250329 non-null  int64  
 12  vertical                    250328 non-null  object 
 13  vertical_parent    

## Making and training the model for restaurent recommendation

In [3]:
# Step 1: Encode the target labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(df['main_cuisine'])
with open('labelEncoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Step 2: One-hot encode the target labels
onehot_encoder = OneHotEncoder(sparse=False)
y_train_onehot = onehot_encoder.fit_transform(y_train_encoded.reshape(-1, 1))

# Step 3: Tokenize and vectorize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['main_cuisine'])
sequences = tokenizer.texts_to_sequences(df['main_cuisine'])
input_data = pad_sequences(sequences)

# Define the model architecture
embedding_dim = 50
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=input_data.shape[1]),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(y_train_onehot.shape[1], activation='softmax')  # Output layer with softmax activation
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(input_data, y_train_onehot, epochs=10, batch_size=32)

# Save the trained model to a file
model.save('restaurant_recommendation_model.h5')



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Testing if the model is working or not

In [19]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the CSV data
df = pd.read_csv('data/restaurants_data_analysis.csv')
df = df.dropna(subset=['latitude', 'longitude', 'main_cuisine', 'budget'])

model = tf.keras.models.load_model('restaurant_recommendation_model.h5')

# Load the label encoder used during training
# Replace 'your_label_encoder_path.pkl' with the actual file path of the saved label encoder.
import pickle
with open('labelEncoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

# Tokenize and vectorize the data (using the same tokenizer as before)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['main_cuisine'])

# Function to get multiple restaurant recommendations
def get_recommendations(location, cuisine_preference, budget_constraint, num_recommendations=5):
    user_input = cuisine_preference
    user_sequence = tokenizer.texts_to_sequences([user_input])
    user_data = pad_sequences(user_sequence, maxlen=3)  # Assuming the input sequence length is 22 (same as during training)

    # Predict the user's preferences using the loaded model
    user_preferences = np.array(model.predict(user_data))

    # Get the index of the predicted category with the highest probability
    predicted_category_index = tf.argmax(user_preferences, axis=1).numpy()[0]

    # Get the predicted category using the label encoder
    predicted_category = label_encoder.inverse_transform([predicted_category_index])[0]

    # Filter restaurants based on the predicted category
    filtered_restaurants = df[df['main_cuisine'] == predicted_category]

    filtered_restaurants_data = filtered_restaurants['main_cuisine'] + ' ' + filtered_restaurants['budget'].astype(str)
    filtered_sequence = tokenizer.texts_to_sequences(filtered_restaurants_data)
    filtered_data = pad_sequences(filtered_sequence, maxlen=3)
    filtered_predictions = model.predict(filtered_data)
    filtered_predictions = np.array(filtered_predictions)
    
    # Calculate cosine similarity
    cosine_sim = cosine_similarity(user_preferences, filtered_predictions)

    # Get the index of restaurants with highest similarity
    top_indices = cosine_sim.argsort()[0][-num_recommendations:][::-1]

    # Create a list to store the recommended restaurants and their Google Maps links
    recommended_restaurants = []

    for index in top_indices:
        restaurant_name = filtered_restaurants.iloc[index]['name']
        latitude = filtered_restaurants.iloc[index]['latitude']
        longitude = filtered_restaurants.iloc[index]['longitude']

        # Construct the Google Maps link
        google_maps_link = f"https://www.google.com/maps/search/?api=1&query={latitude},{longitude}"

        # Combine the restaurant name and Google Maps link
        restaurant_info = f"{restaurant_name} - {google_maps_link}"
        recommended_restaurants.append(restaurant_info)

    return recommended_restaurants

# Example Usage:
location = "Lahore"  # Replace with the user's location
cuisine_preference = "Fast Food"  # Replace with the user's cuisine preference
budget_constraint = 20  # Replace with the user's budget constraint

recommended_restaurants = get_recommendations(location, cuisine_preference, budget_constraint)
print(recommended_restaurants)

['R.F.C Biriyani - https://www.google.com/maps/search/?api=1&query=23.1773611,90.20375', 'Cheezy Bite - https://www.google.com/maps/search/?api=1&query=30.2400717,71.4924828', 'Foodlicious - https://www.google.com/maps/search/?api=1&query=31.4255232,73.0726706', 'Golden Bite - https://www.google.com/maps/search/?api=1&query=31.41584198,73.04084615', "Sariya's Sip N Bite - Gulberg - https://www.google.com/maps/search/?api=1&query=31.4214801,73.0631505"]


## Generating Data for NER Model

In [24]:
unique_cities = df['city'].unique()
unique_cuisines = df['main_cuisine'].unique()

# Phrases
city_phrases = [
    "I live in city",
    "I reside in city",
    "city is my location",
    "My location is city",
    "I'm located in city",
    "My current city is city",
    "From city here",
    "Hailing from city",
    "I am currently living in city",
    "city is my current location",
    "I'm in city",
    "city is my current city",
    "Living in city",
    "city is where I live",
    "city is where I reside",
    "In city right now",
    "Currently in city",
    "city is my place",
    "city is my home",
    "My home city is city"
]

cuisine_phrases = [
    "I love [cuisine] food",
    "I'm in the mood for [cuisine]",
    "I'm craving [cuisine]",
    "I would like [cuisine] cuisine",
    "How about [cuisine]",
    "I want to try [cuisine]",
    "Let's go for [cuisine]",
    "I enjoy [cuisine]",
    "My favorite is [cuisine]",
    "I'm interested in [cuisine]",
    "I'm looking for [cuisine]",
    "[cuisine] is my favorite"
]

money_phrases = [
    "I have [amount] to spend",
    "My budget is [amount]",
    "I can afford [amount]",
    "I'm willing to pay [amount]",
    "I'm ready to spend [amount]",
    "I want to spend around [amount]",
    "I have around [amount] for the meal",
    "I can spend up to [amount]",
    "I'm looking for something within [amount]",
    "I'm comfortable spending [amount]"
]

# Generate sentences with city names, cuisine names, and random money values
sentences = []
entities = []
outputs = []
for _ in range(10000):
    # Randomly choose a category: city, cuisine, or money
    category = random.choice(['City', 'Cuisine', 'Money'])

    if category == 'City':
        city = random.choice(unique_cities)
        phrase = random.choice(city_phrases).replace("city", city)
        entity = 'Location'
        output = city

    elif category == 'Cuisine':
        cuisine = random.choice(unique_cuisines)
        phrase = random.choice(cuisine_phrases).replace("[cuisine]", cuisine)
        entity = 'Cuisine'
        output = cuisine

    else:  # category == 'Money'
        amount = random.randint(10, 1000)  # Generate a random integer between 10 and 1000
        phrase = random.choice(money_phrases).replace("[amount]", str(amount))
        entity = 'Money'
        output = str(amount)

    sentences.append(phrase)
    entities.append(entity)
    outputs.append(output)

# Create a DataFrame with the generated data
data = pd.DataFrame({'Sentence': sentences, 'Entity': entities, 'Output': outputs})

# Save the DataFrame to a CSV file
data.to_csv('restaurant_recommendation_data_combined.csv', index=False)


## Making, training and saving NER Model

In [None]:

# Load the data
data = pd.read_csv('data/restaurant_recommendation_data.csv')

# Preprocess the data
data['Input'] = data['Sentence'] + ' [SEP] ' + data['Entity']
input_texts = data['Input'].values
labels, unique_labels = pd.factorize(data['Output'])
num_classes = len(unique_labels)

# Tokenize the input texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts)
sequences = tokenizer.texts_to_sequences(input_texts)
input_sequences = pad_sequences(sequences, padding='post')

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(input_sequences, tf.keras.utils.to_categorical(labels), test_size=0.2, random_state=42)

# Build the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=X_train.shape[1]),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=14, validation_data=(X_test, y_test))

# Save the trained model
model.save('saved_model.h5')


I trained this model on google colab and results were:
<div style="font-family: 'Courier New', Courier, monospace; font-size: 15px; font-weight: bold" >
Epoch 90/100<br>
572/572 [==============================] - 4s 7ms/step - loss: 0.0145 - accuracy: 0.9984 - val_loss: 0.5237 - val_accuracy: 0.9705<br>
Epoch 91/100<br>
572/572 [==============================] - 5s 9ms/step - loss: 0.0101 - accuracy: 0.9986 - val_loss: 0.5124 - val_accuracy: 0.9750<br>
Epoch 92/100<br>
572/572 [==============================] - 4s 7ms/step - loss: 0.0131 - accuracy: 0.9979 - val_loss: 0.5026 - val_accuracy: 0.9760<br>
Epoch 93/100<br>
572/572 [==============================] - 4s 6ms/step - loss: 0.0260 - accuracy: 0.9940 - val_loss: 0.5112 - val_accuracy: 0.9720<br>
Epoch 94/100<br>
572/572 [==============================] - 5s 8ms/step - loss: 0.0139 - accuracy: 0.9979 - val_loss: 0.4917 - val_accuracy: 0.9750<br>
Epoch 95/100<br>
572/572 [==============================] - 4s 7ms/step - loss: 0.0088 - accuracy: 0.9987 - val_loss: 0.4691 - val_accuracy: 0.9765<br>
Epoch 96/100<br>
572/572 [==============================] - 4s 6ms/step - loss: 0.0061 - accuracy: 0.9989 - val_loss: 0.4875 - val_accuracy: 0.9760<br>
Epoch 97/100<br>
572/572 [==============================] - 4s 6ms/step - loss: 0.0048 - accuracy: 0.9989 - val_loss: 0.4883 - val_accuracy: 0.9765<br>
Epoch 98/100<br>
572/572 [==============================] - 5s 9ms/step - loss: 0.0709 - accuracy: 0.9821 - val_loss: 0.5418 - val_accuracy: 0.9620<br>
Epoch 99/100<br>
572/572 [==============================] - 4s 7ms/step - loss: 0.0225 - accuracy: 0.9967 - val_loss: 0.4609 - val_accuracy: 0.9730<br>
Epoch 100/100<br>
572/572 [==============================] - 4s 6ms/step - loss: 0.0144 - accuracy: 0.9975 - val_loss: 0.4873 - val_accuracy: 0.9725<br>
</div>

In [62]:
with open('unique_labels.pkl', 'wb') as f:
    pickle.dump(unique_labels, f)
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

## Testing the NER Model

In [57]:
# import tensorflow as tf
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import load_model
model = load_model('saved_model.h5')

# Preprocess the input sentence and entity
def preprocess_input(sentence, entity):
    input_text = sentence + ' [SEP] ' + entity
    input_sequences = tokenizer.texts_to_sequences([input_text])
    input_sequences = pad_sequences(input_sequences, padding='post', maxlen=X_train.shape[1])
    return input_sequences

# Function to predict the output category
def predict_category(sentence, entity):
    input_sequences = preprocess_input(sentence, entity)
    predictions = model.predict(input_sequences)
    predicted_class_index = tf.argmax(predictions, axis=1).numpy()[0]
    predicted_class = unique_labels[predicted_class_index]
    return predicted_class

sentence = "I live in Lahore"
entity = "Location"
predicted_category = predict_category(sentence, entity)
print(predicted_category)





Lahore


In [2]:
def preprocess_input(input_text):
    # Tokenize the input text
    tokens = word_tokenize(input_text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Get Part-of-Speech tags for the remaining words
    tagged_tokens = pos_tag(filtered_tokens)

    return tagged_tokens