## Webscraping Project

### This notebook is intended to be run after creating a "merged_result.json" file that combines the web-scraped data.

In [None]:
# Mount Google Drive to Colab runtime
# Every input/output path used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Generation of Key Features for Each Restaurant Using OpenAI

In [None]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.58.1-py3-none-any.whl.metadata (27 kB)
Downloading openai-1.58.1-py3-none-any.whl (454 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.3/454.3 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.57.4
    Uninstalling openai-1.57.4:
      Successfully uninstalled openai-1.57.4
Successfully installed openai-1.58.1


In [None]:
!pip show openai

Name: openai
Version: 1.58.1
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: 


In [None]:
import os
from getpass import getpass

# Setting environment variables
# SECURITY: an API key must never be stored in the notebook source. Any key that
# was previously hardcoded in this cell has to be treated as leaked and revoked.
# Reuse a key that is already present in the environment; otherwise prompt for it.
# getpass does not echo the input, so the key never ends up in the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
import os
import pandas as pd
from openai import OpenAI
import pandas as pd
import json

# Set OpenAI API Key
# The key is read from the OPENAI_API_KEY environment variable (os.getenv returns None when it is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Paths for input and output files
# input_file holds the merged scraping results; output_file receives the same records with a "Keywords" field added.
input_file = "/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_result.json"
output_file = "/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_result_keywords.json"

# Function to generate keywords using OpenAI's API, leveraging its existing knowledge of the restaurant
def generate_keywords_with_gpt(reviews, restaurant_name, model="gpt-3.5-turbo", max_tokens=150, api_client=None):
    """Ask a chat model for keywords describing a restaurant.

    Args:
        reviews: Review text for the restaurant, inserted verbatim into the prompt.
        restaurant_name: Name of the restaurant, inserted verbatim into the prompt.
        model: Chat model identifier passed to the API.
        max_tokens: Upper bound on the length of the generated reply.
        api_client: Optional object exposing ``chat.completions.create``. When
            None, the module-level ``client`` is used.

    Returns:
        The text of the first choice with surrounding whitespace removed, or an
        empty string when the reply carries no text content.
    """
    # Resolve the client lazily so the global is only needed when no client is injected.
    api = client if api_client is None else api_client

    prompt = (
        "Extract a list of concise, descriptive keywords or key phrases that represent the most "
        "important aspects of the following reviews and the restaurant's context based on your knowledge. Include cuisine type, unique dishes, ambiance, "
        "special features, and anything mentioned positively about the restaurant:\n"
        f"\nRestaurant Name: {restaurant_name}\n\nReviews:\n{reviews}\n\nKeywords:"
    )
    response = api.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert at summarizing text into concise keywords."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )

    # The message content may be None; fall back to "" instead of crashing on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

# Check if the output file already exists
# The keyword file acts as a cache: the (paid) API calls only run when it is missing.
if not os.path.exists(output_file):
    print(f"Output file not found. Generating keywords and creating {output_file}.")

    # Load the merged_result.json file
    # Explicit UTF-8 so non-ASCII review text is read the same way on every platform.
    with open(input_file, "r", encoding="utf-8") as file:
        restaurants = json.load(file)

    # Add keywords to each restaurant
    for restaurant in restaurants:
        # "Reviews" may be missing, null, or a single string; normalise to a list
        # so join() neither raises nor splits a string into characters.
        raw_reviews = restaurant.get("Reviews") or []
        if isinstance(raw_reviews, str):
            raw_reviews = [raw_reviews]
        reviews = "\n".join(str(review) for review in raw_reviews)
        restaurant_name = restaurant.get("Name", "")
        keywords = generate_keywords_with_gpt(reviews, restaurant_name)
        restaurant["Keywords"] = keywords

    # Save the updated data to a new JSON file
    # ensure_ascii=False writes raw non-ASCII characters, so the file must be opened as UTF-8.
    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(restaurants, file, indent=4, ensure_ascii=False)

    print(f"Keywords added and saved to {output_file}")
else:
    print(f"Output file {output_file} already exists. Skipping keyword generation.")


Output file /content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_result_keywords.json already exists. Skipping keyword generation.


#### Merging text data by restaurant

In [None]:
import os
import pandas as pd
import json

# Paths for input and output files
input_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_result_keywords.json'
output_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data.json'

# Check if the output file already exists
# The merged file acts as a cache: processing only runs when it is missing.
if not os.path.exists(output_file):
    print(f"Output file not found. Processing and creating {output_file}.")

    # Load JSON file
    # Explicit UTF-8 so non-ASCII text is decoded identically on every platform.
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Create a DataFrame
    df = pd.DataFrame(data)

    # Combine 'Keywords', 'Category', and 'Reviews' into 'Merged_data'
    # astype(str) turns missing values into the literal text "nan"; a 'Reviews'
    # entry that is not a list contributes an empty string.
    df['Merged_data'] = (
        df['Keywords'].astype(str) + " " +
        df['Category'].astype(str) + " " +
        df['Reviews'].apply(lambda reviews: " ".join(reviews) if isinstance(reviews, list) else "")
    )

    # Save updated DataFrame to a new file
    df.to_json(output_file, orient='records', indent=4, force_ascii=False)

    print(f"Merged data saved to {output_file}")
else:
    print(f"Output file {output_file} already exists. Skipping processing.")

Output file /content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data.json already exists. Skipping processing.


### In 'output/merged_data.json', one review written in Chinese was manually translated into English, and the result was saved as 'merged_data_en.json'.

### Text Data Preprocessing
#### Lowercase conversion, removal of special characters and emojis, whitespace normalization, and stopword removal.

In [None]:
import os
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK resources (stopwords and tokenizer data)
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from 'punkt_tab' rather than 'punkt';
# without it word_tokenize raises LookupError on a fresh runtime.
nltk.download('punkt_tab')

# Load the English stopwords
stop_words = set(stopwords.words('english'))

# Additional words to remove
# "nan" is included because missing values are stringified into the merged text upstream.
additional_words = {"restaurant", "name", "nan"}
all_stopwords = stop_words.union(additional_words)

# Function to clean text
def preprocess_text(text):
    """Normalise one document for embedding and keyword search.

    The text is lowercased, stripped of every character that is not an ASCII
    letter, a digit or whitespace, whitespace-normalised, tokenised with
    ``word_tokenize`` and filtered against ``all_stopwords``.

    Args:
        text: The raw document. Anything that is not a ``str`` yields "".

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # Keep only ASCII letters, digits and whitespace
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)

    # Emoji ranges (optional extra pass)
    cleaned = re.sub(r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+", "", cleaned)

    # Collapse runs of whitespace into single spaces
    collapsed = re.sub(r"\s+", " ", cleaned).strip()

    # Tokenise, then drop stopwords
    kept_tokens = []
    for token in word_tokenize(collapsed):
        if token.lower() in all_stopwords:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

# Paths for input and output files
input_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data_en.json'
output_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data_en_cleaned.json'

# Check if the output file already exists
# The cleaned file acts as a cache: preprocessing only runs when it is missing.
if not os.path.exists(output_file):
    print(f"Output file not found. Processing and creating {output_file}.")

    # Load JSON file
    # Pass the path directly (as the other cells do) so pandas opens the file
    # itself instead of receiving a handle decoded with the platform default.
    data = pd.read_json(input_file)

    # Apply the preprocessing function to the 'Merged_data' column
    data['Merged_data'] = data['Merged_data'].apply(preprocess_text)

    # Save the cleaned DataFrame to a new JSON file
    data.to_json(output_file, orient='records', indent=4, force_ascii=False)

    print(f"Cleaned data saved to {output_file}")
else:
    print(f"Output file {output_file} already exists. Skipping processing.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Output file /content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data_en_cleaned.json already exists. Skipping processing.


### BERT Embedding


In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

# Paths for input and output files
input_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data_en_cleaned.json'
output_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/bert_embeddings.npy'

# Check if the output file already exists
# The .npy file acts as a cache: the model is only loaded and run when it is missing.
if not os.path.exists(output_file):
    print(f"Output file not found. Processing and creating {output_file}.")

    # Load preprocessed data
    data = pd.read_json(input_file)

    # Load BERT model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate BERT embeddings
    # Encode the whole column in one call so the library batches the sentences
    # instead of running one forward pass per document. Rows keep the order of
    # data['Merged_data']; values may differ from per-sentence encoding only by
    # floating-point rounding.
    embeddings = model.encode(data['Merged_data'].tolist(), convert_to_numpy=True)

    # Save embeddings to .npy file
    np.save(output_file, embeddings)

    print(f"BERT embeddings saved to {output_file}")
else:
    print(f"Output file {output_file} already exists. Skipping processing.")



Output file /content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/bert_embeddings.npy already exists. Skipping processing.


### Word2Vec Embedding

In [None]:
import os

import numpy as np
from gensim.models import Word2Vec
import pandas as pd
import nltk
nltk.download('punkt_tab')

# Paths for input and output files
input_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data_en_cleaned.json'
output_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/word2vec_embeddings.npy'

# Seed for Word2Vec's random initialisation and sampling.
# NOTE: with workers > 1 thread scheduling still introduces run-to-run variation;
# fully deterministic training would additionally need workers=1 (and a fixed PYTHONHASHSEED).
W2V_SEED = 42


def get_avg_embedding(tokens, model):
    """Return the mean word vector of a tokenised document.

    Args:
        tokens: Iterable of token strings.
        model: Trained Word2Vec model; only tokens present in ``model.wv`` are used.

    Returns:
        A 1-D array of length ``model.vector_size``. Documents without any
        in-vocabulary token get a zero vector.
    """
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    # float32 keeps the stacked matrix from being silently upcast to float64
    # (gensim word vectors are expected to be float32 - confirm for the installed version).
    return np.zeros(model.vector_size, dtype=np.float32)


# Check if the output file already exists
# The .npy file acts as a cache: training only runs when it is missing.
if not os.path.exists(output_file):
    print(f"Output file not found. Processing and creating {output_file}.")

    # Load preprocessed data
    data = pd.read_json(input_file)

    # Tokenize Merged_data
    nltk.download('punkt')
    data['Tokenized'] = data['Merged_data'].apply(nltk.word_tokenize)

    # Train Word2Vec model
    # A plain list of token lists is passed, since gensim iterates over the corpus more than once.
    w2v_model = Word2Vec(
        sentences=data['Tokenized'].tolist(),
        vector_size=100,
        window=5,
        min_count=2,
        workers=4,
        seed=W2V_SEED,
    )

    # Generate document embeddings (average of word vectors)
    embeddings = np.array([get_avg_embedding(tokens, w2v_model) for tokens in data['Tokenized']])

    # Save embeddings to .npy file
    np.save(output_file, embeddings)

    print(f"Word2Vec embeddings saved to {output_file}")
else:
    print(f"Output file {output_file} already exists. Skipping processing.")


Output file /content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/word2vec_embeddings.npy already exists. Skipping processing.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### BM25+BERT Model and Test

In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import re

# Set file path
merged_data_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/merged_data_en_cleaned.json'
word2vec_embeddings_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/word2vec_embeddings.npy'
bert_embeddings_file = '/content/drive/MyDrive/0_2024DIA/Webscraping/p1/output/bert_embeddings.npy'

# Loading data and embeddings
data = pd.read_json(merged_data_file)
word2vec_embeddings = np.load(word2vec_embeddings_file)
bert_embeddings = np.load(bert_embeddings_file)

# User search queries and conditions
user_query = "cozy vegan desserts"
user_rating_threshold = 4.5  # Minimum rating
user_min_reviews = 100  # Minimum number of reviews
user_districts = [1, 3, 4, 10, 11]  # Paris district selected by the user
alpha = 0.5  # BM25 weight
beta = 0.3   # BERT weight
gamma = 0.2  # Rating weight

# 1. 'Rating' and 'Reviews_Count' values Preprocessing
# Unparseable values become NaN and are then replaced by a default (3.0 for the
# rating, 0 for the review count). The result is assigned back to the column:
# calling fillna(..., inplace=True) on data[col] is chained assignment, which
# pandas warns about and which stops updating `data` in pandas 3.0.
data['Rating'] = pd.to_numeric(data['Rating'], errors='coerce').fillna(3.0)
data['Reviews_Count'] = pd.to_numeric(data['Reviews_Count'], errors='coerce').fillna(0)

# 2. Extract Paris district number from address
def extract_district(address):
    """Return the Paris arrondissement (1-20) encoded in an address's postcode.

    Recognises the postcodes 75001-75020 and 75116 (the alternative postcode of
    the 16th arrondissement). The postcode must stand alone as a number, so
    digit runs that merely contain "750xx" (e.g. phone numbers) are ignored.

    Args:
        address: Address text. Non-string values (None, NaN, ...) are accepted.

    Returns:
        The district number as an int, or None when no Paris postcode is found.
    """
    if not isinstance(address, str):
        return None

    match = re.search(r'\b75(?:0(0[1-9]|1\d|20)|116)\b', address)
    if match is None:
        return None

    # Group 1 is only populated for 750xx; the remaining alternative is 75116.
    district_digits = match.group(1)
    return int(district_digits) if district_digits else 16

data['District'] = data['Address'].apply(extract_district)

# 3. Calculating BM25 scores
tokenized_corpus = [doc.split() for doc in data['Merged_data']]
bm25 = BM25Okapi(tokenized_corpus)
# The corpus text was lowercased during preprocessing and BM25 matches tokens
# exactly, so the query is lowercased as well; otherwise capitalised query
# words would never match.
tokenized_query = user_query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)

# 4. Calculating BERT similarity
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
query_embedding = bert_model.encode(user_query).reshape(1, -1)
bert_similarities = cosine_similarity(query_embedding, bert_embeddings).flatten()

# 5. Scoring and Sorting
# NOTE(review): the three terms are on different scales (BM25 is unbounded,
# cosine similarity lies in [-1, 1], ratings are on the site's star scale), so
# alpha/beta/gamma are not directly comparable weights - consider normalising.
data['BM25_Score'] = bm25_scores
data['BERT_Similarity'] = bert_similarities
data['Final_Score'] = (alpha * data['BM25_Score'] +
                       beta * data['BERT_Similarity'] +
                       gamma * data['Rating'])

# 6. Filter by rating, number of reviews, and Paris district
filtered_data = data[
    (data['Rating'] >= user_rating_threshold) &
    (data['Reviews_Count'] >= user_min_reviews) &
    (data['District'].isin(user_districts))
]

# 7. Sorting by "Final_Score"
top_recommendations = filtered_data.sort_values(by='Final_Score', ascending=False).head(10)

# 8. Output
print("Top Recommendations:")
print(top_recommendations[['Name', 'Address', 'District', 'Rating', 'Reviews_Count', 'Final_Score']])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Rating'].fillna(3.0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Reviews_Count'].fillna(0, inplace=True)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Top Recommendations:
                             Name  \
28           the friendly kitchen   
6          Jah Jah By Le Tricycle   
90  Fellows - Restaurant Paris 10   
96                     Galerie 88   
81              East Side Burgers   
79                       Le Stand   
0              Aujourd'hui Demain   
2                        B.Better   
24                     Chez Funda   
23                Les Tontons Veg   

                                              Address  District  Rating  \
28              8 Rue Popincourt, 75011 Paris, France      11.0     4.8   
6      11 R. des Petites Écuries, 75010 Paris, France      10.0     4.7   
90  84 Rue du Faubourg Saint-Denis, 75010 Paris, F...      10.0     4.8   
96   88 Quai de l'Hôtel de ville, 75004 Paris, France       4.0     4.5   
81                60 Bd Voltaire, 75011 Paris, France      11.0     4.5   
79            39 Rue de Bretagne, 75003 Paris, France       3.0     5.0   
0          42 Rue du Chemin Vert, 75011 Paris,

## Streamlit Application

In [None]:
!pip install streamlit
!wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
!tar -xvzf ngrok-v3-stable-linux-amd64.tgz

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m116.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m118.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m


In [None]:
!pip install streamlit folium streamlit-folium

Collecting streamlit-folium
  Downloading streamlit_folium-0.24.0-py3-none-any.whl.metadata (413 bytes)
Downloading streamlit_folium-0.24.0-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: streamlit-folium
Successfully installed streamlit-folium-0.24.0


In [None]:
import os
import subprocess
from getpass import getpass

# SECURITY: an authtoken must never be stored in the notebook source. Any token
# that was previously hardcoded in this cell has to be treated as leaked and revoked.
# Reuse NGROK_AUTHTOKEN from the environment; otherwise prompt without echoing.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")

# Pass the token as a separate argument (no shell) so it is never interpolated
# into a shell command line or shown in the cell source.
ngrok_config_result = subprocess.run(
    ["./ngrok", "config", "add-authtoken", ngrok_authtoken],
    check=True,
    capture_output=True,
    text=True,
)
print(ngrok_config_result.stdout.strip())

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Free port 8501 if a previous Streamlit run is still listening on it.
# `lsof -t` prints only PIDs, `-sTCP:LISTEN` restricts the match to the listening
# process (not clients connected to the port), and `xargs -r` skips `kill`
# entirely when nothing is found.
!lsof -t -i:8501 -sTCP:LISTEN | xargs -r kill -9

/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `kill -9 <PID>  # Kill the Occupying Process by PID'


In [None]:
import os
import time
import requests

# Run ngrok in the background
def run_ngrok():
    """Launch an ngrok HTTP tunnel for port 8501 as a background shell job.

    The command's standard output is redirected to ``ngrok.log`` in the
    current working directory. The call returns immediately.
    """
    background_command = "./ngrok http 8501 > ngrok.log &"  # Save ngrok logs to a file
    os.system(background_command)

# Retrieve the ngrok URL
def get_ngrok_url(retries=5, delay=2):
    """Poll ngrok's local API until a tunnel is available.

    Args:
        retries: Maximum number of polling attempts.
        delay: Seconds to wait before each attempt, giving ngrok time to start.

    Returns:
        The public URL of the first tunnel, or None when no tunnel could be
        retrieved within the allowed attempts (the last error is printed).
    """
    last_error = None
    for _ in range(retries):
        time.sleep(delay)  # Wait for ngrok to start
        try:
            # A timeout keeps the cell from hanging if the local API never answers.
            response = requests.get("http://localhost:4040/api/tunnels", timeout=5)  # Use ngrok's API
            response.raise_for_status()
            tunnels = response.json().get('tunnels', [])
            if tunnels:
                return tunnels[0]['public_url']
            # ngrok is up but has not registered the tunnel yet; try again.
            last_error = "no tunnel reported yet"
        except (requests.RequestException, ValueError, KeyError) as e:
            last_error = e

    print("Error while fetching the ngrok URL:", last_error)
    return None

# Run ngrok
run_ngrok()
ngrok_url = get_ngrok_url()

# Report whether a public URL could be obtained
status_message = (
    f"ngrok started successfully: {ngrok_url}"
    if ngrok_url
    else "There was an issue starting ngrok."
)
print(status_message)


ngrok started successfully: https://2441-34-124-252-202.ngrok-free.app


In [None]:
!streamlit run "/content/drive/MyDrive/0_2024DIA/Webscraping/p1/app.py" --server.port 8501 &                ####################### Jingyoung


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.124.252.202:8501[0m
[0m
2024-12-23 17:33:19.782027: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-23 17:33:19.806410: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-23 17:33:19.813830: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-23 17:33:23.230 E