## Data

In [None]:
%pip install pandas numpy

# Import necessary libraries
import pandas as pd
import numpy as np
import re

# Read the review export and, in a single pipeline, keep only the rows that
# have a review text, whose text is not blank after stripping whitespace, and
# that carry a rating. The index is then renumbered from 0.
df = (
    pd.read_csv('final_task7_dataset.csv')
    .dropna(subset=['review_text'])
    .loc[lambda frame: frame['review_text'].str.strip().ne('')]
    .dropna(subset=['rating_x'])
    .reset_index(drop=True)
)

print(f"Dataset shape after cleaning: {df.shape}")
df.head()


In [None]:
%pip install nltk
%pip install spacy
!python -m spacy download en_core_web_sm

## Text Preprocessing

In [None]:
# NLP libraries
import spacy
import nltk
from nltk.corpus import stopwords


# English stop-word lookup built from the NLTK corpus (downloaded first so the
# corpus reader can find it).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# spaCy pipeline used by the preprocessing step for tokenization and lemmas.
nlp = spacy.load('en_core_web_sm')


#preprocessing function
def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    The text is lowercased, URLs are removed, every character that is not an
    ASCII letter or whitespace is replaced by a space, and runs of whitespace
    are collapsed. The result is passed through the module-level spaCy
    pipeline ``nlp``; the lemma of a token is kept unless the token text is in
    the module-level ``stop_words`` set or spaCy flags the token as a stop
    word or punctuation.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned review as one string; empty when no token survives.
    """
    text = text.lower()
    # `http\S+` already matches https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", ' ', text)
    # Substitute a space rather than the empty string: deleting the character
    # would fuse words separated only by punctuation or digits
    # ("good,but" -> "goodbut") into tokens that no stop list or vocabulary knows.
    text = re.sub(r'[^A-Za-z\s]', ' ', text)
    # Collapse the whitespace runs left behind by the substitutions above.
    text = re.sub(r'\s+', ' ', text).strip()

    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if token.text not in stop_words
        and not token.is_stop
        and not token.is_punct
    ]
    return ' '.join(tokens)

# Clean every review with preprocess_text and store the result next to the raw text.
df['processed_text'] = df['review_text'].map(preprocess_text)

print(f"Shape after processing: {df.shape}")
df.loc[:, ['review_text', 'processed_text']].head()


## TF-IDF vectorization


In [None]:
%pip install scikit-learn pandas

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One document per place: all of its processed reviews joined with spaces.
grouped = (
    df.groupby('place_name')['processed_text']
    .apply(' '.join)
    .reset_index()
)

# Unigram + bigram TF-IDF features, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
tfidf_matrix = tfidf.fit_transform(grouped['processed_text'])

print(f"TF-IDF feature matrix shape: {tfidf_matrix.shape}")

# Pairwise cosine similarity between the place documents.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(f"Cosine similarity matrix shape: {cosine_sim.shape}")

# Lookup from place name to its row position in `grouped` / `cosine_sim`.
indices = pd.Series(grouped.index, index=grouped['place_name']).drop_duplicates()

# Recommender function
def recommend_places(place_name, cosine_sim=cosine_sim, df=grouped, top_n=5):
    """Return the places whose review text is most similar to ``place_name``.

    Args:
        place_name: Name to look up in the module-level ``indices`` Series.
        cosine_sim: Similarity matrix indexed by row position; row ``idx`` holds
            the scores of the queried place against every place.
        df: Frame with a ``place_name`` column in the same row order as
            ``cosine_sim``.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A Series of up to ``top_n`` place names ordered from most to least
        similar, or a "not found" message string when ``place_name`` is not in
        ``indices``.
    """
    if place_name not in indices:
        return f"Place '{place_name}' not found in the dataset."

    idx = indices[place_name]

    # Drop the queried place by position instead of assuming it sorts first:
    # with tied scores (duplicate review text) or an all-zero TF-IDF row the
    # place itself is not guaranteed to lead the ranking, and slicing off the
    # first entry would then discard a real match and recommend the place to
    # itself.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Slicing clamps to the available length, so no explicit bound is needed.
    top_positions = [position for position, _ in candidates[:top_n]]
    return df['place_name'].iloc[top_positions]

# Smoke test: query the recommender for one bar and show what comes back.
recommended_bars = recommend_places(place_name='The Dead Rabbit')
print("Top recommended similar bars:", recommended_bars, sep="\n")


In [None]:
# Persist the two artifacts of the content-based recommender; the writes are
# independent of each other.

# place name -> row position lookup, pickled
indices.to_pickle("place_indices.pkl")

# cosine similarity matrix, NumPy binary format
np.save("cosine_sim.npy", cosine_sim)

## Word2Vec vectorization

In [None]:
import gensim
from gensim.models import Word2Vec

# processed_text is whitespace-delimited, so a plain split gives one token list per review.
df['tokens'] = df['processed_text'].apply(str.split)

# Train Word2Vec on the review token lists.
w2v_model = Word2Vec(
    sentences=df['tokens'],
    sg=1,             # 1 selects the skip-gram training algorithm
    vector_size=100,  # dimensionality of the word vectors
    window=5,
    min_count=2,      # words seen fewer than 2 times are ignored
    workers=4,
)

# Function to average word vectors for a document
def document_vector(doc):
    """Average the Word2Vec vectors of the in-vocabulary words of a document.

    Args:
        doc: Iterable of token strings.

    Returns:
        A 1-D NumPy array with one entry per embedding dimension of the
        module-level ``w2v_model``. When none of the tokens is in the model's
        vocabulary, a zero vector of that same length is returned.
    """
    known_words = [word for word in doc if word in w2v_model.wv]
    if not known_words:
        # Take the length from the trained model rather than a literal 100 so
        # the fallback keeps matching the real vectors if `vector_size` changes.
        return np.zeros(w2v_model.wv.vector_size)
    return np.mean(w2v_model.wv[known_words], axis=0)

# One averaged embedding per review, stacked row-wise into a single array.
doc_vectors = np.array(list(map(document_vector, df['tokens'])))

print(f"Shape of document vectors: {doc_vectors.shape}")


In [None]:
# doc_vectors holds one row per review; wrap it in a DataFrame whose columns
# are named w2v_0, w2v_1, ... so they can sit next to the review columns.
word2vec_df = pd.DataFrame(doc_vectors)
word2vec_df.columns = [f"w2v_{i}" for i in range(word2vec_df.shape[1])]

# Give the reviews a fresh 0..n-1 index so both frames align row-for-row.
df_reset = df.reset_index(drop=True)

# pd.concat(axis=1) aligns on the index and pads with NaN instead of failing,
# so a row-count mismatch has to be caught explicitly before concatenating.
assert len(df_reset) == len(word2vec_df), "Mismatch in row count!"

df_word2vec = pd.concat([df_reset, word2vec_df], axis=1)

df_word2vec.to_csv("word2vec_reviews.csv", index=False)

print("Word2Vec document vectors saved to 'word2vec_reviews.csv'")
print(f"Final shape: {df_word2vec.shape}")


## LDA

In [None]:
from gensim import corpora
from gensim.models import LdaModel

# Token lists for LDA (whitespace split of the processed text).
df['tokens'] = df['processed_text'].apply(str.split)

# Vocabulary: keep tokens found in at least 5 reviews and in no more than
# half of all reviews, then encode each review as a bag-of-words.
dictionary = corpora.Dictionary(df['tokens'])
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = list(map(dictionary.doc2bow, df['tokens']))

NUM_TOPICS = 10
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=10,
    per_word_topics=True,
    random_state=42,  # fixed seed for repeatable topics
)

# Compact listing: the top words of every topic, without their weights.
NUM_TOP_WORDS = 10
for topic_id in range(lda_model.num_topics):
    top_terms = [term for term, _ in lda_model.show_topic(topic_id, topn=NUM_TOP_WORDS)]
    print(f"Topic #{topic_id}: " + ", ".join(top_terms))


# Detailed listing: gensim's formatted "weight*word" string for each topic,
# followed by a blank line.
for topic_id, formatted_topic in lda_model.show_topics(num_topics=NUM_TOPICS, formatted=True):
    print(f"Topic #{topic_id}:", formatted_topic, "", sep="\n")

# Assigning dominant topic to each review
def get_dominant_topic(lda_model, bow):
    """Return the id of the most probable topic for one bag-of-words document.

    Args:
        lda_model: Object exposing ``get_document_topics(bow)``, which returns
            ``(topic_id, probability)`` pairs.
        bow: Bag-of-words for the document, as a sequence of
            ``(token_id, count)`` pairs.

    Returns:
        The topic id with the highest probability (the first one on ties), or
        ``-1`` when the document is empty or the model reports no topics.
    """
    # An empty document carries no evidence for any topic.
    # NOTE(review): gensim appears to answer with its prior mixture here, which
    # would label such a review with an arbitrary topic id - confirm. The
    # "no topic" sentinel is returned instead.
    if len(bow) == 0:
        return -1

    topics = lda_model.get_document_topics(bow)
    if not topics:
        return -1
    # max() returns the first maximal pair, matching a stable descending sort.
    return max(topics, key=lambda pair: pair[1])[0]

# Dominant topic id for every review, in corpus order.
df['lda_dominant_topic'] = [get_dominant_topic(lda_model, bow) for bow in corpus]

# Short keyword label per topic: its top words joined by ", ".
NUM_TOP_WORDS = 5
topic_keywords_map = {}
for topic_id in range(lda_model.num_topics):
    top_terms = lda_model.show_topic(topic_id, topn=NUM_TOP_WORDS)
    topic_keywords_map[topic_id] = ", ".join([term for term, _ in top_terms])

# Attach the label of each review's dominant topic; ids without an entry in
# the map become NaN.
df['topic_keywords'] = df['lda_dominant_topic'].map(topic_keywords_map)

df.to_csv("lda_labeled_reviews.csv", index=False)
print("LDA topics assigned and saved to 'lda_labeled_reviews.csv'")

## Plot of Dominant Topics

In [None]:
%pip install matplotlib seaborn

import matplotlib.pyplot as plt
import seaborn as sns

# Number of reviews per dominant topic, ordered by topic id.
topic_counts = df['lda_dominant_topic'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=topic_counts.index, y=topic_counts.values, palette='muted')
ax.set_xlabel('LDA Topic #')
ax.set_ylabel('Number of Reviews')
ax.set_title('Distribution of Dominant LDA Topics')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Recommendation System

In [None]:
%pip install scikit-surprise

# (user, item, rating) triples for collaborative filtering; rows missing any
# of the three values are discarded.
df_filtered = df.loc[:, ['reviewer_id', 'place_id', 'rating_x']].dropna(how='any')

Content-Based TF-IDF Recommender

In [None]:
# One document per place: its processed reviews joined with spaces.
grouped_reviews = (
    df.groupby('place_name')['processed_text']
    .apply(' '.join)
    .reset_index()
)

# TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
tfidf_matrix = tfidf.fit_transform(grouped_reviews['processed_text'])

# Pairwise cosine similarity between places, plus a lookup from place name to
# row position in that matrix.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
place_indices = pd.Series(
    grouped_reviews.index,
    index=grouped_reviews['place_name'],
).drop_duplicates()

# Recommender
def recommend_places(place_name, cosine_sim=cosine_sim, df=grouped_reviews, top_n=5):
    """Return the places whose review text is most similar to ``place_name``.

    Args:
        place_name: Name to look up in the module-level ``place_indices`` Series.
        cosine_sim: Similarity matrix indexed by row position; row ``idx`` holds
            the scores of the queried place against every place.
        df: Frame with a ``place_name`` column in the same row order as
            ``cosine_sim``.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A Series of up to ``top_n`` place names ordered from most to least
        similar, or a "not found" message string when ``place_name`` is not in
        ``place_indices`` (instead of raising ``KeyError``).
    """
    # Guard the lookup so an unknown name yields a readable message rather
    # than a KeyError that aborts the cell.
    if place_name not in place_indices:
        return f"Place '{place_name}' not found in the dataset."

    idx = place_indices[place_name]

    # Exclude the queried place by position: with tied scores or an all-zero
    # TF-IDF row it is not guaranteed to rank first, so dropping the first
    # sorted entry could leave the place recommending itself.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    place_indices_top = [position for position, _ in candidates[:top_n]]
    return df['place_name'].iloc[place_indices_top]

# Example queries against the content-based recommender.
print("Content-Based Recommendations:")
for example_place in ('The Dead Rabbit', 'Old Town Bar'):
    print(recommend_places(example_place))


Collaborative Filtering with Surprise

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Wrap the (user, item, rating) triples in a Surprise dataset.
# NOTE(review): rating_scale assumes ratings run from 1 to 5 - confirm against the data.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_filtered[['reviewer_id', 'place_id', 'rating_x']], reader)

# Hold out 20% of the ratings for evaluation; the split is seeded.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Seed the model as well: SVD draws its initial factors at random, so without
# a seed the metrics below change on every run even though the split is fixed.
model = SVD(random_state=42)
model.fit(trainset)

# Evaluate on the held-out ratings. accuracy.rmse / accuracy.mae print the
# metric themselves by default; verbose=False keeps each value from being
# printed twice.
predictions = model.test(testset)
print("\nCollaborative Filtering Evaluation:")
print("RMSE:", accuracy.rmse(predictions, verbose=False))
print("MAE :", accuracy.mae(predictions, verbose=False))

Top-N Recommender for User

In [None]:
def get_top_n_recommendations(user_id, model, df_all, n=5):
    """Rank the places a user has not reviewed by predicted rating.

    Args:
        user_id: Value matched against ``df_all['reviewer_id']``.
        model: Object exposing ``predict(user_id, place_id)`` whose result has
            ``iid`` and ``est`` attributes.
        df_all: Frame with ``reviewer_id``, ``place_id`` and ``place_name``
            columns.
        n: Maximum number of recommendations to return (default 5).

    Returns:
        A list of up to ``n`` ``(place_name, predicted_rating)`` tuples, best
        first, with ratings rounded to two decimals. The place id is used as
        the name when no name is known for it.
    """
    # Rows without a place id cannot be recommended, so they are not scored.
    all_bars = df_all['place_id'].dropna().unique()
    # A set gives constant-time membership checks; testing against a NumPy
    # array rescans the whole array for every candidate.
    rated_bars = set(df_all.loc[df_all['reviewer_id'] == user_id, 'place_id'])

    bars_to_predict = [bar for bar in all_bars if bar not in rated_bars]
    predictions = [model.predict(user_id, bar) for bar in bars_to_predict]
    top_n_preds = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n]

    place_id_to_name = dict(zip(df_all['place_id'], df_all['place_name']))
    return [
        (place_id_to_name.get(pred.iid, pred.iid), round(pred.est, 2))
        for pred in top_n_preds
    ]

# Example: recommendations for the first reviewer in the filtered ratings.
sample_user = df_filtered['reviewer_id'].iat[0]
print(f"\nTop-N Recommendations for User: {sample_user}")
for place, score in get_top_n_recommendations(user_id=sample_user, model=model, df_all=df):
    print(f"{place} — Predicted rating: {score}")

Test Top-N Recommender

In [None]:
# Hard-coded reviewer and place ids used for a manual check
# (presumably taken from the dataset - verify they are still present).
user_id = '115364016342485480165'
bar_id = 'ChIJXz1QXE5ZwokRLwJIVmQhyEc'

# place_id -> place_name lookup for readable output
place_id_to_name = dict(zip(df['place_id'], df['place_name']))

# Single predicted rating for the (user, bar) pair
pred = model.predict(user_id, bar_id)
bar_name = place_id_to_name.get(bar_id, "Unknown Bar")
predicted_rating = round(pred.est, 2)
print(f"Predicted rating for user '{user_id}' on bar '{bar_name}': {predicted_rating}")

# Top-5 recommendations for the same user
top_recs = get_top_n_recommendations(user_id, model, df, n=5)
print(f"\nTop 5 recommended bars for user '{user_id}':")
for bar_name, score in top_recs:
    print(f"{bar_name} — Predicted rating: {score}")