#### Part 1 Data Preprocessing and Transformation

In [None]:
# Importing Libraries
import ast
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Import the JSON tag-mapping file.
# The encoding is given explicitly: without it, open() uses the platform default
# (e.g. cp1252 on Windows), which can fail or mis-decode non-ASCII text in the file.
with open('mappings.json_(DS_A-L2).json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per tag: the first item of each mapped value is used as the offering,
# the second as the destination.
rows = [
    {'tag': tag, 'offering': values[0], 'destination': values[1]}
    for tag, values in data['tags_mapping'].items()
]

df_tags = pd.DataFrame(rows)
df_tags

In [None]:
# Distinct offering values found in the tag mapping
print(pd.unique(df_tags['offering']))

In [None]:
# Distinct destination values found in the tag mapping
print(pd.unique(df_tags['destination']))

In [None]:
# Load the customer feedback dataset
feedback_path = 'dataset.csv_(DS_A-L2).csv'
feedback_df = pd.read_csv(feedback_path)
feedback_df

In [None]:
# Count missing values per column
feedback_df.isna().sum()

In [None]:
# Inspect the rows whose rating is missing (their content is not clear, so they get dropped next)
feedback_df.loc[feedback_df['ratings'].isna()]

In [None]:
# Keep only the rows that have a rating
feedback_df = feedback_df[feedback_df['ratings'].notna()]

In [None]:
# Count rows that repeat an earlier row on every column except 'tags'
feedback_df.duplicated(subset=feedback_df.columns.difference(['tags'])).sum()

In [None]:
# Preview every row that belongs to a duplicate group. 'tags' is excluded from the
# comparison so this preview uses the same definition of "duplicate" as the count and
# the drop in the neighbouring cells; comparing all columns could hide rows that are
# about to be dropped.
non_tag_columns = feedback_df.columns.difference(['tags'])
feedback_df[feedback_df.duplicated(subset=non_tag_columns, keep=False)].sort_values(by='content')

In [None]:
# Drop duplicated rows: keep the first row of each group that matches on all columns except 'tags'
dedup_columns = feedback_df.columns.difference(['tags'])
feedback_df = feedback_df[~feedback_df.duplicated(subset=dedup_columns)]

In [None]:
#Row with only emojis in content

# Code points treated as emoji when deciding whether a review contains any real text.
# The ranges are listed individually: a single wide range such as U+24C2..U+1F251 would
# also cover CJK, Hangul and Arabic presentation forms, so genuine text in those scripts
# would be classified as "emoji only".
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0000200D"             # zero-width joiner used in emoji sequences
    "]+", flags=re.UNICODE)


def is_only_emojis(text):
    """Return True when ``text`` has nothing left once emoji and surrounding whitespace are removed.

    ``text`` is converted with ``str()`` first, so a missing value (NaN) becomes the
    string "nan" and is reported as not emoji-only. An empty or whitespace-only
    string returns True.
    """
    cleaned = _EMOJI_PATTERN.sub('', str(text)).strip()
    return cleaned == ''

# Show the reviews whose content consists of emojis only
feedback_df.loc[feedback_df['content'].map(is_only_emojis)]

In [None]:
# Remove the reviews whose content consists of emojis only
emoji_only_mask = feedback_df['content'].apply(is_only_emojis)
feedback_df = feedback_df[~emoji_only_mask]

In [None]:
# Investigating data types: column dtypes and non-null counts
feedback_df.info()

In [None]:
def _parse_literal(value):
    """Parse a string holding a Python literal with ``ast.literal_eval``; return any other value unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# feedback_df was produced by the filters above; take an explicit copy so the column
# assignments below write to an independent frame instead of a slice of an earlier
# one (avoids SettingWithCopyWarning).
feedback_df = feedback_df.copy()

# Convert date column to date format
feedback_df['date'] = pd.to_datetime(feedback_df['date'])

# Convert the string representations stored in 'tags' and 'ratings' into Python objects
feedback_df['tags'] = feedback_df['tags'].apply(_parse_literal)
feedback_df['ratings'] = feedback_df['ratings'].apply(_parse_literal)


In [None]:
feedback_df

In [None]:
# Earliest and latest review dates
first_day = feedback_df['date'].min().date()
last_day = feedback_df['date'].max().date()
date_range = (first_day, last_day)
print(f"Date Range: {date_range[0]} to {date_range[1]}")

In [None]:
# Number of distinct values in the 'title' column
feedback_df['title'].nunique()

In [None]:
# Expand the 'tags' lists so that every list entry gets its own row
feedback_df_exploded = feedback_df.explode(column='tags')
feedback_df_exploded

In [None]:
# Turn the keys of each 'tags' entry into columns, rename 'value' to 'tag', then
# left-join the tag mapping (df_tags) to bring in 'offering' and 'destination'.
tags_df = feedback_df_exploded['tags'].apply(pd.Series)
feedback_df_exploded = (
    pd.concat([feedback_df_exploded.drop(columns=['tags']), tags_df], axis=1)
    .rename(columns={'value': 'tag'})
    .merge(df_tags, on='tag', how='left')
)

feedback_df_exploded

In [None]:
# Turn the keys of each 'ratings' entry into columns and give them descriptive names
ratings_df = feedback_df_exploded['ratings'].apply(pd.Series)
feedback_df_exploded = pd.concat(
    [feedback_df_exploded.drop(columns=['ratings']), ratings_df], axis=1
).rename(columns={'normalized': 'normalized_rating', 'raw': 'actual_rating'})

# Keep the columns of interest, in presentation order
column_order = ['id', 'content', 'date', 'language', 'title', 'destination', 'offering','tag', 'normalized_rating', 'actual_rating','sentiment']
feedback_df_exploded = feedback_df_exploded[column_order]

feedback_df_exploded

In [None]:
# One-hot encode 'offering', then collapse the one-row-per-tag frame back to one row per
# (review, sentiment) combination. 'sentiment' is a grouping key, so a review whose tags
# carry different sentiments still yields one row per sentiment value.
final_df_encoded = pd.get_dummies(feedback_df_exploded, columns=['offering'], dtype=int)
final_df_encoded['sentiment'] = final_df_encoded['sentiment'].fillna('missing')

# Aggregate whichever offering_* indicator columns get_dummies actually produced, rather
# than a hard-coded list that raises KeyError when a category is absent from the data.
offering_cols = [col for col in final_df_encoded.columns if str(col).startswith('offering_')]
aggregations = {col: 'sum' for col in offering_cols}
aggregations['destination'] = list

# dropna=False: by default groupby discards every row with a missing value in any key
# column, which would silently remove reviews that lack e.g. a title.
final_df = final_df_encoded.groupby(
    ['id', 'content', 'date', 'language', 'title', 'normalized_rating', 'actual_rating', 'sentiment'],
    as_index=False,
    dropna=False,
).agg(aggregations)

final_df



In [None]:
# Collapse the summed offering_* counts to a 0/1 indicator (1 when the count is positive)
cols = ['offering_Accommodation', 'offering_Food & Beverage', 'offering_Retail', 'offering_Tourism Attractions/ Sites']
final_df[cols] = final_df[cols].gt(0).astype('int64')

In [None]:
# Keeping more than one destination per content is not useful, so we will keep the most common destination for each content
def get_most_common_destination(destinations):
    """Return the most frequent non-missing destination in ``destinations``.

    Missing entries (None / NaN, e.g. from tags with no mapping) are ignored so they
    can never win over a real destination. Ties are resolved in favour of the
    destination that appears first. Returns NaN when there is no usable destination
    (empty input or only missing values).
    """
    from collections import Counter

    counts = Counter(dest for dest in destinations if not pd.isna(dest))
    if not counts:
        return float('nan')
    # Counter keeps first-seen order and max() returns the first maximal key,
    # so ties go to the earliest destination.
    return max(counts, key=counts.get)

# Reduce each review's list of destinations to a single value
final_df['destination'] = final_df['destination'].map(get_most_common_destination)

final_df

In [None]:
#Adding a sentiment to each row - METHOD 1  (Lexicon Based Approach)

# For English Reviews
from textblob import TextBlob
def get_sentiment(text):
    """Label ``text`` from its TextBlob polarity: below -0.1 is 'negative', above 0.1 is 'positive', otherwise 'neutral'."""
    score = TextBlob(str(text)).sentiment.polarity
    if score < -0.1:
        return 'negative'
    if score > 0.1:
        return 'positive'
    return 'neutral'
    
# For Arabic Reviews
from camel_tools.sentiment import SentimentAnalyzer
analyzer = SentimentAnalyzer.pretrained()

def get_arabic_sentiment(text):
    """Return the sentiment label CAMeL Tools predicts for a single review.

    ``predict_sentence`` is the single-string entry point; ``predict`` expects a list
    of sentences, so it must not be handed one string. ``text`` is passed through
    ``str()`` so non-string values do not reach the analyzer.
    """
    return analyzer.predict_sentence(str(text))


# Overwrite 'sentiment' with a text-based label, using the analyzer that matches each review's language
is_english = final_df['language'] == 'eng'
final_df.loc[is_english, 'sentiment'] = final_df.loc[is_english, 'content'].apply(get_sentiment)

is_arabic = final_df['language'] == 'ara'
final_df.loc[is_arabic, 'sentiment'] = final_df.loc[is_arabic, 'content'].apply(get_arabic_sentiment)

final_df

In [None]:
#Adding a sentiment to each row - METHOD 2  (Task-Specific Large Language Model : XLM-RoBERTa pre-trained LLM model + Neural Network Classification model)

from transformers import pipeline

# Load a multilingual sentiment analysis pipeline
# XLM-RoBERTa converts each text into an embedding/vector, then a neural network layer classifies it as Positive/Negative/Neutral
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")

# Apply sentiment analysis to the 'content' column.
# truncation=True cuts each input to the model's maximum sequence length; without it a
# long review raises an error inside the model and aborts the whole apply.
final_df['sentiment_LLM'] = final_df['content'].apply(
    lambda x: sentiment_pipeline(str(x), truncation=True)[0]['label']
)

# See the results
print(final_df[['content', 'sentiment_LLM']].head())




In [None]:
import sys
import torch

# Environment diagnostics: interpreter path, torch version / install location, CUDA availability
diagnostics = [
    ("python:", sys.executable),
    ("torch:", getattr(torch, "__version__", None)),
    ("torch file:", getattr(torch, "__file__", None)),
    ("cuda available:", torch.cuda.is_available()),
]
for label, value in diagnostics:
    print(label, value)

#### Part 2: Text Cleaning & NLP Analysis

In [None]:
# Some content has both the original Arabic text and its English translation.
# regex=False: the marker is matched as a literal string (as a regular expression the
# parentheses would form a capture group instead of matching the brackets).
# na=False: missing content counts as "no match", so the mask stays purely boolean.
final_df[final_df['content'].str.contains('(Translated by Google)', regex=False, na=False)]

In [None]:
#Some content has both arabic (original) and english (translation) text, so we keep the english version only and make sure 'language' is set to 'eng'
def clean_content_and_language(row):
    """Return ``pd.Series([content, language])`` for one review row.

    When the content is a string containing the '(Translated by Google)' marker, only
    the text after the last marker and before '(Original)' is kept, surrounding
    whitespace is stripped, and the language is set to 'eng'. Any other row
    (including non-string content such as NaN) is returned unchanged.
    """
    content = row['content']
    language = row['language']
    # isinstance guard: the `in` test raises TypeError on non-string content (e.g. NaN)
    if isinstance(content, str) and '(Translated by Google)' in content:
        translated = content.split('(Translated by Google)')[-1]
        translated = translated.split('(Original)')[0]
        # Drop the spaces / newlines that separated the translation from the markers
        content = translated.strip()
        language = 'eng'
    return pd.Series([content, language])

# Replace 'content' and 'language' with the cleaned pair computed for every row
cleaned_pairs = final_df.apply(clean_content_and_language, axis=1)
final_df[['content', 'language']] = cleaned_pairs

final_df

In [None]:
#Check if language is correctly identified 
import re

def detect_language_rule_based(text):
    """Classify ``text`` by the scripts it contains.

    Returns 'mixed' when both a Latin letter (a-z, A-Z) and a character in
    U+0600..U+06FF are present, 'eng' for Latin only, 'ara' for the Arabic range only,
    and 'unknown' when neither is found. ``text`` is converted with ``str()`` first.
    """
    sample = str(text)
    latin_found = bool(re.search(r'[a-zA-Z]', sample))
    arabic_found = bool(re.search(r'[\u0600-\u06FF]', sample))
    labels = {
        (True, True): 'mixed',
        (True, False): 'eng',
        (False, True): 'ara',
        (False, False): 'unknown',
    }
    return labels[(latin_found, arabic_found)]

# Rule-based language guess, then the rows where it disagrees with the recorded language
final_df['language_2'] = final_df['content'].map(detect_language_rule_based)

language_disagrees = final_df['language'].ne(final_df['language_2'])
mismatched_lang = final_df[language_disagrees]
mismatched_lang[['id','content','language', 'language_2']].sort_values(by='language_2')

In [None]:
# A few rows have a recorded language that disagrees with the rule-based detection;
# count those mismatched rows per detected language
mismatched_lang.groupby('language_2')['id'].count()

In [None]:
# Trust the rule-based guess only when it is unambiguous ('eng' or 'ara'), then discard the helper column
trusted_guess = final_df['language_2'].isin(['eng', 'ara'])
final_df.loc[trusted_guess, 'language'] = final_df.loc[trusted_guess, 'language_2']
final_df = final_df.drop(columns='language_2')

In [None]:
# For English reviews
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
from textblob import TextBlob


# Fetch the NLTK resources used below, then build the English stopword set and lemmatizer
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_english_text(text):
    """Normalize an English review for keyword / topic analysis.

    Steps: replace every non-letter with a space, lowercase, split on whitespace,
    drop English stopwords, lemmatize each remaining word, and join the words with
    single spaces. ``text`` is converted with ``str()`` first.
    """
    # Replace (rather than delete) punctuation and digits so that words separated only
    # by them ("great,but", "clean.Staff") stay separate tokens instead of fusing.
    text = re.sub(r'[^a-zA-Z\s]', ' ', str(text))

    # Lowercase before the stopword lookup
    text = text.lower()

    # Tokenize and remove stopwords
    words = [word for word in text.split() if word not in stop_words]

    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)



def _clean_if_english(row):
    """Return the cleaned content for rows labelled 'eng', and None for every other row."""
    if row['language'] == 'eng':
        return clean_english_text(row['content'])
    return None

final_df['clean_content'] = final_df.apply(_clean_if_english, axis=1)

In [None]:
# For Arabic reviews
import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# NOTE(review): this cell previously imported ArabicLightStemmer from camel_tools.stem and
# stopwords_list from camel_tools.stopwords and called stemmer_ar.lemmatize(). CAMeL Tools
# does not appear to provide those modules, so the NLTK equivalents are used instead.
# ISRIStemmer is a root-based stemmer - TODO confirm it is acceptable for this analysis.
nltk.download('stopwords', quiet=True)
arabic_stopwords = set(stopwords.words('arabic'))
stemmer_ar = ISRIStemmer()

def clean_arabic_text(text):
    """Normalize an Arabic review for keyword / topic analysis.

    Steps: strip diacritics, replace every character outside U+0600..U+06FF (other
    than whitespace) with a space, tokenize, drop Arabic stopwords, stem each
    remaining word, and join the words with single spaces.
    """
    # Remove diacritics
    text = dediac_ar(str(text))
    # Replace non-Arabic characters with a space so neighbouring words do not fuse
    text = re.sub(r'[^\u0600-\u06FF\s]', ' ', text)
    # Tokenize
    words = simple_word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in arabic_stopwords]
    # Stemming
    words = [stemmer_ar.stem(word) for word in words]
    return ' '.join(words)

# Fill in 'clean_content' for the Arabic rows only. Assigning the whole column here
# would overwrite the cleaned text of every non-Arabic review with None.
is_arabic = final_df['language'] == 'ara'
final_df.loc[is_arabic, 'clean_content'] = final_df.loc[is_arabic, 'content'].apply(clean_arabic_text)

In [None]:
# Compare the raw and the cleaned text for the rows labelled 'eng'
final_df.loc[final_df['language'] == 'eng', ['content', 'clean_content']]

In [None]:
#Text Analysis (Common Keywords) - Frequency Based Approach (Term Frequency-Inverse Document Frequency: TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the cleaned text (rows without cleaned text contribute an empty document)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(final_df['clean_content'].fillna(''))
feature_names = vectorizer.get_feature_names_out()

# One column per term; the 20 terms with the largest summed weight are the common keywords
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
common_keywords = df_tfidf.sum().nlargest(20)
print(common_keywords)



In [None]:
# Draw the common keywords as a word cloud, sized by their summed TF-IDF weight
from wordcloud import WordCloud

wordcloud = WordCloud(width=500, height=200, background_color='white')
wordcloud = wordcloud.generate_from_frequencies(common_keywords)

fig, ax = plt.subplots(figsize=(7, 3))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
#Text Analysis (Themes)  - METHOD 1 Topic Modeling with LDA (Classic Probabilistic Approach)

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Prepare the text data (replace None with empty string)
texts = final_df['clean_content'].fillna('')

# Vectorize the text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

# Fit LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Show the n top words for each topic
n = 10
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # Reverse the ascending argsort and take the first n indices. The slice [:-n:-1]
    # stops one element early and yields only n-1 words.
    top_indices = topic.argsort()[::-1][:n]
    print(f"Topic #{topic_idx+1}: " + ", ".join([feature_names[i] for i in top_indices]))

In [None]:
# Theme labels chosen after reviewing each LDA topic's keywords; keys are 1-based topic numbers
topic_to_theme = dict(enumerate(
    [
        "Restaurant & Service Experience",
        "Family Outings & Parks",
        "Hotels & Cleanliness",
        "Parks & Value for Money",
        "Religious Sites & Worship",
    ],
    start=1,
))

In [None]:
# Mapping each review to the topic with the highest probability

doc_topic_dist = lda.transform(X)
# 1-based number of the most probable topic
most_probable_topic = np.argmax(doc_topic_dist, axis=1) + 1

# Rows whose cleaned text is empty have an all-zero count vector. Their topic
# distribution carries no information, so argmax would pick a topic arbitrarily and
# label the review with an unrelated theme. Leave those rows unassigned instead.
has_no_terms = np.asarray(X.sum(axis=1)).ravel() == 0
topic_series = pd.Series(most_probable_topic, index=final_df.index)

final_df['lda_topic'] = topic_series.astype('Int64').mask(has_no_terms)

# Mapping the most probable topic to themes
final_df['theme_LDA'] = topic_series.map(topic_to_theme).mask(has_no_terms)

final_df

In [None]:
#Text Analysis (Themes)  - METHOD 2 Using Ensemble Model *BERTopic* (LLM + Clustering + Keyword Extraction)
#BERTopic: embed the text with a pre-trained BERT model, cluster the embeddings (HDBSCAN), then extract keywords per cluster (TF-IDF)

from bertopic import BERTopic

# NOTE(review): no random seed is configured here, so topic ids may change between
# runs - confirm before relying on a fixed topic-id -> theme mapping.

# Review texts as plain strings
texts = final_df['content'].astype(str).tolist()

# Fit a multilingual BERTopic model and record the topic id assigned to each review
topic_model = BERTopic(language="multilingual")
fit_result = topic_model.fit_transform(texts)
topics, probs = fit_result
final_df['theme_topic'] = topics

# Overview of all topics
topic_info = topic_model.get_topic_info()
print(topic_info)

# Keywords of topic 0
topic_zero_keywords = topic_model.get_topic(0)
print(topic_zero_keywords)

In [None]:
# Theme labels chosen after reviewing each BERTopic topic's keywords; keys are 0-based
# topic ids. Topic ids without an entry here map to NaN in 'theme'. Add more as needed.
topic_to_theme = dict(enumerate([
    "Food & Restaurants",
    "Religious Sites",
    "Shopping",
    "Accommodation",
    "Transport",
]))
final_df['theme'] = final_df['theme_topic'].map(topic_to_theme)

In [None]:
#ADDING TOPIC REPRESENTATIONS LAYER
#Use ChatGPT from OpenAI to generate the topic representations instead of manually coming up with themes

import os

import openai
from bertopic import BERTopic
# BERTopic's OpenAI representation wrapper. Without this import the OpenAI(...) call
# below raises NameError (openai.OpenAI is the API client, a different class).
from bertopic.representation import OpenAI

# Read the API key from the environment instead of hard-coding a secret in the notebook;
# a missing OPENAI_API_KEY fails here with a KeyError.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
representation_model = OpenAI(client, model='gpt-4o-mini', chat=True)
topic_model = BERTopic(representation_model=representation_model, language="multilingual")  # add the representation model to BERTopic

texts = final_df['content'].astype(str).tolist()
topics, probs = topic_model.fit_transform(texts)
final_df['theme_topic_chatgpt'] = topics

topic_info = topic_model.get_topic_info()
print(topic_info[['Topic', 'Name']])  # 'Name' column contains the theme

# Create a mapping from topic number to theme name
topic_to_theme = dict(zip(topic_info['Topic'], topic_info['Name']))

# Assign theme to each review (this overwrites any earlier 'theme' column)
final_df['theme'] = final_df['theme_topic_chatgpt'].map(topic_to_theme)



In [None]:
# Render BERTopic's topic visualization for the currently bound topic_model
topic_model.visualize_topics()

#### Part 3: EDA

In [None]:
#Distribution of sentiments, offerings, destinations, and ratings.


In [None]:
# Sentiment vs Offering


In [None]:
# Sentiment vs Destination


In [None]:
# Sentiment vs Rating

#### Part 4: Future Scope

The data is now cleaned and labeled with themes, so it is ready to be used to train an NLP model.