In [None]:
#Phase1_SentimentalCode.py
import pandas as pd
import nltk
import re
import emoji
from transformers import pipeline, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from tqdm import tqdm

# Step 1: Load Data
# Read the raw review export into a DataFrame.
csv_file_path = 'NYC_2021_airbnb_reviews_data1.csv'
print("Loading data...")
data = pd.read_csv(csv_file_path)

# Step 2: Date Parsing
# Dates are parsed with the "<month name> <year>" pattern; values that do not
# match become NaT (errors='coerce') and their rows are discarded.
print("Parsing dates...")
data = data.assign(
    review_posted_date=pd.to_datetime(
        data['review_posted_date'], format='%B %Y', errors='coerce'
    )
).dropna(subset=['review_posted_date'])

# Step 3: Handle Missing Values
# A missing 'review' column is fatal; missing review values only drop rows.
print("Handling missing values...")
if 'review' not in data.columns:
    raise KeyError("The dataset does not contain a 'review' column.")

num_missing_reviews = data['review'].isna().sum()
if num_missing_reviews > 0:
    print(f"Number of missing reviews: {num_missing_reviews}")
    data.dropna(subset=['review'], inplace=True)
# Re-number rows 0..n-1 after the drops above.
data.reset_index(drop=True, inplace=True)

# Step 4: Detect Language
print("Detecting language...")
from langdetect import DetectorFactory, detect

# langdetect's detector is randomized, so short or ambiguous reviews can get
# different language codes on different runs. Pinning the seed before the
# first detection keeps the English-only filter applied later repeatable.
DetectorFactory.seed = 0

# Registers `progress_apply` on pandas objects so detection shows a progress bar.
tqdm.pandas(desc="Detecting language")

def detect_language(text):
    """Return the language code detected for ``text``, or ``'unknown'``.

    Text whose stripped length is 3 characters or fewer is not passed to the
    detector and is reported as ``'unknown'``. Any exception (non-string
    input, detector failure) also yields ``'unknown'``.
    """
    try:
        stripped = text.strip()
        if len(stripped) <= 3:
            return 'unknown'
        return detect(stripped)
    except Exception:
        return 'unknown'

# Tag every review with a language code, then keep only the rows tagged 'en'
# and re-number them from zero.
data['language'] = data['review'].progress_apply(detect_language)
is_english = data['language'].eq('en')
data = data.loc[is_english].reset_index(drop=True)

# Step 5: Text Preprocessing
# Announces the cleaning stage and imports the third-party `contractions`
# package, which is referenced by `expand_contractions` defined right below.
print("Expanding contractions and cleaning text...")
import contractions

def expand_contractions(text):
    """Expand contractions in ``text`` via ``contractions.fix``.

    Best effort: if the call raises for any reason (for example a
    non-string value), the input is returned unchanged.
    """
    try:
        expanded = contractions.fix(text)
    except Exception:
        return text
    return expanded

def clean_text(text):
    """Strip URLs, mentions, hashtags and special characters from ``text``.

    Steps, in order: remove ``http...`` URLs, convert emojis to their
    textual descriptions, remove ``@mention`` / ``#hashtag`` tokens, drop
    every character that is not a letter, digit, whitespace, underscore or
    colon (underscore and colon are kept so emoji descriptions survive),
    and collapse runs of whitespace.

    Best effort: if any step raises, the text as it stood after the last
    successful step is returned (the original value when the first step
    fails, e.g. for non-string input).
    """
    cleaned = text
    try:
        cleaned = re.sub(r'http\S+', '', cleaned)
        cleaned = emoji.demojize(cleaned)
        cleaned = re.sub(r'@\w+|#\w+', '', cleaned)
        cleaned = re.sub(r'[^a-zA-Z0-9\s_:]', '', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    except Exception:
        # Keep whatever stage was reached before the failure.
        return cleaned
    return cleaned

# Expand contractions on the raw review first, then clean the expanded text.
data['review_expanded'] = data['review'].apply(expand_contractions)
data['review_cleaned_text'] = data['review_expanded'].apply(clean_text)


# Preprocess further for tokenization
print("Further preprocessing for tokenization...")
# NLTK >= 3.9 makes `word_tokenize` load the `punkt_tab` resource; the legacy
# `punkt` package alone leads to a LookupError there. Both are requested so
# the script works across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Start from NLTK's English stop-word list and take out words that carry
# sentiment signal so they are NOT removed from reviews. Entries below that
# are absent from NLTK's list are simply ignored by the set difference.
stop_words = set(stopwords.words('english')) - {
    'not', 'no', 'nor', 'never',   # Negations
    'very', 'extremely', 'really', # Intensifiers
    'i', 'we', 'my', 'you', 'your', 'our', 'us',   # Pronouns (lowercased)
}

def preprocess_text(text):
    """Lowercase, tokenize, drop stop words and lemmatize ``text``.

    Stop-word filtering happens on the raw lowercase token, before
    lemmatization. The surviving lemmas are joined with single spaces.
    """
    tokens = nltk.word_tokenize(text.lower())
    content_tokens = (token for token in tokens if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in content_tokens)

data['cleaned_review'] = data['review_cleaned_text'].apply(preprocess_text)

# Step 6: Initial Sentiment Analysis with Hugging Face Dataset
print("Performing initial sentiment analysis efficiently...")
import torch  # local import: only needed here to probe for a CUDA device

# Use the first GPU when one is visible, otherwise fall back to CPU (-1).
# A hard-coded device=0 makes pipeline construction fail on CPU-only hosts.
sentiment_device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=sentiment_device,
)

# Create a Hugging Face Dataset from the data (only the text column is needed)
review_dataset = Dataset.from_pandas(data[['cleaned_review']])

# Apply the sentiment pipeline directly to the dataset
def analyze_sentiment(batch):
    """Attach sentiment predictions to a batch of reviews.

    Each entry of ``batch['cleaned_review']`` is cut to its first 512
    characters (characters, not tokens) before being handed to
    ``sentiment_pipeline``. The pipeline output is stored under
    ``batch['sentiment']`` and the same batch object is returned.
    """
    max_chars = 512
    clipped_reviews = []
    for review_text in batch['cleaned_review']:
        clipped_reviews.append(review_text[:max_chars])
    batch['sentiment'] = sentiment_pipeline(clipped_reviews)
    return batch

# Run the sentiment function over the dataset in batches
review_dataset = review_dataset.map(analyze_sentiment, batched=True)

# Pull the predicted label string out of every pipeline result and encode it
# as an integer class (POSITIVE -> 1, NEGATIVE -> 0).
sentiment_results = review_dataset['sentiment']
data['initial_sentiment'] = [entry['label'] for entry in sentiment_results]
label_to_id = {'POSITIVE': 1, 'NEGATIVE': 0}
data['sentiment_label'] = data['initial_sentiment'].map(label_to_id)

# Step 7: Train-Test Split with Stratification
# 80/20 split with a fixed random state; stratifying on the label keeps the
# class ratio the same in both parts.
print("Splitting data into training and validation sets...")
review_texts = data['cleaned_review']
review_labels = data['sentiment_label']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
    stratify=review_labels,
)

# Step 8: Prepare Dataset for Hugging Face
print("Preparing datasets...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize the ``'cleaned_review'`` entries of ``examples``.

    Sequences are truncated and padded to a fixed length of 512 using the
    module-level ``tokenizer``; its output is returned as-is.
    """
    encode_options = {
        'truncation': True,
        'padding': "max_length",
        'max_length': 512,
    }
    return tokenizer(examples['cleaned_review'], **encode_options)

# Wrap each split in a DataFrame with the column names the tokenizer function
# and the Trainer expect, then convert to Hugging Face datasets and tokenize.
train_frame = pd.DataFrame({'cleaned_review': train_texts, 'label': train_labels})
val_frame = pd.DataFrame({'cleaned_review': val_texts, 'label': val_labels})

train_data = Dataset.from_pandas(train_frame).map(tokenize_function, batched=True)
val_data = Dataset.from_pandas(val_frame).map(tokenize_function, batched=True)

# Expose only the model inputs and the label, as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'label']
train_data.set_format(type='torch', columns=torch_columns)
val_data.set_format(type='torch', columns=torch_columns)

# Step 9: Define Model and Trainer
print("Initializing BERT model...")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the argmax over the last axis of the logits; precision, recall and F1
    use binary averaging.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    prf_scores = precision_recall_fscore_support(labels, predicted, average='binary')
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }

# Training configuration: 2 epochs, linear LR schedule starting at 2e-5,
# evaluation and checkpointing once per epoch. Checkpoints go to ./results
# and logs to ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    eval_strategy='epoch',  # evaluate once per epoch (same cadence as save_strategy below)
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    # Reload the best checkpoint, ranked by the 'f1' value from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

# NOTE(review): the labels used for training and evaluation were produced by
# the DistilBERT sentiment pipeline earlier in this script, so the metrics
# measure agreement with those pseudo-labels rather than human annotations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics
)

# Step 10: Train and Evaluate
print("Training the model...")
trainer.train()
print("Evaluating the model...")
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Save Processed Data
# The CSV holds the preprocessed reviews with the pipeline-derived
# 'sentiment_label'; predictions of the fine-tuned BERT model are not written.
print("Saving processed data...")
data.to_csv("processed_reviews_SentimentLabels.csv", index=False)
print("Process completed successfully!")


In [None]:
#Sentiment_plots.ipynb

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk.util import ngrams
from wordcloud import WordCloud

# Ensure plots are displayed inline in Jupyter.
# NOTE: this is an IPython magic, not Python syntax; it only works when the
# cell is executed inside a notebook/IPython session.
%matplotlib inline

# Set seaborn style for aesthetics (white grid, larger "talk" fonts, muted palette)
sns.set(style='whitegrid', context='talk', palette='muted')

# Step 1: Load Processed Data
# This is the CSV written by the sentiment-labelling script.
file_path = 'processed_reviews_SentimentLabels.csv'  # Replace with the correct file path
print("Loading processed data...")
data = pd.read_csv(file_path)
# Bare expression: shows the first rows only when it is the last statement of a notebook cell.
data.head()


# Tally reviews per sentiment class, ordered by the numeric label (0, then 1),
# and swap the numeric labels for readable names.
label_names = {0: 'Negative', 1: 'Positive'}
sentiment_counts = data['sentiment_label'].value_counts().sort_index()
sentiment_counts.index = sentiment_counts.index.map(label_names)

# Bar chart: one bar per sentiment class
plt.figure(figsize=(8, 6))
sns.barplot(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    palette=['#FF6F61', '#6B5B95'],
)

# Title and axis labels
plt.title('Sentiment Distribution of Airbnb Reviews', fontsize=18, fontweight='bold')
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Number of Reviews', fontsize=14)

# Write each count slightly above its bar (fixed offset of 50 in data units)
for i, count in enumerate(sentiment_counts.to_numpy()):
    plt.text(i, count + 50, f'{count}', ha='center', fontsize=12)

plt.show()


# Parse the date column (it is read back from CSV as text) and derive a
# yearly period to group on.
data['review_posted_date'] = pd.to_datetime(data['review_posted_date'])
data['year'] = data['review_posted_date'].dt.to_period('Y')

# Reviews per (year, sentiment) laid out as a year x sentiment table
yearly_counts = data.groupby(['year', 'sentiment_label']).size()
sentiment_over_year = yearly_counts.unstack(fill_value=0)
sentiment_over_year.columns = sentiment_over_year.columns.map({0: 'Negative', 1: 'Positive'})

# One shared axis for both sentiment lines
plt.figure(figsize=(14, 7))
ax = plt.gca()

# Per-sentiment line colour and marker
colors = {'Positive': '#6B5B95', 'Negative': '#FF6F61'}
marker_styles = {'Positive': 'o', 'Negative': 'X'}

for column, yearly_series in sentiment_over_year.items():
    yearly_series.plot(
        kind='line',
        marker=marker_styles[column],
        color=colors[column],
        ax=ax,
        linewidth=2,
        markersize=8,
        label=column,
    )

# Titles, axis labels and tick styling
heading_color = '#333333'
tick_color = '#666666'
plt.title('Yearly Sentiment Trend for Airbnb Reviews', fontsize=20, fontweight='bold', color=heading_color)
plt.xlabel('Year', fontsize=16, color=heading_color)
plt.ylabel('Number of Reviews', fontsize=16, color=heading_color)
plt.xticks(rotation=0, fontsize=12, color=tick_color)
plt.yticks(fontsize=12, color=tick_color)

# Light dashed grid
plt.grid(True, linestyle='--', linewidth=0.5, color='grey', alpha=0.7)

# Legend
plt.legend(title="Sentiment Type", title_fontsize='13', fontsize='12', loc='upper left')

# Fit everything into the figure and render
plt.tight_layout()
plt.show()

In [None]:
# ABSA_DataProcessing.py

# Import standard libraries
import pandas as pd
import numpy as np
import nltk
import gensim
import warnings
import random
import os  # To handle directory operations
import json  # For loading manual cluster labels if using external file

# Define a fixed seed for reproducibility; it is reused below wherever a
# seed or random_state is accepted.
SEED = 42

# Seed Python's `random` module and NumPy's global RNG. gensim is not seeded
# here; it receives SEED through the `seed` argument of Word2Vec further down.
# NOTE(review): fully repeatable Word2Vec runs may also need PYTHONHASHSEED
# to be fixed for the interpreter; confirm against the gensim documentation.
random.seed(SEED)
np.random.seed(SEED)

# Suppress warnings for clean output. This silences ALL warnings, including
# deprecation notices from pandas/sklearn that may point at real problems.
warnings.filterwarnings('ignore')

# Import libraries for word embeddings and clustering
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

# Ensure NLTK resources are downloaded.
# NLTK >= 3.9 looks up `punkt_tab` (tokenizer) and
# `averaged_perceptron_tagger_eng` (POS tagger) instead of the older pickled
# `punkt` / `averaged_perceptron_tagger` packages. Both generations are
# requested so `word_tokenize` and `pos_tag` work across NLTK versions.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

# Import NLTK modules
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Step 1: Loading Data
print("\nStep 1: Loading Data")
print("Loading the preprocessed data from 'processed_reviews_SentimentLabels.csv'...")
data = pd.read_csv('processed_reviews_SentimentLabels.csv')

# Fail fast if any column needed by the later steps is absent
required_columns = ['listing_id', 'review_posted_date', 'cleaned_review', 'sentiment_label']
missing_cols = [col for col in required_columns if col not in data.columns]
if missing_cols:
    raise ValueError(f"Missing columns: {missing_cols}")
print("All required columns are present.")

# Display sample data
print("\nSample data:")
print(data.head())

# Step 2: Tokenization and Lemmatization
print("\nStep 2: Tokenization and Lemmatization")
print("Initializing lemmatizer and tokenizing reviews...")

lemmatizer = WordNetLemmatizer()


def tokenize_and_lemmatize(text):
    """Return the lemmatized alphabetic tokens of ``text``.

    The text is lowercased and tokenized; tokens that are not purely
    alphabetic are discarded and the rest are lemmatized. On any error
    (for example a non-string value) the error is printed and an empty
    list is returned.
    """
    try:
        raw_tokens = word_tokenize(text.lower())
        return [lemmatizer.lemmatize(word) for word in filter(str.isalpha, raw_tokens)]
    except Exception as e:
        print(f"Error in tokenization and lemmatization: {e}")
        return []


# Tokenize every review; rows that fail tokenization get an empty token list
# (see tokenize_and_lemmatize).
data['tokens'] = data['cleaned_review'].apply(tokenize_and_lemmatize)

# Display sample tokens
print("\nSample tokens after lemmatization:")
print(data[['listing_id', 'review_posted_date', 'cleaned_review', 'tokens']].head())

# Step 3: Training the Word2Vec Model
print("\nStep 3: Training the Word2Vec Model")
print("Training Word2Vec model on tokenized reviews...")
# One token list per review; each list is treated as one training sentence.
sentences = data['tokens'].tolist()

# 100-dimensional embeddings, context window of 5, words seen fewer than
# 5 times are ignored, 10 passes over the corpus.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=1,  # Single worker thread so training order is not affected by thread scheduling
    epochs=10,
    seed=SEED  # Same seed as the rest of the script
)

print("Word2Vec model training completed.")
print(f"Vocabulary size: {len(w2v_model.wv)} words")

# Step 4: Extracting Nouns from Vocabulary
print("\nStep 4: Extracting Nouns from Vocabulary")
print("Performing POS tagging to extract nouns from the vocabulary...")
# Alphabetical order makes the vocabulary listing independent of training order.
vocab = sorted(w2v_model.wv.index_to_key)
# The whole vocabulary is tagged as a single sequence; every tag beginning
# with 'NN' counts as a noun.
pos_tags = nltk.pos_tag(vocab)
nouns = [word for word, tag in pos_tags if tag[:2] == 'NN']

print(f"Total words in vocabulary: {len(vocab)}")
print(f"Total nouns extracted: {len(nouns)}")
print("\nSample nouns:")
print(nouns[:20])

# Step 5: Obtaining Embeddings for Nouns
print("\nStep 5: Obtaining Embeddings for Nouns")
print("Retrieving embeddings for the extracted nouns...")
# Keep only nouns the model has a vector for, preserving their current order.
nouns_filtered = [noun for noun in nouns if noun in w2v_model.wv]
noun_embeddings = np.array([w2v_model.wv[noun] for noun in nouns_filtered])

# Row i of `noun_embeddings` must belong to `nouns[i]`, because the clustering
# step pairs them positionally. The noun list is therefore NOT re-sorted on
# its own: an independent sort would silently break that pairing whenever the
# incoming order was not already sorted.
nouns = list(nouns_filtered)

print(f"Total nouns with embeddings: {len(noun_embeddings)}")

# Step 6: Clustering Noun Embeddings
print("\nStep 6: Clustering Noun Embeddings")
print("Clustering noun embeddings using KMeans to identify aspects...")
num_clusters = 15
# Fixed random_state and explicit n_init keep the clustering repeatable.
kmeans = KMeans(n_clusters=num_clusters, random_state=SEED, n_init=10)
cluster_ids = kmeans.fit_predict(noun_embeddings)
# Cluster ids are paired with nouns by position.
nouns_clusters = pd.DataFrame({'noun': nouns, 'cluster': cluster_ids})

print("\nNumber of nouns in each cluster:")
print(nouns_clusters['cluster'].value_counts().sort_index())

# Step 7: Assigning Aspect Names to Clusters
print("\nStep 7: Assigning Aspect Names to Clusters")
print("Printing nouns in each cluster for manual aspect mapping...")

# Output directory for one text file per cluster
clusters_dir = 'clusters_output'
os.makedirs(clusters_dir, exist_ok=True)

# Show every cluster's nouns on screen and write them to disk, one per line
for cluster_num in range(num_clusters):
    in_cluster = nouns_clusters['cluster'] == cluster_num
    cluster_nouns = nouns_clusters.loc[in_cluster, 'noun'].tolist()
    print(f"\n--- Cluster {cluster_num} ---")
    print(cluster_nouns)

    cluster_path = os.path.join(clusters_dir, f'cluster_{cluster_num}.txt')
    with open(cluster_path, 'w') as f:
        f.writelines(f"{noun}\n" for noun in cluster_nouns)

print("\nAll clusters have been printed and saved to the 'clusters_output' directory.")
print("Please review the clusters and manually assign aspect names accordingly.")

# Define the manual cluster labels based on your review.
# Keys are KMeans cluster ids; several clusters intentionally share one
# aspect name. These labels are only meaningful for the clustering they were
# written against, so re-check them whenever the data, seed or cluster count
# changes.
manual_cluster_labels = {
    0: 'Host Interaction and Personal Experience',
    1: 'Nearby Amenities and Food Options',
    2: 'Accommodation Facilities and Aesthetics',
    3: 'Communication and Check-in Process',
    4: 'People and Personal Interactions',
    5: 'Amenities and Supplies',
    6: 'Nearby Amenities and Food Options',
    7: 'Accommodation Facilities and Aesthetics',
    8: 'Issues and Complaints',
    9: 'Nearby Amenities and Food Options',
    10: 'Location and Transportation',
    11: 'Value and Pricing',
    12: 'Location and Transportation',
    13: 'Transportation',
    14: 'Accommodation Comfort and Issues'
}

# Assign aspects based on manual mapping; clusters without a label become
# 'Other'. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# does not update the DataFrame under pandas Copy-on-Write.
nouns_clusters['aspect'] = (
    nouns_clusters['cluster'].map(manual_cluster_labels).fillna('Other')
)

print("\nSample nouns with their assigned aspects:")
print(nouns_clusters.head(20))

# Step 8: Associating Aspects with Reviews
print("\nStep 8: Associating Aspects with Reviews")
print("Creating a mapping of nouns to aspects and assigning aspects to each review...")

# Lookup table: noun -> aspect name
noun_to_aspect = dict(zip(nouns_clusters['noun'], nouns_clusters['aspect']))


def get_aspects_from_tokens(tokens, noun_aspect_map=None):
    """Return the distinct aspects mentioned by ``tokens``.

    Args:
        tokens: Iterable of token strings from one review.
        noun_aspect_map: Optional noun -> aspect mapping. Defaults to the
            module-level ``noun_to_aspect`` table.

    Returns:
        A list of aspect names without duplicates, in order of first
        appearance in ``tokens``. The order is deterministic, unlike a list
        built from a ``set``, whose order can differ between interpreter
        runs because of string hash randomization.
    """
    mapping = noun_to_aspect if noun_aspect_map is None else noun_aspect_map
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(mapping[token] for token in tokens if token in mapping))


# Attach the list of aspects found in each review
data['aspects'] = data['tokens'].apply(get_aspects_from_tokens)

print("\nSample reviews with their associated aspects:")
print(data[['listing_id', 'review_posted_date', 'cleaned_review', 'tokens', 'aspects']].head())

# Step 9: Creating a DataFrame for Aspect-Level Sentiments
print("\nStep 9: Creating a DataFrame for Aspect-Level Sentiments")
print("Exploding aspects to create a row for each aspect mentioned in a review...")

# One row per (review, aspect); reviews with no aspect produce a missing
# value on explode and are dropped.
data_exploded = data.explode('aspects').dropna(subset=['aspects'])

# Columns written to the output file
output_columns = ['listing_id', 'review_posted_date', 'cleaned_review', 'aspects', 'sentiment_label']
aspect_sentiments = data_exploded[output_columns]

print("\nSample aspect-level sentiment data:")
print(aspect_sentiments.head())

# Step 10: Saving the Aspect-Level Sentiment Data
print("\nStep 10: Saving the Aspect-Level Sentiment Data")
print("Saving the aspect-level sentiment data to 'AspectbasedSentimentAnalysis.csv'...")
aspect_sentiments.to_csv('AspectbasedSentimentAnalysis.csv', index=False)
print("Data saved successfully.")


In [None]:
#ABSA_plots.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the aspect-level sentiment table
file_path = 'AspectbasedSentimentAnalysis.csv'
data = pd.read_csv(file_path)

# Peek at the first rows (shown only when this is the last statement of a notebook cell)
data.head()


# Count rows per (aspect, sentiment) and pivot sentiment into columns
aspect_sentiment_counts = data.groupby(['aspects', 'sentiment_label']).size()
heatmap_data = aspect_sentiment_counts.unstack(fill_value=0)

# Draw the annotated heatmap
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='Blues', cbar=True)
heatmap_ax.set_title('Heatmap of Sentiments Across Aspects')
heatmap_ax.set_xlabel('Sentiment Label (0: Negative, 1: Positive)')
heatmap_ax.set_ylabel('Aspects')
plt.tight_layout()
plt.show()


# Listing shown in the radar chart; change this one value to plot another listing.
LISTING_ID = 1217318

# Filter the data for the chosen listing
listing_data = data[data['listing_id'] == LISTING_ID]
if listing_data.empty:
    # Without rows there is nothing to normalise or plot; fail with a clear message.
    raise ValueError(f"No aspect-level rows found for listing ID {LISTING_ID}.")

# Aggregate the sentiment counts for each aspect. Reindexing guarantees that
# both sentiment columns (0 and 1) exist as Series even when the listing has
# only positive or only negative rows, so the .max() / .iloc calls below
# always work.
radar_data = (
    listing_data.groupby(['aspects', 'sentiment_label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)

# Preparing data for radar chart
aspects = radar_data.index
positive_counts = radar_data[1]
negative_counts = radar_data[0]

# Scale both series by the single largest count so values fall in [0, 1]
max_count = max(positive_counts.max(), negative_counts.max())
positive_counts_normalized = positive_counts / max_count
negative_counts_normalized = negative_counts / max_count

# Radar chart setup
categories = list(aspects)
num_aspects = len(categories)
categories += categories[:1]  # Close the loop for radar chart

positive_values = list(positive_counts_normalized) + [positive_counts_normalized.iloc[0]]
negative_values = list(negative_counts_normalized) + [negative_counts_normalized.iloc[0]]

# Angle calculation: one spoke per aspect spread evenly over the full circle,
# then the first angle repeated so the closing point lands back on the first
# spoke. Dividing by the already-closed length would add a spurious extra
# spoke and leave the outline open.
angles = [n / float(num_aspects) * 2 * np.pi for n in range(num_aspects)]
angles += angles[:1]

# Plotting radar chart
plt.figure(figsize=(8, 8))
ax = plt.subplot(111, polar=True)

# First aspect at the top, remaining aspects clockwise; one labelled tick per aspect
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
plt.xticks(angles[:-1], aspects, color='grey', size=10)

# Plot data
ax.plot(angles, positive_values, linewidth=1, linestyle='solid', label='Positive Sentiment')
ax.fill(angles, positive_values, 'b', alpha=0.1)

ax.plot(angles, negative_values, linewidth=1, linestyle='solid', label='Negative Sentiment')
ax.fill(angles, negative_values, 'r', alpha=0.1)

# Add legend and title
plt.legend(loc='upper right', bbox_to_anchor=(1.4, 1.3))
plt.title(f'Sentiment Distribution for Listing ID {LISTING_ID} Across Aspects', size=15, color='darkblue')
plt.tight_layout()
plt.show()