In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import gensim
import numpy as np
from gensim import corpora
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
import os

# Fetch the NLTK resources used below: tokenizer models, the stopword list and
# the VADER lexicon.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# Registers `progress_apply` on pandas objects.
tqdm.pandas()

# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
input_path = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/st_louis_reviews.csv"
output_dir = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/output"
os.makedirs(output_dir, exist_ok=True)
# Minimum number of reviews get_topics() requires before it fits a topic model.
min_docs_needed = 10

df = pd.read_csv(input_path)
assert 'text' in df.columns, "CSV must contain 'text' column"
# Missing review text becomes an empty string so every row is a str.
df['text'] = df['text'].fillna('').astype(str)

def preprocess(text):
    """Tokenize a review into lowercase, alphabetic, non-stopword tokens.

    Args:
        text: Raw review text.

    Returns:
        A list of tokens. When nothing survives filtering, or when ``text``
        is not a string, ``['placeholder']`` is returned so that every review
        still maps to a non-empty document.
    """
    # Build the stopword set once and cache it on the function object instead
    # of calling stopwords.words('english') for every single token.
    stop_words = getattr(preprocess, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess._stop_words = stop_words

    try:
        tokens = nltk.word_tokenize(text.lower())
    except (AttributeError, TypeError):
        # Non-string input (e.g. NaN). Other failures, such as a missing
        # tokenizer resource, are deliberately not caught: swallowing them
        # would silently turn every review into a placeholder document.
        return ['placeholder']

    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens if tokens else ['placeholder']

# Analyzer whose 'compound' score is thresholded by classify_sentiment().
sia = SentimentIntensityAnalyzer()
print("Performing sentiment analysis with adjusted thresholds...")

# Score every review; progress_apply shows a tqdm progress bar.
df['compound'] = df['text'].progress_apply(lambda x: sia.polarity_scores(x)['compound'])

def classify_sentiment(score):
    """Turn a compound sentiment score into a label.

    Scores of 0.3 or more are 'positive', scores of -0.3 or less are
    'negative', and everything else is 'neutral'.
    """
    label = 'neutral'
    if score >= 0.3:
        label = 'positive'
    elif score <= -0.3:
        label = 'negative'
    return label

# Label each review from its compound score.
df['sentiment'] = df['compound'].apply(classify_sentiment)

# Persist the per-review score and label.
df[['text', 'compound', 'sentiment']].to_csv(os.path.join(output_dir, "review_sentiment_map.csv"), index=False)

# Only positive and negative reviews are passed on to topic modelling.
positive_reviews = df[df['sentiment'] == 'positive']['text']
negative_reviews = df[df['sentiment'] == 'negative']['text']

def get_topics(reviews, label, num_topics=5, num_words=6):
    """Fit an LDA topic model on a set of reviews and summarise its topics.

    Args:
        reviews: pandas Series of review texts.
        label: Sentiment label stored with each topic row (e.g. 'positive').
        num_topics: Requested number of topics; lowered when the filtered
            vocabulary has fewer terms than this.
        num_words: Number of top keywords reported per topic.

    Returns:
        A tuple ``(topic_keywords, lda, corpus, dictionary, texts)`` where
        ``topic_keywords`` is a list of dicts with the keys 'Sentiment',
        'Topic' and 'Keywords'. When no model can be fitted (too few reviews
        or no vocabulary left after filtering) the tuple is
        ``([], None, [], None, [])``.
    """
    if len(reviews) < min_docs_needed:
        print(f"Not enough {label} reviews ({len(reviews)}).")
        return [], None, [], None, []

    texts = reviews.progress_apply(preprocess).tolist()
    # Defensive: drop empty token lists before building the dictionary.
    texts = [t for t in texts if len(t) > 0]
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.9)

    # filter_extremes can remove every term; there is nothing to model then,
    # so report it instead of failing inside the LDA fit.
    if len(dictionary) == 0:
        print(f"No usable vocabulary left for {label} reviews after filtering.")
        return [], None, [], None, []

    corpus = [dictionary.doc2bow(text) for text in texts]

    if len(dictionary) < num_topics:
        num_topics = max(2, len(dictionary) // 5)

    lda = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

    # Read (topic_id, [(word, weight), ...]) pairs directly rather than
    # parsing the formatted '0.012*"word" + ...' strings, and label each topic
    # by its own id so the names match the ids assign_topics() returns.
    topic_keywords = []
    for topic_id, word_weights in lda.show_topics(num_topics=num_topics, num_words=num_words, formatted=False):
        topic_keywords.append({
            'Sentiment': label,
            'Topic': f"Topic {topic_id + 1}",
            'Keywords': ", ".join(word for word, _ in word_weights)
        })

    return topic_keywords, lda, corpus, dictionary, texts

# Collected topic rows for both sentiments, written to one CSV below.
topics_data = []

pos_topics, lda_pos, corpus_pos, dict_pos, texts_pos = get_topics(positive_reviews, 'positive')
topics_data.extend(pos_topics)

neg_topics, lda_neg, corpus_neg, dict_neg, texts_neg = get_topics(negative_reviews, 'negative')
topics_data.extend(neg_topics)

# Skip the export when neither sentiment produced any topics.
if topics_data:
    pd.DataFrame(topics_data).to_csv(os.path.join(output_dir, "st_louis_reviews_topics.csv"), index=False)

def assign_topics(corpus, lda_model):
    """Return, for each document in ``corpus``, the id of its highest-weighted topic.

    ``lda_model.get_document_topics(doc)`` is expected to yield
    ``(topic_id, weight)`` pairs; ties go to the first pair listed.
    """
    dominant_ids = []
    for bow in corpus:
        best_pair = max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])
        dominant_ids.append(best_pair[0])
    return dominant_ids

# One frame per sentiment; 'dominant_topic' is added only when a model was fitted.
df_pos = pd.DataFrame({'text': positive_reviews.values})
df_neg = pd.DataFrame({'text': negative_reviews.values})

if lda_pos and corpus_pos:
    df_pos['dominant_topic'] = assign_topics(corpus_pos, lda_pos)

if lda_neg and corpus_neg:
    df_neg['dominant_topic'] = assign_topics(corpus_neg, lda_neg)

sentiment_counts = df['sentiment'].value_counts()
# value_counts() orders labels by frequency, so colours are looked up by label
# rather than by position; otherwise a colour can land on the wrong sentiment.
sentiment_colors = {'positive': 'lightgreen', 'negative': 'salmon', 'neutral': 'gray'}
plt.figure(figsize=(6, 6))
plt.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=[sentiment_colors[name] for name in sentiment_counts.index],
)
plt.title("Sentiment Distribution")
plt.savefig(os.path.join(output_dir, "sentiment_pie_chart.png"))
plt.close()

def plot_keyword_bar(topics, label):
    """Save a bar chart of how many keywords each topic lists.

    ``topics`` is a list of dicts carrying 'Topic' and 'Keywords' entries
    ('Keywords' being a ", "-joined string). Nothing is drawn when it is empty.
    The chart is written to ``<output_dir>/<label>_topics_bar_chart.png``.
    """
    if not topics:
        return

    topic_names = []
    keyword_counts = []
    for entry in topics:
        topic_names.append(entry['Topic'])
        keyword_counts.append(len(entry['Keywords'].split(", ")))

    plt.figure(figsize=(10, 6))
    plt.bar(topic_names, keyword_counts)
    plt.title(f"Keyword Count per {label.capitalize()} Topic")
    plt.ylabel("Keyword Count")
    chart_path = os.path.join(output_dir, f"{label}_topics_bar_chart.png")
    plt.savefig(chart_path)
    plt.close()

# One keyword-count chart per sentiment.
plot_keyword_bar(pos_topics, 'positive')
plot_keyword_bar(neg_topics, 'negative')

def plot_topic_distribution(df_subset, label):
    """Save a bar chart of how many reviews fall under each dominant topic.

    Does nothing when ``df_subset`` has no 'dominant_topic' column. Topic ids
    are shown one-based. The chart is written to
    ``<output_dir>/<label>_topic_distribution.png``.
    """
    has_topics = 'dominant_topic' in df_subset.columns
    if not has_topics:
        return

    per_topic = df_subset['dominant_topic'].value_counts().sort_index()
    plt.figure(figsize=(10, 6))
    tick_names = [f"Topic {topic_id+1}" for topic_id in per_topic.index]
    plt.bar(tick_names, per_topic.values)
    plt.title(f"Review Count per Topic - {label.capitalize()}")
    plt.xlabel("Topic")
    plt.ylabel("Number of Reviews")
    chart_path = os.path.join(output_dir, f"{label}_topic_distribution.png")
    plt.savefig(chart_path)
    plt.close()

# One review-count-per-topic chart per sentiment.
plot_topic_distribution(df_pos, 'positive')
plot_topic_distribution(df_neg, 'negative')

print("Analysis completed successfully with better sentiment thresholds.")


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

# Stopword list is needed by the TF-IDF vectorizer further down.
nltk.download('stopwords', quiet=True)
tqdm.pandas()

# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
input_path = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/st_louis_reviews.csv"
output_dir = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/output"
os.makedirs(output_dir, exist_ok=True)
output_csv = os.path.join(output_dir, "ml_rating_predictions.csv")

df = pd.read_csv(input_path)
assert 'text' in df.columns and 'stars' in df.columns, "CSV must contain 'text' and 'stars'"
df['text'] = df['text'].fillna('').astype(str)
# .copy() makes the filtered frame independent of the one that was read, so
# the column assignment below is not a write to a slice (SettingWithCopyWarning).
df = df[df['stars'].notnull()].copy()
df['stars'] = df['stars'].astype(int)

# Features are the raw review texts; targets are the integer star ratings.
X = df['text']
y = df['stars']

# --- Train/test split, then TF-IDF vectorization ---
# The split is done on the raw texts first and the vectorizer is fitted on the
# training portion only. Fitting TF-IDF on all rows before splitting lets
# test-set vocabulary and document frequencies leak into the features.
# The split itself (same random_state, stratify and row count) is unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Vectorizing reviews with TF-IDF...")
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
    ngram_range=(1, 2)
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print("Training logistic regression...")
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases — confirm against the installed version.
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)

print("Predicting on test data...")
y_pred = clf.predict(X_test)

exact_accuracy = accuracy_score(y_test, y_pred)
# "Relaxed" counts a prediction as correct when it is within one star of the
# actual rating.
relaxed_correct = (abs(y_test.values - y_pred) <= 1)
relaxed_accuracy = relaxed_correct.mean()

print(f"\n Exact Match Accuracy: {exact_accuracy * 100:.2f}%")
print(f"Relaxed Accuracy (±1 star): {relaxed_accuracy * 100:.2f}%")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# y_test.index holds index *labels*. After the rows with missing 'stars' were
# dropped, labels no longer match positions, so the texts must be looked up
# with .loc; .iloc would pick the wrong rows or raise IndexError.
results_df = pd.DataFrame({
    'text': df.loc[y_test.index, 'text'].values,
    'actual_rating': y_test.values,
    'predicted_rating': y_pred,
    # True when the prediction is within one star of the actual rating.
    'correct_prediction': relaxed_correct
})
results_df.to_csv(output_csv, index=False)
print(f"Predictions saved to: {output_csv}")

# Exact-match confusion matrix over the star ratings 1-5
# (axes are labelled below: actual on y, predicted on x).
cm_exact = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4, 5])
plt.figure(figsize=(8, 6))
sns.heatmap(cm_exact, annot=True, fmt='d', cmap='Blues', xticklabels=[1, 2, 3, 4, 5], yticklabels=[1, 2, 3, 4, 5])
plt.xlabel("Predicted Rating")
plt.ylabel("Actual Rating")
plt.title("Confusion Matrix (Exact Match)")
plt.tight_layout()
exact_cm_path = os.path.join(output_dir, "confusion_matrix_exact.png")
plt.savefig(exact_cm_path)
plt.close()
print(f"Exact confusion matrix saved to: {exact_cm_path}")

def relaxed_cm(true, pred, labels=(1, 2, 3, 4, 5)):
    """Build a confusion matrix in which far-off predictions share one bucket.

    A prediction within one star of the true rating is kept as-is; any other
    prediction is replaced by the sentinel label -1.

    Args:
        true: Iterable of actual ratings.
        pred: Iterable of predicted ratings, aligned with ``true``.
        labels: Rating labels to include (any sequence; a list still works).

    Returns:
        ``(cm, all_labels)`` where ``all_labels`` is ``labels`` followed by -1
        and ``cm`` is the confusion matrix computed over ``all_labels``.
    """
    relaxed_pred = [p if abs(t - p) <= 1 else -1 for t, p in zip(true, pred)]
    # Immutable default plus an explicit copy: no list is shared between calls.
    all_labels = list(labels) + [-1]
    cm = confusion_matrix(true, relaxed_pred, labels=all_labels)
    return cm, all_labels

cm_relaxed, all_labels = relaxed_cm(y_test.values, y_pred)
# The matrix has one row per entry of all_labels, including the -1 sentinel.
# No actual rating is ever -1, so that last row is always empty and has no
# y tick label; it is dropped so the plotted rows match the rating labels.
rating_labels = [l for l in all_labels if l != -1]
plt.figure(figsize=(9, 6))
sns.heatmap(
    cm_relaxed[:len(rating_labels), :],
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    xticklabels=[str(l) if l != -1 else "Off" for l in all_labels],
    yticklabels=[str(l) for l in rating_labels]
)
plt.xlabel("Predicted Rating (±1 accepted)")
plt.ylabel("Actual Rating")
plt.title("Confusion Matrix (Relaxed ±1)")
plt.tight_layout()
relaxed_cm_path = os.path.join(output_dir, "confusion_matrix_relaxed.png")
plt.savefig(relaxed_cm_path)
plt.close()
print(f"Relaxed confusion matrix saved to: {relaxed_cm_path}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import gensim
import numpy as np
from gensim import corpora
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
import os

# Fetch the NLTK resources used below: tokenizer models, the stopword list and
# the VADER lexicon.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# Registers `progress_apply` on pandas objects.
tqdm.pandas()

# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
input_path = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/philly_reviews.csv"
output_dir = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/philly_output"
os.makedirs(output_dir, exist_ok=True)
# Minimum number of reviews get_topics() requires before it fits a topic model.
min_docs_needed = 10

df = pd.read_csv(input_path)
assert 'text' in df.columns, "CSV must contain 'text' column"
# Missing review text becomes an empty string so every row is a str.
df['text'] = df['text'].fillna('').astype(str)

def preprocess(text):
    """Tokenize a review into lowercase, alphabetic, non-stopword tokens.

    Args:
        text: Raw review text.

    Returns:
        A list of tokens. When nothing survives filtering, or when ``text``
        is not a string, ``['placeholder']`` is returned so that every review
        still maps to a non-empty document.
    """
    # Build the stopword set once and cache it on the function object instead
    # of calling stopwords.words('english') for every single token.
    stop_words = getattr(preprocess, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess._stop_words = stop_words

    try:
        tokens = nltk.word_tokenize(text.lower())
    except (AttributeError, TypeError):
        # Non-string input (e.g. NaN). Other failures, such as a missing
        # tokenizer resource, are deliberately not caught: swallowing them
        # would silently turn every review into a placeholder document.
        return ['placeholder']

    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens if tokens else ['placeholder']

# Analyzer whose 'compound' score is thresholded by classify_sentiment().
sia = SentimentIntensityAnalyzer()
print("Performing sentiment analysis with adjusted thresholds...")

# Score every review; progress_apply shows a tqdm progress bar.
df['compound'] = df['text'].progress_apply(lambda x: sia.polarity_scores(x)['compound'])

def classify_sentiment(score):
    """Turn a compound sentiment score into a label.

    Scores of 0.3 or more are 'positive', scores of -0.3 or less are
    'negative', and everything else is 'neutral'.
    """
    label = 'neutral'
    if score >= 0.3:
        label = 'positive'
    elif score <= -0.3:
        label = 'negative'
    return label

# Label each review from its compound score.
df['sentiment'] = df['compound'].apply(classify_sentiment)

# Persist the per-review score and label.
df[['text', 'compound', 'sentiment']].to_csv(os.path.join(output_dir, "review_sentiment_map.csv"), index=False)

# Only positive and negative reviews are passed on to topic modelling.
positive_reviews = df[df['sentiment'] == 'positive']['text']
negative_reviews = df[df['sentiment'] == 'negative']['text']

def get_topics(reviews, label, num_topics=5, num_words=6):
    """Fit an LDA topic model on a set of reviews and summarise its topics.

    Args:
        reviews: pandas Series of review texts.
        label: Sentiment label stored with each topic row (e.g. 'positive').
        num_topics: Requested number of topics; lowered when the filtered
            vocabulary has fewer terms than this.
        num_words: Number of top keywords reported per topic.

    Returns:
        A tuple ``(topic_keywords, lda, corpus, dictionary, texts)`` where
        ``topic_keywords`` is a list of dicts with the keys 'Sentiment',
        'Topic' and 'Keywords'. When no model can be fitted (too few reviews
        or no vocabulary left after filtering) the tuple is
        ``([], None, [], None, [])``.
    """
    if len(reviews) < min_docs_needed:
        print(f"Not enough {label} reviews ({len(reviews)}).")
        return [], None, [], None, []

    texts = reviews.progress_apply(preprocess).tolist()
    # Defensive: drop empty token lists before building the dictionary.
    texts = [t for t in texts if len(t) > 0]
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=2, no_above=0.9)

    # filter_extremes can remove every term; there is nothing to model then,
    # so report it instead of failing inside the LDA fit.
    if len(dictionary) == 0:
        print(f"No usable vocabulary left for {label} reviews after filtering.")
        return [], None, [], None, []

    corpus = [dictionary.doc2bow(text) for text in texts]

    if len(dictionary) < num_topics:
        num_topics = max(2, len(dictionary) // 5)

    lda = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

    # Read (topic_id, [(word, weight), ...]) pairs directly rather than
    # parsing the formatted '0.012*"word" + ...' strings, and label each topic
    # by its own id so the names match the ids assign_topics() returns.
    topic_keywords = []
    for topic_id, word_weights in lda.show_topics(num_topics=num_topics, num_words=num_words, formatted=False):
        topic_keywords.append({
            'Sentiment': label,
            'Topic': f"Topic {topic_id + 1}",
            'Keywords': ", ".join(word for word, _ in word_weights)
        })

    return topic_keywords, lda, corpus, dictionary, texts

# Collected topic rows for both sentiments, written to one CSV below.
topics_data = []

pos_topics, lda_pos, corpus_pos, dict_pos, texts_pos = get_topics(positive_reviews, 'positive')
topics_data.extend(pos_topics)

neg_topics, lda_neg, corpus_neg, dict_neg, texts_neg = get_topics(negative_reviews, 'negative')
topics_data.extend(neg_topics)

# Skip the export when neither sentiment produced any topics.
if topics_data:
    # This cell processes philly_reviews.csv; the file name previously carried
    # the "st_louis" prefix copied over from the St. Louis analysis.
    pd.DataFrame(topics_data).to_csv(os.path.join(output_dir, "philly_reviews_topics.csv"), index=False)

def assign_topics(corpus, lda_model):
    """Return, for each document in ``corpus``, the id of its highest-weighted topic.

    ``lda_model.get_document_topics(doc)`` is expected to yield
    ``(topic_id, weight)`` pairs; ties go to the first pair listed.
    """
    dominant_ids = []
    for bow in corpus:
        best_pair = max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])
        dominant_ids.append(best_pair[0])
    return dominant_ids

# One frame per sentiment; 'dominant_topic' is added only when a model was fitted.
df_pos = pd.DataFrame({'text': positive_reviews.values})
df_neg = pd.DataFrame({'text': negative_reviews.values})

if lda_pos and corpus_pos:
    df_pos['dominant_topic'] = assign_topics(corpus_pos, lda_pos)

if lda_neg and corpus_neg:
    df_neg['dominant_topic'] = assign_topics(corpus_neg, lda_neg)

sentiment_counts = df['sentiment'].value_counts()
# value_counts() orders labels by frequency, so colours are looked up by label
# rather than by position; otherwise a colour can land on the wrong sentiment.
sentiment_colors = {'positive': 'lightgreen', 'negative': 'salmon', 'neutral': 'gray'}
plt.figure(figsize=(6, 6))
plt.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=[sentiment_colors[name] for name in sentiment_counts.index],
)
plt.title("Sentiment Distribution")
plt.savefig(os.path.join(output_dir, "sentiment_pie_chart.png"))
plt.close()

def plot_keyword_bar(topics, label):
    """Save a bar chart of how many keywords each topic lists.

    ``topics`` is a list of dicts carrying 'Topic' and 'Keywords' entries
    ('Keywords' being a ", "-joined string). Nothing is drawn when it is empty.
    The chart is written to ``<output_dir>/<label>_topics_bar_chart.png``.
    """
    if not topics:
        return

    topic_names = []
    keyword_counts = []
    for entry in topics:
        topic_names.append(entry['Topic'])
        keyword_counts.append(len(entry['Keywords'].split(", ")))

    plt.figure(figsize=(10, 6))
    plt.bar(topic_names, keyword_counts)
    plt.title(f"Keyword Count per {label.capitalize()} Topic")
    plt.ylabel("Keyword Count")
    chart_path = os.path.join(output_dir, f"{label}_topics_bar_chart.png")
    plt.savefig(chart_path)
    plt.close()

# One keyword-count chart per sentiment.
plot_keyword_bar(pos_topics, 'positive')
plot_keyword_bar(neg_topics, 'negative')

def plot_topic_distribution(df_subset, label):
    """Save a bar chart of how many reviews fall under each dominant topic.

    Does nothing when ``df_subset`` has no 'dominant_topic' column. Topic ids
    are shown one-based. The chart is written to
    ``<output_dir>/<label>_topic_distribution.png``.
    """
    has_topics = 'dominant_topic' in df_subset.columns
    if not has_topics:
        return

    per_topic = df_subset['dominant_topic'].value_counts().sort_index()
    plt.figure(figsize=(10, 6))
    tick_names = [f"Topic {topic_id+1}" for topic_id in per_topic.index]
    plt.bar(tick_names, per_topic.values)
    plt.title(f"Review Count per Topic - {label.capitalize()}")
    plt.xlabel("Topic")
    plt.ylabel("Number of Reviews")
    chart_path = os.path.join(output_dir, f"{label}_topic_distribution.png")
    plt.savefig(chart_path)
    plt.close()

# One review-count-per-topic chart per sentiment.
plot_topic_distribution(df_pos, 'positive')
plot_topic_distribution(df_neg, 'negative')

print("Analysis completed successfully with better sentiment thresholds.")

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

# Stopword list is needed by the TF-IDF vectorizer further down.
nltk.download('stopwords', quiet=True)
tqdm.pandas()

# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
input_path = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/philly_reviews.csv"
output_dir = "C:/Users/User/OneDrive/Desktop/dsm_project/cleaneddataset/philly_output"
os.makedirs(output_dir, exist_ok=True)
output_csv = os.path.join(output_dir, "ml_rating_predictions.csv")

df = pd.read_csv(input_path)
assert 'text' in df.columns and 'stars' in df.columns, "CSV must contain 'text' and 'stars'"
df['text'] = df['text'].fillna('').astype(str)
# .copy() makes the filtered frame independent of the one that was read, so
# the column assignment below is not a write to a slice (SettingWithCopyWarning).
df = df[df['stars'].notnull()].copy()
df['stars'] = df['stars'].astype(int)

# Features are the raw review texts; targets are the integer star ratings.
X = df['text']
y = df['stars']

# The split is done on the raw texts first and the vectorizer is fitted on the
# training portion only. Fitting TF-IDF on all rows before splitting lets
# test-set vocabulary and document frequencies leak into the features.
# The split itself (same random_state, stratify and row count) is unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Vectorizing reviews with TF-IDF...")
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
    ngram_range=(1, 2)
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print("Training logistic regression...")
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases — confirm against the installed version.
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)

print("Predicting on test data...")
y_pred = clf.predict(X_test)

exact_accuracy = accuracy_score(y_test, y_pred)
# "Relaxed" counts a prediction as correct when it is within one star of the
# actual rating.
relaxed_correct = (abs(y_test.values - y_pred) <= 1)
relaxed_accuracy = relaxed_correct.mean()

print(f"\n Exact Match Accuracy: {exact_accuracy * 100:.2f}%")
print(f" Relaxed Accuracy (±1 star): {relaxed_accuracy * 100:.2f}%")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# y_test.index holds index *labels*. After the rows with missing 'stars' were
# dropped, labels no longer match positions, so the texts must be looked up
# with .loc; .iloc would pick the wrong rows or raise IndexError.
results_df = pd.DataFrame({
    'text': df.loc[y_test.index, 'text'].values,
    'actual_rating': y_test.values,
    'predicted_rating': y_pred,
    # True when the prediction is within one star of the actual rating.
    'correct_prediction': relaxed_correct
})
results_df.to_csv(output_csv, index=False)
print(f"Predictions saved to: {output_csv}")

# Exact-match confusion matrix over the star ratings 1-5
# (axes are labelled below: actual on y, predicted on x).
cm_exact = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4, 5])
plt.figure(figsize=(8, 6))
sns.heatmap(cm_exact, annot=True, fmt='d', cmap='Blues', xticklabels=[1, 2, 3, 4, 5], yticklabels=[1, 2, 3, 4, 5])
plt.xlabel("Predicted Rating")
plt.ylabel("Actual Rating")
plt.title("Confusion Matrix (Exact Match)")
plt.tight_layout()
exact_cm_path = os.path.join(output_dir, "confusion_matrix_exact.png")
plt.savefig(exact_cm_path)
plt.close()
print(f"📊 Exact confusion matrix saved to: {exact_cm_path}")

def relaxed_cm(true, pred, labels=(1, 2, 3, 4, 5)):
    """Build a confusion matrix in which far-off predictions share one bucket.

    A prediction within one star of the true rating is kept as-is; any other
    prediction is replaced by the sentinel label -1.

    Args:
        true: Iterable of actual ratings.
        pred: Iterable of predicted ratings, aligned with ``true``.
        labels: Rating labels to include (any sequence; a list still works).

    Returns:
        ``(cm, all_labels)`` where ``all_labels`` is ``labels`` followed by -1
        and ``cm`` is the confusion matrix computed over ``all_labels``.
    """
    relaxed_pred = [p if abs(t - p) <= 1 else -1 for t, p in zip(true, pred)]
    # Immutable default plus an explicit copy: no list is shared between calls.
    all_labels = list(labels) + [-1]
    cm = confusion_matrix(true, relaxed_pred, labels=all_labels)
    return cm, all_labels

cm_relaxed, all_labels = relaxed_cm(y_test.values, y_pred)
# The matrix has one row per entry of all_labels, including the -1 sentinel.
# No actual rating is ever -1, so that last row is always empty and has no
# y tick label; it is dropped so the plotted rows match the rating labels.
rating_labels = [l for l in all_labels if l != -1]
plt.figure(figsize=(9, 6))
sns.heatmap(
    cm_relaxed[:len(rating_labels), :],
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    xticklabels=[str(l) if l != -1 else "Off" for l in all_labels],
    yticklabels=[str(l) for l in rating_labels]
)
plt.xlabel("Predicted Rating (±1 accepted)")
plt.ylabel("Actual Rating")
plt.title("Confusion Matrix (Relaxed ±1)")
plt.tight_layout()
relaxed_cm_path = os.path.join(output_dir, "confusion_matrix_relaxed.png")
plt.savefig(relaxed_cm_path)
plt.close()
print(f"Relaxed confusion matrix saved to: {relaxed_cm_path}")
