In [3]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
import pandas as pd


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [4]:
DATA_DIR = Path("data")
RAW = DATA_DIR / "reddit_data.csv"
CLEAN = DATA_DIR / "reddit_data_clean.csv"
SEM = DATA_DIR / "reddit_data_semantic_clean.csv"
SENT = DATA_DIR / "reddit_data_sentiment.csv"
EVAL = DATA_DIR / "evaluation_results.csv"

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("PINECONE_INDEX", "reddit-insights")

pc = Pinecone(api_key=PINECONE_API_KEY)
emb = OpenAIEmbeddings(model="text-embedding-3-small")
docsearch = PineconeVectorStore.from_existing_index(index_name=INDEX_NAME, embedding=emb)

# Dataset Summary

In [5]:
def describe_file(path):
    if not path.exists():
        return {"exists": False}
    df = pd.read_csv(path)
    return {
        "exists": True,
        "rows": len(df),
        "columns": df.columns.tolist(),
        "missing_values": int(df.isna().sum().sum())
    }

summary = {
    "raw": describe_file(RAW),
    "clean": describe_file(CLEAN),
    "semantic": describe_file(SEM),
    "sentiment": describe_file(SENT)
}

def dict_to_table(d):
    lines = ['File      Exists   Rows   Missing   Columns']
    lines.append('-' * 50)

    for name, info in d.items():
        exists = '✓' if info['exists'] else '✗'
        rows = f"{info.get('rows', '-')}"
        missing = f"{info.get('missing_values', '-')}"
        cols = ', '.join(info.get('columns', []))
        lines.append(f"{name:<9} {exists:<7} {rows:>6} {missing:>9} {cols}")

    return '\n'.join(lines)

print(dict_to_table(summary))


File      Exists   Rows   Missing   Columns
--------------------------------------------------
raw       ✓       182086    348780 id, category, subreddit, title, content, author, score, num_comments, created_utc, edited, type, parent_id
clean     ✓       168434    488193 id, category, subreddit, title, content, author, score, num_comments, created_utc, edited, type, parent_id, full_text, keywords_found, clean_text, word_count, drop_reason
semantic  ✓        13408     45072 id, category, subreddit, title, content, author, score, num_comments, created_utc, edited, type, parent_id, full_text, keywords_found, clean_text, word_count, drop_reason, semantic_category, semantic_score, final_category
sentiment ✓        13408     45072 id, category, subreddit, title, content, author, score, num_comments, created_utc, edited, type, parent_id, full_text, keywords_found, clean_text, word_count, drop_reason, semantic_category, semantic_score, final_category, sentiment


# Semantic Analysis

In [6]:
df_sem = pd.read_csv(SEM)

df_sem["final_category"].value_counts().plot(kind="bar", title="Semantic Category Distribution")
plt.show()

df_sem["semantic_score"].hist(bins=30)
plt.title("Semantic Score Distribution")
plt.show()


  plt.show()
  plt.show()


# Sentiment Analysis

In [7]:
df_sent = pd.read_csv(SENT)

df_sent["sentiment"].value_counts().plot(kind="bar", title="Sentiment Distribution")
plt.show()

# Top negative words
from collections import Counter
neg_words = " ".join(df_sent[df_sent["sentiment"]=="negative"]["clean_text"]).split()
print("Top 25 Negative Words:\n" , Counter(neg_words).most_common(25))



Top 25 Negative Words:
 [('the', 10393), ('to', 9043), ('a', 7902), ('and', 5679), ('i', 5488), ('in', 4207), ('this', 3935), ('of', 3742), ('have', 3043), ('it', 2966), ('or', 2769), ('you', 2656), ('that', 2424), ('is', 2348), ('for', 2085), ('was', 2074), ('law', 2006), ('if', 1797), ('response', 1777), ('brief', 1754), ('on', 1728), ('from', 1692), ('with', 1484), ('all', 1480), ('s', 1472)]


  plt.show()


# Retrieval Evaluation

In [8]:
test_queries = [
    "What issues do users report about Clio?",
    "What frustrations exist with Jira?",
    "What are problems with Procore?",
]

scores = []

for q in test_queries:
    results = docsearch.similarity_search_with_score(q, k=5)
    scores.append([q, len(results), results[0][1] if results else None])

retrieval_df = pd.DataFrame(scores, columns=["query", "docs_retrieved", "top_similarity"])
retrieval_df


Unnamed: 0,query,docs_retrieved,top_similarity
0,What issues do users report about Clio?,5,0.598856
1,What frustrations exist with Jira?,5,0.592826
2,What are problems with Procore?,5,0.667938


# RAG vs LLM Evaluation

In [9]:
eval_df = pd.read_csv(EVAL)
eval_df.head(20)

Unnamed: 0,question,rag_answer,llm_answer,rag_relevance,llm_relevance,documents_used,avg_evidence_score
0,What are the most discussed software tools for...,The most discussed software tools for project ...,"In the construction industry, effective projec...",5,5,20,0.478118
1,Which legal practice management tools are popu...,Popular legal practice management tools among ...,"As of my last update in October 2023, several ...",5,5,18,0.560471
2,What do Reddit users think about using Clio fo...,Reddit users have mixed opinions about using C...,Reddit users generally have a mix of opinions ...,5,5,18,0.568943
3,How does sentiment differ between users discus...,The sentiment regarding Procore is generally p...,To analyze sentiment differences between users...,5,5,20,0.471679
4,Which software tools are most frequently menti...,The most frequently mentioned software tools f...,"In tech companies, several software tools are ...",5,5,18,0.518891
5,What pain points do construction professionals...,Construction professionals express several pai...,Construction professionals often face several ...,5,5,20,0.483099
6,Are there any open-source alternatives to expe...,The context does not provide specific open-sou...,"Yes, there are several open-source alternative...",2,5,20,0.47104
7,What kind of feedback do users give about usin...,I don't know based on the provided Reddit data.,Users often provide a range of feedback about ...,1,5,18,0.397121
8,Which CRM tools are mentioned by users working...,Users in the tech industry mentioned the follo...,Users in the tech industry often mention a var...,5,5,20,0.479207
9,Do Reddit users report any security concerns r...,"Yes, there are reported security concerns rela...","Yes, Reddit users have expressed various secur...",5,5,17,0.483223


# RAG Relevance vs LLM

In [10]:
eval_df["rag_relevance"] = pd.to_numeric(eval_df["rag_relevance"], errors="coerce")
eval_df["llm_relevance"] = pd.to_numeric(eval_df["llm_relevance"], errors="coerce")

plt.figure(figsize=(8,5))
sns.barplot(data=eval_df[["rag_relevance","llm_relevance"]])
plt.title("Average Relevance Score: RAG vs LLM")
plt.show()

eval_df.mean(numeric_only=True)


  plt.show()


rag_relevance          3.200000
llm_relevance          5.000000
documents_used        19.100000
avg_evidence_score     0.410372
dtype: float64

# Generate Final HTML/PDF Report

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport
from pathlib import Path


# Load datasets
clean = pd.read_csv("data/reddit_data_clean.csv")
semantic = pd.read_csv("data/reddit_data_semantic_clean.csv")
sentiment = pd.read_csv("data/reddit_data_sentiment.csv")
eval_df = pd.read_csv("data/evaluation_results.csv")

# REDUCE NOISY COLUMNS (keep only useful ones)
useful_clean_cols = [
    "category", "subreddit", "full_text",
    "keywords_found", "clean_text", "word_count",
]

useful_sem_cols = [
    "semantic_category", "semantic_score", "final_category"
]

useful_sent_cols = ["sentiment"]

useful_eval_cols = [
    "rag_relevance", "llm_relevance", "avg_evidence_score",
]

clean = clean[useful_clean_cols].add_prefix("clean_")
semantic = semantic[useful_sem_cols].add_prefix("semantic_")
sentiment = sentiment[useful_sent_cols].add_prefix("sent_")

# evaluation column names unknown → filter safely
eval_df = eval_df[[c for c in eval_df.columns if c.lower() in {
    "rag_relevance", "llm_relevance", "avg_evidence_score"
}]].add_prefix("eval_")


# COMBINE INTO ANALYTICAL DATAFRAME
analytic_df = pd.concat([clean, semantic, sentiment, eval_df], axis=1)


# CREATE PROFILE REPORT
profile = ProfileReport(
    analytic_df,
    title="Week 7 — Reddit Insights Chatbot Evaluation Report",
    explorative=False,
    minimal=True,
)

try:
    # Enable all meaningful correlations safely
    profile.config.vars.correlations.set(
        {
            "pearson": True,
            "spearman": True,
            "kendall": True,
            "phi_k": True,
        }
    )
except Exception as e:
    print("Correlation config skipped:", e)


# EXPORT REPORT
profile.to_file("week7_data_profile_report.html")
print("Generated → week7_data_profile_report.html")


Correlation config skipped: 'Univariate' object has no attribute 'correlations'


100%|██████████| 13/13 [00:24<00:00,  1.88s/it]6<00:23,  4.66s/it, Describe variable: eval_avg_evidence_score]
Summarize dataset: 100%|██████████| 19/19 [00:26<00:00,  1.40s/it, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:07<00:00,  7.66s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  6.34it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 92.28it/s]

Generated → week7_data_profile_report.html



