In [4]:
import os 
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv


In [5]:
import os
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
from urllib.parse import quote_plus

def load_csv_to_postgres():
    """
    Reads a CSV file, adds 'id' and 'analysis_status' columns,
    and loads the contents into a PostgreSQL table.
    """
    # --- 1. Configuration ---
    load_dotenv(override=True)

    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")

    csv_file_path = "data/sampled_amazon_data.csv"
    table_name = "raw_reviews"

    if not all([db_user, db_password, db_host, db_port, db_name]):
        print("Error: Database configuration is missing in the .env file.")
        return

    try:
        # --- 2. Read the CSV file and add new columns ---
        print(f"Reading CSV file from '{csv_file_path}'...")
        df = pd.read_csv(csv_file_path)
        print(f"Successfully loaded {len(df)} rows into a DataFrame.")

        # --- THIS IS THE NEW LINE ---
        # Add a unique 'id' column, starting from 1.
        df.insert(0, 'id', range(1, 1 + len(df)))
        print("Added 'id' column to the DataFrame.")

        # Add the 'analysis_status' column
        df['analysis_status'] = 'pending'
        print("Added 'analysis_status' column to the DataFrame.")

        # --- 3. Connect to the Database ---
        encoded_password = quote_plus(db_password)
        connection_url = f"postgresql://{db_user}:{encoded_password}@{db_host}:{db_port}/{db_name}"
        engine = create_engine(connection_url)
        print(f"Connecting to database '{db_name}'...")

        # --- 4. Load data into PostgreSQL ---
        print(f"Loading data into table '{table_name}'...")
        df.to_sql(
            name=table_name,
            con=engine,
            if_exists='replace',
            index=False
        )
        
        print(f"✅ Success! Data has been loaded into the '{table_name}' table.")

    except FileNotFoundError:
        print(f"Error: The file '{csv_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")



In [6]:
load_csv_to_postgres()

Reading CSV file from 'data/sampled_amazon_data.csv'...
Successfully loaded 200 rows into a DataFrame.
Added 'id' column to the DataFrame.
Added 'analysis_status' column to the DataFrame.
Connecting to database 'cx_hackathon_db'...
Loading data into table 'raw_reviews'...
✅ Success! Data has been loaded into the 'raw_reviews' table.


## Getting Data from Postgres

In [14]:
import os
import pandas as pd
from sqlalchemy import create_engine, engine as sqlalchemy_engine
from dotenv import load_dotenv
from urllib.parse import quote_plus
from typing import Optional

In [15]:
def create_db_engine() -> Optional[sqlalchemy_engine.Engine]:
    """
    Reads database credentials from the .env file and creates a
    SQLAlchemy engine.

    Returns:
        sqlalchemy_engine.Engine: A SQLAlchemy engine instance, or None if config is missing.
    """

    load_dotenv()

    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")
    db_host = os.getenv("DB_HOST")
    db_port = os.getenv("DB_PORT")
    db_name = os.getenv("DB_NAME")

    if not all([db_user, db_password, db_host, db_port, db_name]):
        print("Error: Database configuration is missing in the .env file.")
        return None
    
    try:
        encoded_password = quote_plus(db_password)
        connection_url = f"postgresql://{db_user}:{encoded_password}@{db_host}:{db_port}/{db_name}"
        engine = create_engine(connection_url)
        # Test the connection to ensure it's valid
        engine.connect()
        return engine
    except Exception as e:
        print(f"Error creating database engine: {e}")
        return None



In [16]:

def fetch_pending_reviews_from_sql(engine: sqlalchemy_engine.Engine, limit: int = 100) -> pd.DataFrame:
    """
    Fetches a batch of reviews with 'pending' status using a provided database engine.

    Args:
        engine: An active SQLAlchemy engine.
        limit (int): The maximum number of reviews to fetch.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the pending reviews.
    """
    table_name = "raw_reviews"
    sql_query = f"""
        SELECT * FROM {table_name} 
        WHERE analysis_status = 'pending' 
        LIMIT {limit};
    """
    
    try:
        print(f"Fetching up to {limit} pending reviews...")
        df = pd.read_sql_query(sql_query, engine)
        return df
    except Exception as e:
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

In [17]:
print("Connecting to the database")
db_engine = create_db_engine()

if db_engine:
    print("Connection successful.")
    pending_reviews_df = fetch_pending_reviews_from_sql(engine=db_engine, limit=2000)
    
    if not pending_reviews_df.empty:
            print(f"\n✅ Success! Fetched {len(pending_reviews_df)} pending reviews.")
            print("--- DataFrame Head ---")
            print(pending_reviews_df.head())
    else:
        print("\nNo pending reviews found or an error occurred during fetching.")
else:
    print("Could not create database engine. Aborting.")


Connecting to the database
Connection successful.
Fetching up to 2000 pending reviews...

✅ Success! Fetched 200 pending reviews.
--- DataFrame Head ---
         ASIN                                              Title  \
0  B002K6AHQY   CND Vinylux Weekly Nail Polish, Rock Royalty,...   
1  B00176GSEI  HOT TOOLS Professional 24k Gold Extra-Long Bar...   
2  B000ASDGK8                   BaBylissPRO Ceramix Xtreme Dryer   
3  B002K6AHQY   CND Vinylux Weekly Nail Polish, Rock Royalty,...   
4  B00FYSZDQ4                            COLOR WOW Root Cover Up   

                                         Description  \
0  Vinylux weekly polish and weekly top coat are ...   
1  Hot Tools Professional 1110 Curling Iron with ...   
2  2000 Watt ceramic technology dryer with concen...   
3  Vinylux weekly polish and weekly top coat are ...   
4  Color Wow root cover up, winner of 44 major be...   

                                            ImageURL  Rating  Verified  \
0  https://images-na.ssl-im

In [18]:
pending_reviews_df.head()

Unnamed: 0,ASIN,Title,Description,ImageURL,Rating,Verified,ReviewTime,Review,Summary,Domestic Shipping,International Shipping,Sentiment,analysis_status
0,B002K6AHQY,"CND Vinylux Weekly Nail Polish, Rock Royalty,...",Vinylux weekly polish and weekly top coat are ...,https://images-na.ssl-images-amazon.com/images...,3.0,True,2015-10-19,This color not as good as the others,Three Stars,"Currently, item can be shipped only within the...",This item is not eligible for international sh...,,pending
1,B00176GSEI,HOT TOOLS Professional 24k Gold Extra-Long Bar...,Hot Tools Professional 1110 Curling Iron with ...,https://images-na.ssl-images-amazon.com/images...,5.0,True,2015-11-17,I've been using this curling iron for YEARS an...,The Best Curling Iron,,,1.0,pending
2,B000ASDGK8,BaBylissPRO Ceramix Xtreme Dryer,2000 Watt ceramic technology dryer with concen...,https://images-na.ssl-images-amazon.com/images...,5.0,True,2015-09-30,I think it works well and I love it. Much bett...,Five Stars,,,1.0,pending
3,B002K6AHQY,"CND Vinylux Weekly Nail Polish, Rock Royalty,...",Vinylux weekly polish and weekly top coat are ...,https://images-na.ssl-images-amazon.com/images...,2.0,True,2015-07-03,Not my favorite color. I expected more yellow ...,Just eh. Beware of shimmer.,"Currently, item can be shipped only within the...",This item is not eligible for international sh...,-1.0,pending
4,B00FYSZDQ4,COLOR WOW Root Cover Up,"Color Wow root cover up, winner of 44 major be...",https://images-na.ssl-images-amazon.com/images...,5.0,True,2015-04-16,Was recommended by a friend . Easy to apply. ...,nice product,,,1.0,pending


In [19]:
pending_reviews_df.columns

Index(['ASIN', 'Title', 'Description', 'ImageURL', 'Rating', 'Verified',
       'ReviewTime', 'Review', 'Summary', 'Domestic Shipping',
       'International Shipping', 'Sentiment', 'analysis_status'],
      dtype='object')

In [21]:
from sklearn.model_selection import train_test_split
from typing import Tuple

In [22]:
# 1. YOUR WORKING SPLIT FUNCTION (with a minor fix)
def split_data_for_evaluation(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits a DataFrame into training and testing sets (80-20).
    The split is stratified by the 'Rating' column.
    """
    if df.empty:
        print("Input DataFrame is empty. Cannot split.")
        return pd.DataFrame(), pd.DataFrame()

    features = df['Review']
    target = df['Rating']

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target
    )
    
    # --- FIX ---
    # Convert the Series back to a DataFrame before adding a new column
    train_df = X_train.to_frame(name='Review')
    train_df['Rating'] = y_train
    
    test_df = X_test.to_frame(name='Review')
    test_df['Rating'] = y_test

    return train_df, test_df


In [23]:
import pandas as pd
from typing import List, Dict

def prepare_data_for_llm(df: pd.DataFrame) -> List[Dict]:
    """
    Converts a DataFrame into a list of dictionaries, selecting and renaming
    the columns needed for the LLM analysis.

    Args:
        df (pd.DataFrame): The DataFrame to prepare (e.g., your train_df).

    Returns:
        List[Dict]: A list of dictionaries, ready for the LLM.
    """
    if 'Review' not in df.columns:
        print("Error: DataFrame is missing the 'Review' column.")
        return []
        
    # Select only the review text column
    df_selected = df[['Review']]
    
    # Rename 'Review' to 'text' for clarity and consistency in the LLM prompt
    df_selected = df_selected.rename(columns={'Review': 'text'})
    
    # Convert the DataFrame to a list of dictionaries (e.g., [{'text': '...'}, {'text': '...'}])
    records = df_selected.to_dict(orient='records')
    
    return records


In [24]:
# 1. Use your existing function to split the data
train_df, test_df = split_data_for_evaluation(pending_reviews_df)
print(f"--- Data Split: Training set created with {len(train_df)} reviews. ---")

    # 2. Use the new function to prepare the training data for the LLM
llm_ready_data = prepare_data_for_llm(train_df)

print("\n--- Data Prepared for LLM ---")
print(f"Successfully converted training set to a list of {len(llm_ready_data)} dictionaries.")
print("First item in the list (ready to be sent to LLM):")
print(llm_ready_data[0])

--- Data Split: Training set created with 160 reviews. ---

--- Data Prepared for LLM ---
Successfully converted training set to a list of 160 dictionaries.
First item in the list (ready to be sent to LLM):
{'text': "Great stuff. I mix right in the lid with about 1.5 to 2 teaspoons of hot water 10 swirls of soap on a well shaken rinsed brush. About 100 swirls and then I squeegee the soap out of the brush with my fingers and apply to my face with my fingers. Produces a thick rich lather that can be thin or thick as you wish water water and swirls. Only complaint is that toward the end of the soap, it can lose its frothiness and then the above technique doesn't seem to work, so I prefer the travel size for that reason (3 oz vice 5oz.). My face never feels dried-out after using. (I'm in my early 50's with an olive complexion).  This stuff lasts me a minimum of 3 months, probably even more. The bottom line is that you don't have to buy the fancy, expensive stuff to get a decent shave."}


In [26]:
import pandas as pd
import json
from typing import Dict, List
from tqdm import tqdm

# --- LangChain and Pydantic for LLM Analysis ---
from pydantic import BaseModel, Field
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser

# 1. Define the desired structured output
class ReviewAnalysis(BaseModel):
    sentiment: str = Field(description="The overall sentiment: 'Positive', 'Negative', or 'Neutral'")
    main_topic: str = Field(description="The single most fitting category, e.g., 'Durability', 'Performance'.")
    key_drivers: Dict[str, str] = Field(description="A dictionary of features to their sentiment, e.g., {'Battery Life': 'Negative'}.")

# 2. Helper function to analyze a SINGLE review
def analyze_review(review_data: Dict, llm_chain) -> Dict:
    """Invokes the LLM chain for a single review dictionary."""
    try:
        # The input to the chain now correctly matches what the chain expects
        result = llm_chain.invoke(review_data)
        return result.dict()
    except Exception as e:
        # It's helpful to print the failing review text for debugging
        print(f"--- Could not analyze review: '{review_data['text']}'. Error: {e} ---")
        return None


    
# --- SETUP (The Corrected Part) ---
# Create the parser ONCE
parser = PydanticOutputParser(pydantic_object=ReviewAnalysis)

# Create the template
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are an expert CX analyst. Analyze the user's review and respond ONLY with a JSON object that follows these instructions:\n{format_instructions}"),
    ("human", "Here is the review text to analyze: {text}")
])

# --- THIS IS THE FIX ---
# Partially format the prompt with the parser's instructions.
# This "bakes in" the format_instructions, so the chain only needs 'text' later.
prompt = prompt_template.partial(format_instructions=parser.get_format_instructions())

# Initialize the model and create the final chain
model = ChatOllama(model="llama3")
chain = prompt | model | parser
# --- END OF FIX ---


# --- THE ANALYSIS LOOP ---
analysis_results = []
print(f"\n--- Starting analysis on the list of {len(llm_ready_data)} reviews ---")

for review_data in tqdm(llm_ready_data, desc="Analyzing Reviews"):
    result = analyze_review(review_data, chain)
    if result:
        result['original_text'] = review_data['text']
        analysis_results.append(result)

# --- FINAL OUTPUT ---
print("\n\n🏁 --- Analysis complete! ---")
print("--- Final list of all analysis results: ---")
print(json.dumps(analysis_results, indent=2))


--- Starting analysis on the list of 160 reviews ---


Analyzing Reviews:   0%|          | 0/160 [00:00<?, ?it/s]/var/folders/wh/4xk7n0ss7xg9nxqk4znmw1d80000gn/T/ipykernel_12253/33891073.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return result.dict()
Analyzing Reviews:   1%|          | 1/160 [00:10<27:09, 10.25s/it]/var/folders/wh/4xk7n0ss7xg9nxqk4znmw1d80000gn/T/ipykernel_12253/33891073.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return result.dict()
Analyzing Reviews:   1%|▏         | 2/160 [00:13<16:52,  6.41s/it]/var/folders/wh/4xk7n0ss7xg9nxqk4znmw1d80000gn/T/ipykernel_12253/33891073.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Dep

--- Could not analyze review: 'Was hoping that it would make my lashes a bit darker.  BUT, the length is great!!!!'. Error: Invalid json output: Here is the analysis output in JSON format:

{
    "properties": {
        "sentiment": "Positive",
        "main_topic": "Length",
        "key_drivers": {
            "Darker Lashes": "Negative",
            "Length": "Positive"
        }
    },
    "required": ["sentiment", "main_topic", "key_drivers"]
}
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE  ---


/var/folders/wh/4xk7n0ss7xg9nxqk4znmw1d80000gn/T/ipykernel_12253/33891073.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return result.dict()
Analyzing Reviews:   5%|▌         | 8/160 [00:31<08:46,  3.46s/it]/var/folders/wh/4xk7n0ss7xg9nxqk4znmw1d80000gn/T/ipykernel_12253/33891073.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return result.dict()
Analyzing Reviews:   6%|▌         | 9/160 [00:35<08:54,  3.54s/it]/var/folders/wh/4xk7n0ss7xg9nxqk4znmw1d80000gn/T/ipykernel_12253/33891073.py:24: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydant

--- Could not analyze review: 'it is best~'. Error: Failed to parse ReviewAnalysis from completion {"properties": {"sentiment": "Positive", "main_topic": "Overall Satisfaction", "key_drivers": {"Best": "Positive"}}, "required": ["sentiment", "main_topic", "key_drivers"]}. Got: 3 validation errors for ReviewAnalysis
sentiment
  Field required [type=missing, input_value={'properties': {'sentimen..._topic', 'key_drivers']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
main_topic
  Field required [type=missing, input_value={'properties': {'sentimen..._topic', 'key_drivers']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
key_drivers
  Field required [type=missing, input_value={'properties': {'sentimen..._topic', 'key_drivers']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
For troubleshooting, visit: https://python.langchain.com/docs/troubleshoot

Analyzing Reviews:   9%|▉         | 14/160 [01:00<10:33,  4.34s/it]


KeyboardInterrupt: 

In [27]:
analysis_results

[{'sentiment': 'Positive',
  'main_topic': 'Durability',
  'key_drivers': {'Lathering Ability': 'Positive',
   'Soap Quality': 'Positive',
   'Shaving Experience': 'Positive'},
  'original_text': "Great stuff. I mix right in the lid with about 1.5 to 2 teaspoons of hot water 10 swirls of soap on a well shaken rinsed brush. About 100 swirls and then I squeegee the soap out of the brush with my fingers and apply to my face with my fingers. Produces a thick rich lather that can be thin or thick as you wish water water and swirls. Only complaint is that toward the end of the soap, it can lose its frothiness and then the above technique doesn't seem to work, so I prefer the travel size for that reason (3 oz vice 5oz.). My face never feels dried-out after using. (I'm in my early 50's with an olive complexion).  This stuff lasts me a minimum of 3 months, probably even more. The bottom line is that you don't have to buy the fancy, expensive stuff to get a decent shave."},
 {'sentiment': 'Posit

In [7]:
# import ollama
from langchain_ollama import ChatOllama

In [12]:
model = ChatOllama(model="llama3", temperature=0)
model.invoke("Hello, world!")

AIMessage(content="Hello there! It's great to meet you! Welcome to the world of AI-powered conversations. I'm here to help answer your questions, provide information, and have some fun with you. What brings you to this corner of the internet today?", additional_kwargs={}, response_metadata={'model': 'llama3', 'created_at': '2025-08-13T06:04:40.511029Z', 'done': True, 'done_reason': 'stop', 'total_duration': 8740289833, 'load_duration': 25427458, 'prompt_eval_count': 14, 'prompt_eval_duration': 280155250, 'eval_count': 50, 'eval_duration': 2817608458, 'model_name': 'llama3'}, id='run--e5212a7d-def9-4791-8bab-1157dd51158d-0', usage_metadata={'input_tokens': 14, 'output_tokens': 50, 'total_tokens': 64})