In [1]:
!pip install load_dotenv
!pip install pydantic
!pip install langchain_huggingface
!pip install langchain_core



In [None]:
import os

# The relative "data/..." paths used by later cells are resolved against this directory.
milestone_dir = '/content/565sentiment_DJ/Milestone-1'
os.chdir(milestone_dir)
print(os.getcwd())

/content/565sentiment_DJ


In [2]:
import os
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
import pandas as pd
from langchain_huggingface import HuggingFacePipeline
from langchain_core.messages import SystemMessage, HumanMessage
from pydantic import BaseModel, Field
import json

In [3]:
# Load LLM
# Generation settings handed to the underlying text-generation pipeline.
# NOTE(review): return_full_text=False presumably keeps the prompt out of the
# returned text so only the completion reaches the parsers -- confirm against
# the transformers pipeline documentation.
generation_kwargs = {"return_full_text": False, "max_new_tokens": 200}

llm = HuggingFacePipeline.from_model_id(
    model_id="ibm-granite/granite-3.2-2b-instruct",
    task="text-generation",
    device_map="cuda",
    pipeline_kwargs=generation_kwargs,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


# Task 1: Sentiment Analysis

In [4]:
# Load data and extract Sentences

def load_df_and_sents(data_path):
    """Read the CSV at ``data_path`` and return ``(frame, sentences)``.

    ``frame`` is the full DataFrame produced by ``pd.read_csv``;
    ``sentences`` is its ``sentence`` column converted to a plain list.
    """
    frame = pd.read_csv(data_path)
    return frame, list(frame.sentence)

# Read the sentiment dataset; the frame is kept for inspection, the list feeds the LLM loop.
sentiment_df, sentiment_sentences = load_df_and_sents(
    data_path="data/milestone-1-eng_sentiment.csv"
)
sentiment_df.head()

Unnamed: 0,sentence id,sentence,class-label
0,1,I absolutely adore how this workout app counts...,mixed
1,2,I can't believe how well these wireless earbud...,mixed
2,3,I wholeheartedly recommend this mental health ...,positive
3,4,So the new streaming service has excellent con...,mixed
4,5,"I guess it's not totally horrible, but I'd be ...",negative


In [5]:
# Peek at the first three sentences.
sentiment_sentences[:3]

["I absolutely adore how this workout app counts steps, but it fails to sync half the time, so I'm stuck redoing workouts.",
 "I can't believe how well these wireless earbuds fit, though I keep losing connection outdoors, which is so annoying.",
 "I wholeheartedly recommend this mental health platform; I've never felt more supported or validated."]

In [6]:
# Response schema for the sentiment-analysis use case:
# one field for the predicted label, one for a short justification.
sentiment_schema = ResponseSchema(
    description="result of sentiment analysis, return 'positive', 'negative', 'mixed' or 'neutral'",
    name="sentiment_label",
)
explanation_schema = ResponseSchema(
    description="short explanation for the result of sentiment analysis",
    name="explanation",
)
response_schemas = [sentiment_schema, explanation_schema]

# StructuredOutputParser both parses the model output and renders the matching
# format instructions that are embedded into the prompt.
sentiment_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = sentiment_parser.get_format_instructions(only_json=True)
print(format_instructions)


```json
{
	"sentiment_label": string  // result of sentiment analysis, return 'positive', 'negative', 'mixed' or 'neutral'
	"explanation": string  // short explanation for the result of sentiment analysis
}
```


In [8]:
# Prompt text describing the task; {text} is filled per sentence and
# {format_instructions} carries the JSON layout the parser expects.
sentiment_template = """
Please analyze the sentiment of the text and output only the result in json format, with sentiment_label and explanation for the label. Don't output anything else:

TEXT: {text}

{format_instructions}
"""

# Bind the format instructions once so each call only has to supply "text".
prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
    template=sentiment_template,
)

# prompt -> model -> structured parser
sentiment_chain = prompt | llm | sentiment_parser

# Run the chain once per sentence and collect one flat record per prediction.
results = []

for sentence in sentiment_sentences:
    parsed = sentiment_chain.invoke({"text": sentence})
    record = {
        "text": sentence,
        "sentiment": parsed["sentiment_label"],
        "explanation": parsed["explanation"],
    }
    results.append(record)

sentiment_pred_df = pd.DataFrame(results)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [9]:
# Show the first five predictions.
sentiment_pred_df.head(n=5)

Unnamed: 0,text,sentiment,explanation
0,I absolutely adore how this workout app counts...,mixed,The text expresses a strong positive sentiment...
1,I can't believe how well these wireless earbud...,mixed,The user expresses satisfaction with the fit o...
2,I wholeheartedly recommend this mental health ...,positive,The user expresses enthusiastic recommendation...
3,So the new streaming service has excellent con...,mixed,The text expresses a mixed sentiment. It ackno...
4,"I guess it's not totally horrible, but I'd be ...",negative,The user expresses a negative sentiment toward...


In [10]:
# Save the result
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
sentiment_results_path = "data/milestone1_result/sentiment_analysis_results.csv"
os.makedirs(os.path.dirname(sentiment_results_path), exist_ok=True)
sentiment_pred_df.to_csv(sentiment_results_path, index=False)

# Task 2: Toxicity Detection

In [11]:
# Read the toxicity dataset; the frame is kept for inspection, the list feeds the LLM loop.
toxicity_df, toxicity_sents = load_df_and_sents(
    data_path="data/milestone-1-eng_toxicity.csv"
)
toxicity_df.head()

Unnamed: 0,sentence_id,label,sentence
0,1,toxic,"I don’t care who you know or what you claim, ’..."
1,2,toxic,"You keep flexing like you’re some top dog, but..."
2,3,toxic,"It’s hilarious how you front like a boss, but ..."
3,4,toxic,"Dawg, you keep actin’ all big and bad, but let..."
4,5,toxic,"Man, y’all better get your act together, ’caus..."


In [12]:
# Peek at the first three sentences.
toxicity_sents[:3]

['I don’t care who you know or what you claim, ’cause none of it matters when your skill set is basura, friend.',
 'You keep flexing like you’re some top dog, but news flash: you’re barely relevant around here, señor.',
 'It’s hilarious how you front like a boss, but in reality, tu es un amateur complet—go figure.']

In [13]:
# Define the response schema for the Toxicity Detection usecase
# The label set is limited to 'toxic' / 'non-toxic' so that it agrees with the
# toxicity prompt used by the combined pipeline further down; 'negative' is a
# sentiment label and does not belong here.
toxicity_schema = ResponseSchema(
    name="toxicity_label",
    description="result of toxicity analysis, return 'toxic' or 'non-toxic'",
)

# NOTE: explanation_schema, response_schemas and format_instructions rebind the
# names used for the sentiment task above; from here on they describe toxicity.
explanation_schema = ResponseSchema(
    name="explanation",
    description="short explanation for the result of toxicity analysis",
)

response_schemas = [toxicity_schema, explanation_schema]


# Using LangChain's StructuredOutputParser to get a standard output format instruction
toxicity_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = toxicity_parser.get_format_instructions(only_json=True)


# Prompt text describing the task; {text} is filled per sentence and
# {format_instructions} carries the JSON layout the parser expects.
toxicity_template = """
Please analyze the toxicity of the text and output only the result in json format, with toxicity_label and explanation for the label. don't output anything else:

TEXT: {text}

{format_instructions}
"""


# Bind the format instructions once so each call only has to supply "text".
prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
    template=toxicity_template,
)

In [15]:
# prompt -> model -> structured parser
toxicity_chain = prompt | llm | toxicity_parser

# Run the chain once per sentence and collect one flat record per prediction.
results = []

for sentence in toxicity_sents:
    parsed = toxicity_chain.invoke({"text": sentence})
    record = {
        "text": sentence,
        "toxicity": parsed["toxicity_label"],
        "explanation": parsed["explanation"],
    }
    results.append(record)

toxicity_detect_df = pd.DataFrame(results)


In [16]:
# Show the first five predictions.
toxicity_detect_df.head(n=5)

Unnamed: 0,text,toxicity,explanation
0,"I don’t care who you know or what you claim, ’...",toxic,The text contains aggressive and derogatory la...
1,"You keep flexing like you’re some top dog, but...",toxic,The text contains a subtle insult and a dismis...
2,"It’s hilarious how you front like a boss, but ...",toxic,The text contains personal attacks and negativ...
3,"Dawg, you keep actin’ all big and bad, but let...",toxic,The text contains a metaphorical threat ('trem...
4,"Man, y’all better get your act together, ’caus...",toxic,The text contains aggressive and dismissive la...


In [17]:
# Save the result
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
toxicity_results_path = "data/milestone1_result/toxicity_detect_results.csv"
os.makedirs(os.path.dirname(toxicity_results_path), exist_ok=True)
toxicity_detect_df.to_csv(toxicity_results_path, index=False)

# Combined Integration


## Helper functions

In [None]:
from tqdm import tqdm
from langchain_core.exceptions import OutputParserException

def load_df_and_sents(data_path):
    """Load a CSV file and pull out its sentences.

    Returns a 2-tuple: the DataFrame read from ``data_path`` and a list
    holding the values of that frame's ``sentence`` column.
    """
    table = pd.read_csv(data_path)
    sentence_list = [*table.sentence]
    return table, sentence_list


def safe_invoke(chain, text, parser, sentence_id, task_name="sentiment", max_retries=3):
    """
    Invoke the chain and parse its output, retrying on OutputParserException.

    Each attempt calls ``chain.invoke({"text": text})`` and feeds the result to
    ``parser.parse``. If parsing raises OutputParserException the attempt is
    logged and retried. When no attempt succeeds (including the case where
    ``max_retries`` is less than 1 and no attempt is made at all), a fallback
    result with an 'unknown' label is returned so callers always get a dict.

    Args:
        chain: LangChain chain object (must expose ``invoke``)
        text: The input text
        parser: LangChain output parser (must expose ``parse``)
        sentence_id: Index of the current sentence, used only in log output
        task_name: Descriptive name for debug output (e.g., "sentiment",
            "toxicity"); its lower-cased form also prefixes the fallback
            label key ("<task_name>_label")
        max_retries: Maximum number of attempts

    Returns:
        Parsed response or fallback dict on failure.
    """
    for attempt in range(1, max_retries + 1):
        # Reset per attempt so the diagnostic print below never shows a stale
        # value from an earlier attempt or hits an unbound name.
        response = None
        try:
            response = chain.invoke({"text": text})
            return parser.parse(response)
        except OutputParserException:
            print(f"[{task_name}] Retry {attempt}/{max_retries} for sentence {sentence_id}")
            print(f"   Original Invalid Output: {response}")

    # Every attempt failed (or none was allowed): hand back a well-formed
    # placeholder instead of None so downstream key lookups keep working.
    print(f"[{task_name}] Max retries reached for sentence {sentence_id}. Marking as failure.")
    return {
        f"{task_name.lower()}_label": "unknown",
        "explanation": "LLM failure"
    }



def run_chain(prompt_template, llm, parser, sents, task_name, format_instructions):
    """Apply one prompt/LLM/parser combination to every sentence.

    Args:
        prompt_template: Prompt text containing ``{text}`` and
            ``{format_instructions}`` placeholders.
        llm: Model object piped after the prompt.
        parser: Output parser handed to ``safe_invoke`` for each response.
        sents: Iterable of input sentences.
        task_name: Label forwarded to ``safe_invoke`` for logging/fallbacks.
        format_instructions: Text bound to the ``{format_instructions}`` slot.

    Returns:
        List with one ``safe_invoke`` result per sentence, in input order.
    """
    # Parsing is deliberately left out of the chain: safe_invoke calls the
    # parser itself so that it can retry on malformed output.
    chain = PromptTemplate(
        template=prompt_template,
        input_variables=["text"],
        partial_variables={"format_instructions": format_instructions},
    ) | llm

    return [
        safe_invoke(chain, sentence, parser, sentence_id=position, task_name=task_name)
        for position, sentence in enumerate(tqdm(sents))
    ]


## Pipeline

In [None]:
def run_analysis_pipeline(data_path):
  """Run sentiment and toxicity analysis over every sentence of a CSV file.

  The sentences are loaded with ``load_df_and_sents``, sent through
  ``run_chain`` once per task using the module-level ``llm``, and the two
  result lists are merged into one record per sentence.

  Args:
      data_path: Path to a CSV file readable by ``load_df_and_sents``.

  Returns:
      A list with one dict per sentence, in input order, with the keys
      ``id``, ``sentence``, ``sentiment_prediction``,
      ``sentiment_prediction_explanation``, ``toxicity_prediction`` and
      ``toxicity_prediction_explanation``.
  """
  # Only the sentence list is needed here; the DataFrame is discarded.
  _, sents = load_df_and_sents(data_path)


  # Define Sentiment Analysis Schema and Parser
  print("Performing Sentiment Analysis")
  sentiment_schema = ResponseSchema(
      name="sentiment_label",
      description="result of sentiment analysis, return 'positive', 'negative', 'mixed' or 'neutral'",
  )
  sentiment_explanation_schema = ResponseSchema(
      name="explanation",
      description="short explanation for the result of sentiment analysis",
  )
  # Use the local explanation schema: relying on a notebook-level
  # `explanation_schema` would pick up whatever an earlier cell left behind
  # (or fail outright on a fresh kernel).
  sentiment_response_schemas = [sentiment_schema, sentiment_explanation_schema]
  sentiment_parser = StructuredOutputParser.from_response_schemas(sentiment_response_schemas)
  sentiment_format_instructions = sentiment_parser.get_format_instructions(only_json=True)

  # Sentiment Analysis
  sentiment_template = """
  Analyze the sentiment of the text and output ONLY a flat JSON object with exactly these two keys:

  - "sentiment_label": one of ["positive", "negative", "mixed", "neutral"]
  - "explanation": a short explanation for the label

  Do NOT include any extra fields, nested objects, or additional text. The response must be a valid JSON object.

  TEXT: {text}

  {format_instructions}
  """

  sentiment_responses = run_chain(sentiment_template,
                                  llm,
                                  sentiment_parser,
                                  sents,
                                  task_name="sentiment",
                                  format_instructions=sentiment_format_instructions)



  # Define Toxicity Analysis Schema and Parser
  print("Performing Toxicity Analysis")
  toxicity_schema = ResponseSchema(
      name="toxicity_label",
      description="result of toxicity analysis, return 'toxic',or 'non-toxic'",
  )
  toxicity_explanation_schema = ResponseSchema(
      name="explanation",
      description="short explanation for the result of toxicity analysis",
  )
  toxicity_response_schemas = [toxicity_schema, toxicity_explanation_schema]
  toxicity_parser = StructuredOutputParser.from_response_schemas(toxicity_response_schemas)
  # Named to match the argument passed to run_chain below; previously the
  # local was called something else and the call referenced an undefined name.
  toxicity_format_instructions = toxicity_parser.get_format_instructions(only_json=True)

  # Toxicity Analysis
  toxicity_template = """
  Analyze the toxicity of the text and return ONLY a flat JSON object with exactly these two keys:

  - "toxicity_label": one of ["toxic", "non-toxic"]
  - "explanation": a short explanation for the chosen label

  Do NOT include any extra fields, nested objects, or additional text. Return valid JSON only.

  TEXT: {text}

  {format_instructions}
  """

  toxicity_responses = run_chain(toxicity_template,
                                 llm,
                                 toxicity_parser,
                                 sents,
                                 task_name="toxicity",
                                 format_instructions=toxicity_format_instructions)


  print("Integrating Results")
  # run_chain yields exactly one result per sentence, so the three sequences
  # line up index by index.
  final_result = [
      {
          "id": i,
          "sentence": sent,
          "sentiment_prediction": sentiment_result['sentiment_label'],
          "sentiment_prediction_explanation": sentiment_result['explanation'],
          "toxicity_prediction": toxicity_result['toxicity_label'],
          "toxicity_prediction_explanation": toxicity_result['explanation'],
      }
      for i, (sent, sentiment_result, toxicity_result)
      in enumerate(zip(sents, sentiment_responses, toxicity_responses))
  ]


  print("Result Integrated Successfully!")
  return final_result


In [22]:
# Run both analyses over the sentiment dataset and keep the merged records.
final_result = run_analysis_pipeline(
    data_path="data/milestone-1-eng_sentiment.csv"
)

Performing Sentiment Analysis


100%|██████████| 50/50 [02:39<00:00,  3.19s/it]


Performing Toxicity Analysis


100%|██████████| 50/50 [03:04<00:00,  3.69s/it]

Integrating Results
Result Integrated Successfully!





In [24]:
# Peek at the first three merged records.
final_result[:3]

[{'id': 0,
  'sentence': "I absolutely adore how this workout app counts steps, but it fails to sync half the time, so I'm stuck redoing workouts.",
  'sentiment_prediction': 'mixed',
  'sentiment_prediction_explanation': "The sentiment is mixed due to the user's appreciation for the step-counting feature and their frustration with the syncing issue.",
  'toxicity_prediction': 'non-toxic',
  'toxicity_prediction_explanation': 'The text expresses a preference for a workout app but also mentions a technical issue, which is not a negative or harmful comment towards any individual or group.'},
 {'id': 1,
  'sentence': "I can't believe how well these wireless earbuds fit, though I keep losing connection outdoors, which is so annoying.",
  'sentiment_prediction': 'mixed',
  'sentiment_prediction_explanation': 'The text expresses both positive sentiment (about the fit of the earbuds) and negative sentiment (regarding the connection issues outdoors).',
  'toxicity_prediction': 'non-toxic',
  '