# In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoModelForSequenceClassification
import re
from google.colab import files
import time
import torch
import re

def is_whole_word_in_text(word, text):
    """Return True if *word* (or *word* followed by "s") occurs in *text* as a whole word.

    The word is escaped before being placed in the pattern, so regex
    metacharacters in it are matched literally. Matching is case-insensitive
    and anchored with ``\\b`` on both sides.
    """
    # The optional trailing "s" covers the simple plural form in a single search.
    whole_word = re.compile(rf"\b{re.escape(word)}s?\b", re.IGNORECASE)
    return whole_word.search(text) is not None

# Checkpoint used for aspect-term extraction.
_ASPECT_MODEL_NAME = "kevinscaria/ate_tk-instruct-base-def-pos-laptops"

# Few-shot instruction placed in front of every review before generation.
_ASPECT_PROMPT_PREFIX = (
    "Definition: The output will be the aspects (both implicit and explicit) which have an associated opinion that are extracted from the input text. "
    "In cases where there are no aspects the output should be noaspectterm. "
    "Positive example 1- input: I charge it at night and skip taking the cord with me because of the good battery life. output: battery life. "
    "Positive example 2- input: I even got my teenage son one, because of the features that it offers, like, iChat, Photobooth, garage band and more!. output: features, iChat, Photobooth, garage band. "
    "Negative example 1- input: Speaking of the browser, it too has problems. output: browser. "
    "Negative example 2- input: The keyboard is too slick. output: keyboard. "
    "Neutral example 1- input: I took it back for an Asus and same thing- blue screen which required me to remove the battery to reset. output: battery. "
    "Neutral example 2- input: Nightly my computer defrags itself and runs a virus scan. output: virus scan. "
    "Now complete the following example- input: "
)
_ASPECT_PROMPT_SUFFIX = " \noutput:"

# Holds the loaded (tokenizer, model) pair so the checkpoint is read only once
# per process instead of once per review.
_ASPECT_EXTRACTOR_CACHE = {}

# category -> (primary keywords, fallback keywords)
# Primary keywords are looked up in both the extracted aspects and the review
# text; fallback keywords are looked up in the review text only, and only when
# no primary keyword matched. Key order defines the order of the result dict.
_ASPECT_CATEGORY_KEYWORDS = {
    "Performance": (
        ["performance", "speed", "specs", "ssd", "hdd", "cpu", "ram", "features",
         "hard drive", "motherboard", "memory", "processor", "storage", "components",
         "setup", "run", "running", "work", "boot up", "boot", "processing power", "performs",
         "space", "start up", "functionality", "fast", "slow"],
        [],
    ),
    "Build quality": (
        ["build quality", "hardware", "touchpad", "touch pad", "mouse", "build",
         "built", "camera", "webcam", "hinge", "chassis", "durability", "quality control", "material", "fingerprint reader",
         "cord", "charger", "keyboard", "keys", "backlit keyboard", "keyboard layout", "keyboard backlight", "trackpad",
         "track pad", "port", "usb port", "hdmi", "jack"],
        [],
    ),
    "Battery life": (
        ["battery life", "battery", "charging", "power supply", "charge"],
        [],
    ),
    "Display": (
        ["display", "screen", "monitor", "resolution", "screen quality",
         "screen resolution", "brightness", "size", "screen size", "picture", "frame"],
        [],
    ),
    "Gaming": (
        ["gaming", "game", "gaming performance", "gaming laptop", "play"],
        [],
    ),
    "Graphics": (
        ["graphics", "gpu", "graphics card", "video card",
         "graphics settings", "frame rate", "animation"],
        [],
    ),
    "Sound": (
        ["sound", "speaker", "audio", "sound quality", "volume"],
        [],
    ),
    "Fans": (
        ["fan", "fan noise", "noise"],
        ["noisy", "silent"],
    ),
    "Cooling system": (
        ["cooling", "cooling system", "cooling pad"],
        ["hot", "heat", "heating", "overheat", "overheating", "thermal", "temperature",
         "temp", "ventilation", "vents", "warm", "stays cool"],
    ),
    "Weight": (
        ["weight", "portability", "carry"],
        ["heavy", "lightweight", "light", "bulky", "bulkiness", "portable", "compact",
         "transportable", "mobility", "thin", "thick"],
    ),
    "Price": (
        ["price", "cost", "price point", "price tag", "price range", "value"],
        ["money", "budget", "affordable", "expensive", "worth", "costly", "overpriced",
         "pricier", "pricey", "paid", "deal", "economical"],
    ),
}

# Failure phrases that also count as "Performance". Unlike the keyword lists
# above these are matched as plain substrings of the review text, so e.g.
# "crash" also hits "crashes" and "crashing".
_PERFORMANCE_FAILURE_PHRASES = [
    "stopped working", "died", "turned black", "shut off", "turn on",
    "turned off", "black screen", "blue screen", "bluescreen", "crash",
    "crashed", "freeze", "froze", "dead", "won't start", "wouldn't start",
    "doesn't start", "went black",
]


def _load_aspect_extractor():
    """Return the (tokenizer, model) pair, loading the checkpoint on first use only."""
    if "extractor" not in _ASPECT_EXTRACTOR_CACHE:
        _ASPECT_EXTRACTOR_CACHE["extractor"] = (
            AutoTokenizer.from_pretrained(_ASPECT_MODEL_NAME),
            AutoModelForSeq2SeqLM.from_pretrained(_ASPECT_MODEL_NAME),
        )
    return _ASPECT_EXTRACTOR_CACHE["extractor"]


def _categorize_aspects(extracted_aspects, review_text_lower):
    """Map extracted aspect terms and the lowercased review onto the category flags."""
    combined_aspects = " ".join(extracted_aspects)

    categorized_aspects = {}
    for category, (primary, fallback) in _ASPECT_CATEGORY_KEYWORDS.items():
        found = any(
            is_whole_word_in_text(keyword, combined_aspects)
            or is_whole_word_in_text(keyword, review_text_lower)
            for keyword in primary
        )
        if not found:
            found = any(
                is_whole_word_in_text(keyword, review_text_lower)
                for keyword in fallback
            )
        categorized_aspects[category] = found

    if not categorized_aspects["Performance"]:
        categorized_aspects["Performance"] = any(
            phrase in review_text_lower for phrase in _PERFORMANCE_FAILURE_PHRASES
        )

    return categorized_aspects


def extract_and_categorize_aspects(review_text):
    """Detect which laptop feature categories a review talks about.

    The review is appended to a few-shot instruction prompt and passed to the
    seq2seq aspect-extraction checkpoint; the comma-separated output is split
    into lowercase aspect terms. Those terms, together with the review text
    itself, are then matched against per-category keyword lists.

    Args:
        review_text: The review. It is converted with ``str()`` before use, so
            non-string values do not raise.

    Returns:
        dict[str, bool]: One entry per category ("Performance",
        "Build quality", "Battery life", "Display", "Gaming", "Graphics",
        "Sound", "Fans", "Cooling system", "Weight", "Price"), True when the
        category was detected.
    """
    review_text = str(review_text)
    tokenizer, model = _load_aspect_extractor()

    prompt = _ASPECT_PROMPT_PREFIX + review_text + _ASPECT_PROMPT_SUFFIX
    tokenized_text = tokenizer(prompt, return_tensors="pt")
    output = model.generate(tokenized_text.input_ids)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    extracted_aspects = [aspect.strip().lower() for aspect in generated_text.split(",")]

    # The prompt tells the model to answer "noaspectterm" when nothing is found.
    if len(extracted_aspects) == 1 and "noaspectterm" in extracted_aspects[0]:
        extracted_aspects = []

    return _categorize_aspects(extracted_aspects, review_text.lower())

def analyze_sentiment(review_text, aspect_category):
    """Score the sentiment a review expresses towards one feature category.

    The lowercased review and the aspect phrase for *aspect_category* are sent
    to the module-level ``sentiment_classifier`` as a text pair.

    Args:
        review_text: Review text; lowercased before classification.
        aspect_category: One of the category names listed below. Any other
            value raises ``KeyError``.

    Returns:
        The classifier score rounded to 6 decimals, positive for a 'Positive'
        label and negated for a 'Negative' label; ``0`` for any other label.
    """
    lowered_review = review_text.lower()

    # Each category is queried with its own name in lowercase as the aspect term.
    aspect_terms = {
        name: name.lower()
        for name in (
            "Performance", "Build quality", "Battery life", "Display",
            "Gaming", "Graphics", "Sound", "Fans", "Weight", "Price",
            "Cooling system",
        )
    }
    aspect_term = aspect_terms[aspect_category]

    top_prediction = sentiment_classifier(lowered_review, text_pair=aspect_term)[0]
    label = top_prediction['label']
    confidence = round(top_prediction['score'], 6)

    # Sign applied to the confidence; labels not listed here score 0.
    polarity = {'Positive': 1, 'Negative': -1}.get(label)
    if polarity is None:
        return 0
    return polarity * confidence

def process_laptop_reviews(csv_file, output_file, limit=None):
    """Fill the feature-sentiment columns of a review CSV and save the result.

    For every row the ``review_title`` and ``review_text`` values (when present
    and not NaN) are joined and lowercased, the mentioned feature categories
    are detected with ``extract_and_categorize_aspects``, and each detected
    category column receives the score from ``analyze_sentiment``. Categories
    that are not detected keep the value 0.0.

    Args:
        csv_file: Path of the input CSV.
        output_file: Path the resulting CSV is written to (without the index).
        limit: When truthy, only the first ``limit`` rows are processed and
            saved; ``None`` (or 0) processes every row.

    Returns:
        pandas.DataFrame: The processed frame that was written to *output_file*.
    """
    print(f"Loading data from {csv_file}...")

    df = pd.read_csv(csv_file)
    df = df.iloc[:limit].copy() if limit else df.copy()

    feature_columns = [
        "Performance", "Build quality", "Battery life", "Display",
        "Gaming", "Graphics", "Sound", "Fans", "Cooling system", "Weight", "Price"
    ]

    # Create missing feature columns and reset existing ones to 0.0.
    for col in feature_columns:
        df[col] = 0.0

    total_reviews = len(df)
    print(f"Processing {total_reviews} reviews...")

    start_time = time.time()
    # `position` counts rows already handled; `idx` is the index label and is
    # only used to address the row, so progress stays correct for any index.
    for position, (idx, row) in enumerate(df.iterrows()):
        if position % 5 == 0:
            elapsed = time.time() - start_time
            avg_time = elapsed / position if position else 0
            estimated_remaining = avg_time * (total_reviews - position)
            print(f"Processing review {position + 1}/{total_reviews} - Est. time remaining: {estimated_remaining:.1f}s")

        # Combine review title and text; missing or NaN parts become "".
        parts = []
        for column in ("review_title", "review_text"):
            value = row[column] if column in row else None
            parts.append(str(value) if pd.notna(value) else "")
        combined_review = " ".join(parts).lower()

        if not combined_review.strip():
            continue

        categorized_aspects = extract_and_categorize_aspects(combined_review)

        if not categorized_aspects:
            continue

        for category, is_present in categorized_aspects.items():
            if is_present:
                df.at[idx, category] = analyze_sentiment(combined_review, category)

    print(f"Processing complete. Saving to {output_file}...")
    df.to_csv(output_file, index=False)
    print(f"Data saved. Total processing time: {time.time() - start_time:.1f}s")

    return df

if __name__ == "__main__":
    # Input CSV with the reviews, and the name of the CSV this run produces.
    input_file = "/content/drive/MyDrive/reqs&reviews/Laptop Reviews & Empty Features 17.csv"
    final_output_file = "Features Sentiment from Reviews 17.csv"

    # Tokenizer and classification model come from the same checkpoint.
    absa_checkpoint = "yangheng/deberta-v3-base-absa-v1.1"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(absa_checkpoint)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(absa_checkpoint)

    # Module-level classifier; analyze_sentiment() reads it by this name.
    sentiment_classifier = pipeline(
        'text-classification',
        model=sentiment_model,
        tokenizer=sentiment_tokenizer,
    )

    full_df = process_laptop_reviews(input_file, final_output_file)
    # Hand the produced CSV to the browser via the Colab helper.
    files.download(final_output_file)