# Chapter 7: Benchmarking using Selective Classified Sentiment Analysis

Dear user, we will be conducting benchmarking for our identified components, \
where we scrap Youtube comments from other competitors, and find out if the components from other competitors fair better.

In [6]:
### TO DO SECTION

In [7]:
'''
Dear user, enter your Product here!
'''
product = "Boeing 787 Dreamliner Commercial Plane"

search_terms = product

In [8]:
'''
Dear user, enter 2 competitors here!
'''
competitor_1 = "Airbus 320 Commercial Plane"
competitor_2 = "Embraer 190 Commercial Plane"

In [9]:
""" Initialise and Set up Google API Key """
import os
from googleapiclient.discovery import build
from dotenv import load_dotenv

load_dotenv()

key = os.getenv("GOOGLE_API_KEY")

youtube = build('youtube', 'v3', developerKey=key)

In [17]:
import os
import pickle

def youtube_scrap(competitor):
    max_results = 5
    vid_id = []             	# video id
    vid_page = []       		# video links (https...)
    vid_title = []              # video title
    num_comments = []           # official number of comments
    comment_list = []           # temp. list for storing comments
    comment_resp = []           # comment_response

    request = youtube.search().list(
        q=search_terms,
        maxResults=max_results,
        part="id,snippet",  # Include snippet in the request
        type="video",
        order="relevance"         # Switch to "viewCount" if the number of comments are not sufficient
    )
    search_response = request.execute()

    for item in search_response.get('items', []):
        if 'snippet' in item:
            vid_id.append(item['id']['videoId'])
            vid_title.append(item['snippet']['title'])
            page = "https://www.youtube.com/watch?v=" + item['id']['videoId']
            vid_page.append(page)

    for video_id, title in zip(vid_id, vid_title):
        request = youtube.videos().list(
            part="snippet, statistics",
            id=video_id
        )
        video_response = request.execute()
        num_comments.append(video_response['items'][0]['statistics'].get('commentCount', 0))

        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            comment_response = request.execute()
            comment_resp.append(comment_response)
        except Exception as e:
            print("Error fetching comments:", e)

    for response in comment_resp:
        for item in response.get('items', []):
            try:
                comment_list.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
            except Exception as e:
                print("Error processing comment:", e)

    # Save comment_list to a file
    directory = "support/%s/competitor/%s" % (search_terms, competitor)
    os.makedirs(directory, exist_ok=True)

    with open(os.path.join(directory, "comment_list.txt"), "w+", encoding="utf-8") as f:
        for comment in comment_list:
            f.write("<<<" + comment + ">>>")

    with open(os.path.join(directory, "comment_list.pkl"), "wb") as f:
        pickle.dump(comment_list, f)

    return vid_title, comment_list  # Return video titles and comments

# Call the function for each competitor
videos_1, comments_1 = youtube_scrap(competitor_1)
videos_2, comments_2 = youtube_scrap(competitor_2)

# Print list of videos
print("Videos for", competitor_1)
for i, title in enumerate(videos_1, start=1):
    print(i, title)

print("\nVideos for", competitor_2)
for i, title in enumerate(videos_2, start=1):
    print(i, title)

Error fetching comments: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&videoId=zE88Sz2t_gE&key=AIzaSyA5lBkdcbGjl_Fe2xQd0GJJ29ZwTJnJrnI&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


Error fetching comments: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Creplies&videoId=zE88Sz2t_gE&key=AIzaSyA5lBkdcbGjl_Fe2xQd0GJJ29ZwTJnJrnI&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
Videos for Airbus 320 Commercial Plane
1 Boeing Vietnam Airlines 787-9 Dreamliner Vertical Takeoff &amp; Steep Turns 2015 Paris Air Show Prep
2 Boeing 787-10 Dreamliner and 737 MAX 9 Fly Together in Dramatic Display
3 The Insane Engineering of the 787
4 Boeing 787 Dreamliner soars for first flight
5 United — This is the story of new planes

Videos f

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import shutil
import os

def initiate_vector_store(setting):
    """ Combine text files into one """
    with open("combined.txt", "w", encoding="utf-8") as combined_file:
        for file_path in setting:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                combined_file.write(content)
    
    """ Load Private Documents of User Manual """
    loader = TextLoader(file_path="combined.txt", encoding='utf-8')
    document = loader.load()
    os.remove('combined.txt')

    """ Split Documents into smaller parts """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    splits = text_splitter.split_documents(document)

    """ Use OpenAI Embeddings """
    embedding = OpenAIEmbeddings()

    """ Remove 'persist' directory, if any """
    try:
        shutil.rmtree('support/%s/competitor/persist' % search_terms)       # remove old version
        print("Deleting previous store")
    except:
        print("No store found")

    persist_directory = 'support/%s/competitor/persist' % search_terms     # create new version

    """ Apply embeddings on private documents and save in 'persist' directory """
    vectordb = Chroma.from_documents(
        documents=splits,                           # target the splits created from the documents loaded
        embedding=embedding,                        # use the OpenAI embedding specified
        persist_directory=persist_directory         # store in the persist directory for future use
    )

    vectordb.persist()                              # store vectordb

In [None]:
import pandas as pd
import pickle
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

def private_query(competitor):
    """ Retrieve vectordb created """
    embedding = OpenAIEmbeddings()
    persist_directory = 'support/%s/persist' % search_terms

    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding
        )

    print("Processing folder:", search_terms)
    print("Size of Vector Database", vectordb._collection.count())    # same as before
    
    """ Apply language model and vectordb for Chatbot """
    llm = ChatOpenAI(model_name="gpt-4", temperature=0)

    qa_chain = RetrievalQA.from_chain_type(
        llm,
        # "similarity" finds similar items based on similarity metric.
        # search_kwargs: number of similar items (k) to retrieve
        retriever=vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 15}),     
        return_source_documents=True
        )
    
    """ Ready to use GPT """
    question = " '''What is the airline discussed? Is the sentiment on the service mainly positive or negative? What is a key negative point?''' "  # input question
    template = " If you don't know the answer to the question delimited by triple quotes based on the data given, strictly state 'I don't know'. Keep the answer as concise as possible, with a maximum of three sentences."

    prompt = question + template
    result = qa_chain({"query": prompt})

    components = result["result"]
    print("Components:", components)

    pickle.dump(components, open("support/%s/DSM/%s.pkl" % (search_terms, number), "wb"))