In [None]:
""" Clear everything for a fresh file """
import os
import shutil

# Start from a clean slate: drop the whole 'support' tree from an earlier run, if present
support_dir = 'support'
if os.path.exists(support_dir):
    shutil.rmtree(support_dir)

# Stage 1: Initial Query with ChatGPT

Here, we initialize our product and question.

In [None]:
"""
Dear user, enter your Product and Question here!
"""

# Name of the product under study; later cells also reuse it as the search term
product = "PICO 4 All-in-One VR Headset"
# Question sent to ChatGPT in Stage 1, built from `product`
question =  f"Identify and rank ten opportunities for design improvements with the {product}."

In [None]:
! pip install python-dotenv

In [None]:
""" Set up OpenAI API key """
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load OPENAI_API_KEY from a local .env file (if any) into the environment
load_dotenv()

key = os.getenv("OPENAI_API_KEY")

# Use ChatGPT
# Pass the key explicitly so the value loaded above is the one actually used;
# when `key` is None the client falls back to reading OPENAI_API_KEY itself.
client = OpenAI(api_key=key)

# Suffix that constrains the answer format (a Python dictionary, or "I don't know")
template = " Express the answer only as a Python dictionary with the key as a physical component and the value as a concised explanation with a maximum of one sentence. If you don't know the answer to the question, strictly state 'I don't know'."
prompt = question + template
print(f"Prompt: {prompt}")
print("\n")

chat_completion = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": prompt,}],
    temperature=0.5,                                        # adjust persona and temperature
)

# Text of the first (and only requested) completion
result = chat_completion.choices[0].message.content
print(f"Result: {result}")

# Stage 2: Data Collection and Preprocessing

We will be scraping data from: \
**Social Media** \
[1] YouTube Comments \
[2] Reddit Comments \
**Websites** \
[3] Official Product Website \
[4] Tech Magazine PCGamer \
**PDF file** \
[5] Official User Manual

In [None]:
! pip install requests

### Stage 2.1: YouTube Comments

In [None]:
""" Initialise and Set up Google API Key """
import os
from googleapiclient.discovery import build
from dotenv import load_dotenv

# Read GOOGLE_API_KEY from the .env file / environment
load_dotenv()
key = os.getenv("GOOGLE_API_KEY")

# Client for the YouTube Data API v3, authenticated with the developer key
youtube = build(serviceName='youtube', version='v3', developerKey=key)

In [None]:
"""
Dear user, adjust the number of results to ensure a good sample size of around 3000 comments. 
"""
# Reuse the product name as the YouTube search query (also used in the output folder name)
search_terms = product
max_result = 20             # No. of results (1-50); passed as maxResults to the search request

In [None]:
""" Define containers to store info """
# Module-level accumulators filled by the cells below.
# NOTE(review): re-running a filling cell appends again; re-run this cell first to reset.
vid_id = []                 # video id
vid_page = []               # video links (https...)
vid_title = []              # video title
num_comments = []           # official number of comments
load_error = 0              # error counter
can_load_title = []         # temp. list for storing title w/o loading error
can_load_page = []          # temp. list for storing links w/o loading error
num_page = []               # comment_response page number
page_title = []             # comment_response video title
comment_resp = []           # comment_response
comment_list = []           # temp. list for storing comments
comment_data = []           # comments & replies from comment_response
all_count = 0               # total number of comments

In [None]:
""" Search for Video IDs based on User Inputs """
print("Search for Videos IDs...")
# Query parameters for the search endpoint, kept in one place for easy tuning
search_params = {
    "q": search_terms,
    "maxResults": max_result,
    "part": "id",
    "type": "video",
    "order": "relevance",     # Switch to "viewCount" if the number of comments are not sufficient
}
request = youtube.search().list(**search_params)
search_response = request.execute()
print(search_response)

In [None]:
""" Create a list of Video IDs and a corresponding list of weblinks """
print("Videos found...")
# Walk the items that were actually returned: a search can yield fewer than
# max_result hits, in which case indexing 0..max_result-1 raised IndexError.
for item in search_response.get('items', []):
    videoId = item['id']['videoId']
    print(videoId)
    vid_id.append(videoId)                          # a list of Video IDs
    page = "https://www.youtube.com/watch?v=" + videoId
    print(page)
    print()
    vid_page.append(page)                           # a list of Video links
print("\nThere are", len(vid_page), "videos.")

In [None]:
""" Use the list of Video IDs to get video data """
print("Get video data...")
for i in range(len(vid_id)):
    request = youtube.videos().list(
        part="snippet, statistics",
        id=vid_id[i]
        )
    video_response = request.execute()
    print(video_response)

    items = video_response.get('items', [])
    if not items:
        # No metadata came back for this ID. Keep vid_title aligned with vid_id
        # (later cells index both with the same i) by storing a placeholder.
        title = 'Video_' + str(i + 1)
        vid_title.append(title)
        print("Video", i + 1, "-", title, "-- No video data returned")
        print()
        num_comments.append(0)
        continue

    title = items[0]['snippet']['title']
    vid_title.append(title)
    try:                        # the count is absent when comments are turned off
        comment_count = int(items[0]['statistics']['commentCount'])   # stored as int, like the 0 fallback
        print("Video", i + 1, "-", title, "-- Comment count: ", comment_count)
        print()
        num_comments.append(comment_count)
    except KeyError:            # only the missing-key case; other errors should surface
        print("Video", i + 1, "-", title, "-- Comments are turned off")
        print()
        num_comments.append(0)

In [None]:
""" Use the list of Video IDs to get comments (by page) """
print("Get comment data...")
for i in range(len(vid_id)):
    try:                                        # some videos fail here, e.g. "comments are turned off"
        next_page_ = None                       # no token on the first request
        pages = 0
        while True:                             # keep running until last comment page
            list_kwargs = {"part": "snippet,replies", "videoId": vid_id[i]}
            if next_page_:
                list_kwargs["pageToken"] = next_page_
            request = youtube.commentThreads().list(**list_kwargs)
            comment_response = request.execute()
            print(comment_response)

            comment_resp.append(comment_response)   # append this page of comment_response
            pages += 1
            num_page.append(pages)                  # append page number of comment_response
            page_title.append(vid_title[i])         # append video title along with the comment_response

            if pages == 1:
                # The video counts as loadable once its first comment page arrived
                can_load_page.append(vid_page[i])
                can_load_title.append(vid_title[i])

            next_page_ = comment_response.get('nextPageToken')
            if not next_page_:
                break
    except Exception as err:
        # Best effort: count the failure and move on, but say why. A bare `except`
        # here also swallowed KeyboardInterrupt and hid the reason for the failure.
        load_error += 1
        print("Video", i + 1, "- could not load comments:", err)

In [None]:
""" Show videos without loading errors """
print("Videos that can load...")
# From here on, work only with the videos whose comments could be fetched
vid_page = can_load_page
vid_title = can_load_title
for position, name in enumerate(vid_title, start=1):
    if name == 'YouTube':                                   # default error title is 'YouTube'
        vid_title[position - 1] = 'Video_' + str(position)  # replace it with the Video_1 format
    print(position, vid_title[position - 1])

In [None]:
""" Sift through and store comments as a list """
print("Get individual comment...")
for response_page in comment_resp:
    # Iterate over the threads actually present on this page. Using
    # pageInfo.totalResults as the loop bound could run past the end of 'items'.
    for thread in response_page.get('items', []):
        try:
            comment_text = thread['snippet']['topLevelComment']['snippet']['textDisplay']
        except KeyError:
            print("missing comment")
            continue
        comment_list.append(comment_text)
        print(comment_text)

print(comment_list)
print()
print(len(comment_list), "comments in total.")

In [None]:
""" Create directory """
# Folder named after the search terms
try:
    os.makedirs("support/%s" % search_terms)
except FileExistsError:
    print("Directory", search_terms, "exists")
else:
    print("Directory", search_terms, "created")

# Folder holding the files of the current search for the next stage
try:
    os.makedirs("support/_current_")
except FileExistsError:
    print("Directory _current_ exists")
else:
    print("Directory _current_ created")

In [None]:
""" Save files for future use """
import pickle

out_dir = "support/%s" % search_terms

# Plain-text dump: every comment wrapped in <<< >>> delimiters
with open("%s/comments.txt" % out_dir, "w+", encoding="utf-8") as f:
    for comment in comment_list:
        f.write("<<<" + comment + ">>>")

# Pickled copies for later reuse; `with` closes every handle
# (the previous pickle.dump(obj, open(...)) form never closed them explicitly)
to_pickle = {
    "searchTerms.pkl": search_terms,
    "comment_list.pkl": comment_list,
    "vid_title.pkl": vid_title,
    "vid_page.pkl": vid_page,
    "vid_id.pkl": vid_id,
}
for file_name, obj in to_pickle.items():
    with open("%s/%s" % (out_dir, file_name), "wb") as fh:
        pickle.dump(obj, fh)


In [None]:
""" Save files for next step """
import pickle
import shutil

# Copy the comment dump into the shared _current_ folder for the next stage
source = "support/%s/comments.txt" % search_terms
destination = "support/_current_/comments.txt"
shutil.copyfile(source, destination)

# Record which search the _current_ folder belongs to; `with` closes the handle
with open("support/_current_/searchTerms.pkl", "wb") as fh:
    pickle.dump(search_terms, fh)

### Stage 2.2: Reddit Comments

In [None]:
"""
Dear user, define the subreddit for the product. 
"""
subreddit = "r/virtualreality"              # Define the subreddit ("r/<name>"); interpolated into the Reddit search URL

In [None]:
""" Initialise and Set up Reddit API """
import requests
import os
from dotenv import load_dotenv

load_dotenv()

# Credentials come from the environment / .env file.
# `client_id` replaces the former `id`, which shadowed the builtin id() for the rest of the notebook.
client_id = os.getenv("REDDIT_API_ID")
key = os.getenv("REDDIT_API_KEY")
user = os.getenv("REDDIT_API_USER")
pw = os.getenv("REDDIT_API_PW")

auth = requests.auth.HTTPBasicAuth(client_id, key)

data = {'grant_type': 'password',                                       # Initialize using login method (password), username, and password
        'username': user,
        'password': pw}

headers = {'User-Agent': 'DAI/AID'}                                     # Setup our header info, which gives reddit a brief description of our app

res = requests.post('https://www.reddit.com/api/v1/access_token',       # Send request for an OAuth token
                    auth=auth, data=data, headers=headers, timeout=30)
res.raise_for_status()                                                  # Surface HTTP-level failures right away

token_payload = res.json()
if 'access_token' not in token_payload:
    # Fail with a readable message instead of a bare KeyError when no token is returned
    raise RuntimeError(f"Reddit did not return an access token: {token_payload.get('error')}")

TOKEN = token_payload['access_token']                                   # Pull access_token value from the JSON response

headers = {**headers, 'Authorization': f"bearer {TOKEN}"}

requests.get('https://oauth.reddit.com/api/v1/me', headers=headers, timeout=30)     # Token is valid for ~2 hours. If get Response [200], you are good to go!

In [None]:
# Search Reddit for the product name; the same value names the output folder under support/
search_terms = product

In [None]:
""" Define containers to store info """
# Module-level accumulators filled by the Reddit cells below.
# NOTE(review): re-running a filling cell appends again; re-run this cell first to reset.
post_id = []                     # post id
post_page = []                   # post links (https...)
post_title = []                  # post title
post_num_comments = []           # official number of comments
post_comment_list = []           # temp. list for storing comments

In [None]:
""" Search for Post IDs based on User Inputs """
print("Search for Post IDs...")
# Let requests build the query string so spaces and characters such as '&' or '#'
# in the search terms are encoded instead of corrupting the URL.
res = requests.get(
    f"https://oauth.reddit.com/{subreddit}/search/",
    params={
        "q": search_terms,
        "restrict_sr": "on",                # Restrict search to the chosen subreddit
        "sort": "relevance",
        "t": "all",
    },
    headers=headers,
    timeout=30,
)

if res.status_code == 200:                  # Check if the request was successful
    search_response = res.json()            # Parse the response as JSON
    print(search_response)
else:
    print(f"Error: {res.status_code} - {res.reason}")
    # Use an empty listing so the following cells do not pick up the stale
    # `search_response` left behind by the YouTube stage.
    search_response = {'data': {'children': []}}

In [None]:
""" Create a list of Post IDs and a corresponding list of weblinks """
print("Posts found...")

for post in search_response['data']['children']:
    postId = post['data']['id']
    print(postId)
    post_id.append(postId)                           # a list of Post IDs
    # Build the link from the configured subreddit rather than a hard-coded
    # r/virtualreality, so changing `subreddit` is enough to switch communities.
    page = f"https://oauth.reddit.com/{subreddit}/comments/{postId}"
    print(page)
    print()
    post_page.append(page)                           # a list of Post links

print(f"\nThere are {len(post_page)} posts.")

In [None]:
""" Use the list of Post IDs to get post data """
print("Get post data...")
for number, post_response in enumerate(search_response['data']['children'], start=1):
    try:
        print(post_response)
        post_fields = post_response['data']
        # Title first, then the comment count reported by the search listing
        title = post_fields['title']
        post_title.append(title)
        comment_count = post_fields['num_comments']
        print(f"Post {number} - {title} -- Comment count: {comment_count}")
        post_num_comments.append(comment_count)
        print()

    except Exception as e:
        print(f"Error fetching post data: {e}")

print(sum(post_num_comments), "comments in total.")

In [None]:
""" Sift through and store comments as a list """
print("Get individual comment...")

def scrape_comments(url, comment_list):
    """Fetch one post's comment tree and append every comment body to comment_list.

    Args:
        url: Post link without the ".json" suffix (the suffix is added here).
        comment_list: List that receives the comment and reply bodies, in the
            order they are visited.

    Returns:
        None. Request and parsing problems are printed, not raised, so one bad
        post does not stop the whole run.
    """
    try:
        page_json = f"{url}.json"

        post_comments_response = requests.get(page_json, headers=headers, timeout=30)
        if post_comments_response.status_code != 200:                                   # Check if request was successful
            print(f"Error fetching comments for page {url}: {post_comments_response.status_code} - {post_comments_response.reason}")
            return

        post_comments_data = post_comments_response.json()
        # Expected shape: [post listing, comment listing]
        if not (isinstance(post_comments_data, list) and len(post_comments_data) > 1):
            return

        comments = post_comments_data[1]['data']['children']                            # Extract comments from the response

        # Report the comment count stored on the post itself. The previous code read
        # 'total_awards_received' of the first comment, which is not a comment count
        # and raised IndexError for posts without comments.
        try:
            total_comments = post_comments_data[0]['data']['children'][0]['data']['num_comments']
        except (KeyError, IndexError, TypeError):
            total_comments = len(comments)                                              # fall back to the top-level entries
        print(f"Total comments: {total_comments}")

        for comment in comments:
            try:
                comment_body = comment['data']['body']
                print(comment_body)
                comment_list.append(comment_body)
                if 'replies' in comment['data'] and comment['data']['replies']:         # Check for replies and recursively scrape them
                    scrape_replies(comment['data']['replies']['data']['children'], comment_list)
            except KeyError:
                print("Error: Missing 'body' attribute for a comment.")
    except Exception as e:
        print(f"Error fetching comments for page {url}:", e)

def scrape_replies(replies, comment_list):
    """Append the body of every reply, and of its nested replies, to comment_list.

    Args:
        replies: Iterable of reply objects, each holding its fields under 'data'.
        comment_list: List extended in place, parent before children.

    Entries lacking an expected key are reported with a printed message and skipped.
    """
    for node in replies:
        try:
            fields = node['data']
            text = fields['body']
            print(text)
            comment_list.append(text)
            # Descend only when a non-empty 'replies' entry is present
            if 'replies' in fields and fields['replies']:
                children = fields['replies']['data']['children']
                scrape_replies(children, comment_list)
        except KeyError:
            print("Error: Missing 'body' attribute for a reply.")

# Collect the comments of every post found by the search
for post_url in post_page:
    scrape_comments(post_url, post_comment_list)

print(post_comment_list, end="\n\n")
print(len(post_comment_list), "comments in total.")

In [None]:
""" Create directory """
# Folder named after the search terms
try:
    os.makedirs("support/%s" % search_terms)
except FileExistsError:
    print("Directory", search_terms, "exists")
else:
    print("Directory", search_terms, "created")

# Folder holding the files of the current search for the next stage
try:
    os.makedirs("support/_current_")
except FileExistsError:
    print("Directory _current_ exists")
else:
    print("Directory _current_ created")

In [None]:
""" Save files for future use """
import pickle

out_dir = "support/%s" % search_terms

# Plain-text dump: every comment wrapped in <<< >>> delimiters
with open("%s/post_comments.txt" % out_dir, "w+", encoding="utf-8") as f:
    for comment in post_comment_list:
        f.write("<<<" + comment + ">>>")

# Pickled copies for later reuse; `with` closes every handle
# (the previous pickle.dump(obj, open(...)) form never closed them explicitly)
to_pickle = {
    "searchTerms.pkl": search_terms,
    "post_comment_list.pkl": post_comment_list,
    "post_title.pkl": post_title,
    "post_page.pkl": post_page,
    "post_id.pkl": post_id,
}
for file_name, obj in to_pickle.items():
    with open("%s/%s" % (out_dir, file_name), "wb") as fh:
        pickle.dump(obj, fh)

In [None]:
""" Save files for next step """
import pickle
import shutil

# Copy the comment dump into the shared _current_ folder for the next stage
source = "support/%s/post_comments.txt" % search_terms
destination = "support/_current_/post_comments.txt"
shutil.copyfile(source, destination)

# Record which search the _current_ folder belongs to; `with` closes the handle
with open("support/_current_/searchTerms.pkl", "wb") as fh:
    pickle.dump(search_terms, fh)

### Stage 2.3: Official Product Website

In [None]:
"""
Dear user, please enter the URL of the website and the container containing what you wish to scrape.
"""
url = "https://www.picoxr.com/sg/products/pico4"     # page to download
container_tag = "main"                               # tag name of the element(s) holding the description
container_class = "tIY88xTQJtQ9ZOGrUS1U"             # class of those elements; NOTE(review): looks auto-generated, may change when the site is rebuilt

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Defined before the request so the saving cells still find a value (and not a
# stale or missing one) when the download fails.
product_desc = ""

response = requests.get(url, timeout=30)

if response.status_code == 200:                                     # Check if the request was successful
    soup = BeautifulSoup(response.content, 'html.parser')           # Parse the HTML content

    title_tag = soup.find("title")                                  # Get the title of the webpage if it exists
    if title_tag:
        title = title_tag.get_text()
        print("Title:", title)
    else:
        print("No title found")

    sections = []
    for container in soup.find_all(container_tag, class_=container_class):   # Containers holding the product description
        # Drop embedded code/styles, then let the parser produce the text. Unlike
        # regex tag stripping, this decodes entities (&amp; -> &) and copes with
        # tags that span several lines.
        for hidden in container(["script", "style"]):
            hidden.decompose()
        raw_text = container.get_text(separator=" ")
        sections.append(re.sub(r'\s+', ' ', raw_text.strip()))      # collapse every whitespace run to one space
    product_desc = " ".join(sections)

    print("Product Description:", product_desc)

else:
    print("Failed to retrieve webpage. Status code:", response.status_code)

In [None]:
""" Create directory """
# Folder named after the search terms
try:
    os.makedirs("support/%s" % search_terms)
except FileExistsError:
    print("Directory", search_terms, "exists")
else:
    print("Directory", search_terms, "created")

# Folder holding the files of the current search for the next stage
try:
    os.makedirs("support/_current_")
except FileExistsError:
    print("Directory _current_ exists")
else:
    print("Directory _current_ created")

In [None]:
""" Save files for future use """
import pickle      # imported here so the cell does not depend on an earlier cell's import

out_dir = "support/%s" % search_terms

# Plain-text dump wrapped in <<< >>> delimiters
with open("%s/product_desc.txt" % out_dir, "w+", encoding="utf-8") as f:
    f.write("<<<" + product_desc + ">>>")

# Pickled copies for later reuse; `with` closes every handle
for file_name, obj in (("searchTerms.pkl", search_terms), ("product_desc.pkl", product_desc)):
    with open("%s/%s" % (out_dir, file_name), "wb") as fh:
        pickle.dump(obj, fh)

In [None]:
""" Save files for next step """
import pickle
import shutil

# Copy the description into the shared _current_ folder for the next stage
source = "support/%s/product_desc.txt" % search_terms
destination = "support/_current_/product_desc.txt"
shutil.copyfile(source, destination)

# Record which search the _current_ folder belongs to; `with` closes the handle
with open("support/_current_/searchTerms.pkl", "wb") as fh:
    pickle.dump(search_terms, fh)

### Stage 2.4: Tech Magazine Review (PCGamer)

In [None]:
"""
Dear user, please enter the URL of the website.
"""
# NOTE(review): the slug reads "pico-4-ve-headset-review" -- confirm it is not meant to be "vr"
url = "https://www.pcgamer.com/pico-4-ve-headset-review/"   # review page to download
container_tag = "div"                                       # tag name of the element(s) holding the article text
container_class = "text-copy bodyCopy auto"                 # class attribute of those elements

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Defined before the request so the saving cells still find a value (and not a
# stale or missing one) when the download fails.
pc_gamer_review = ""

response = requests.get(url, timeout=30)

if response.status_code == 200:                                     # Check if the request was successful
    soup = BeautifulSoup(response.content, 'html.parser')           # Parse the HTML content

    title_tag = soup.find("title")                                  # Get the title of the webpage if it exists
    if title_tag:
        title = title_tag.get_text()
        print("Title:", title)
    else:
        print("No title found")

    sections = []
    for container in soup.find_all(container_tag, class_=container_class):   # Containers holding the review text
        # Drop embedded code/styles, then let the parser produce the text. Unlike
        # regex tag stripping, this decodes entities (&amp; -> &) and copes with
        # tags that span several lines.
        for hidden in container(["script", "style"]):
            hidden.decompose()
        raw_text = container.get_text(separator=" ")
        sections.append(re.sub(r'\s+', ' ', raw_text.strip()))      # collapse every whitespace run to one space
    pc_gamer_review = " ".join(sections)

    print("PC Gamer Review:", pc_gamer_review)

else:
    print("Failed to retrieve webpage. Status code:", response.status_code)

In [None]:
""" Create directory """
# Folder named after the search terms
try:
    os.makedirs("support/%s" % search_terms)
except FileExistsError:
    print("Directory", search_terms, "exists")
else:
    print("Directory", search_terms, "created")

# Folder holding the files of the current search for the next stage
try:
    os.makedirs("support/_current_")
except FileExistsError:
    print("Directory _current_ exists")
else:
    print("Directory _current_ created")

In [None]:
""" Save files for future use """
import pickle      # imported here so the cell does not depend on an earlier cell's import

out_dir = "support/%s" % search_terms

# Plain-text dump wrapped in <<< >>> delimiters
with open("%s/pc_gamer_review.txt" % out_dir, "w+", encoding="utf-8") as f:
    f.write("<<<" + pc_gamer_review + ">>>")

# Pickled copies for later reuse; `with` closes every handle
for file_name, obj in (("searchTerms.pkl", search_terms), ("pc_gamer_review.pkl", pc_gamer_review)):
    with open("%s/%s" % (out_dir, file_name), "wb") as fh:
        pickle.dump(obj, fh)

In [None]:
""" Save files for next step """
import pickle
import shutil

# Copy the review into the shared _current_ folder for the next stage
source = "support/%s/pc_gamer_review.txt" % search_terms
destination = "support/_current_/pc_gamer_review.txt"
shutil.copyfile(source, destination)

# Record which search the _current_ folder belongs to; `with` closes the handle
with open("support/_current_/searchTerms.pkl", "wb") as fh:
    pickle.dump(search_terms, fh)

### Stage 2.5: Official User Manual (PDF)

In [None]:
! pip install PyPDF2

In [None]:
import PyPDF2
import requests
import os

"""
Dear user, please enter the URL of the PDF file.
"""
# Direct link to the PDF; passed to extract_text_from_pdf() below
url = 'https://pico-web-tob.oss-cn-beijing.aliyuncs.com/20230825/document/1695015503416348672.pdf'

def extract_text_from_pdf(url, first_page=2, last_page=11):
    """Download a PDF and return the text of a page range as one line.

    The file is saved as 'user_manual.pdf' in the working directory and left
    there; the caller is responsible for deleting it.

    Args:
        url: Address of the PDF file.
        first_page: Zero-based index of the first page to read (default 2).
        last_page: Zero-based index one past the last page to read (default 11).
            Clamped to the document length, so short PDFs no longer raise
            IndexError.

    Returns:
        The extracted text with newlines replaced by spaces.

    Raises:
        requests.HTTPError: When the download answers with an error status.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()                                 # do not try to parse an error page as a PDF
    with open('user_manual.pdf', 'wb') as f:                    # Download the PDF file
        f.write(response.content)

    with open('user_manual.pdf', 'rb') as f:                    # Open the PDF file
        reader = PyPDF2.PdfReader(f)
        stop = min(last_page, len(reader.pages))
        # `or ''` guards against a page that yields no text
        chunks = [reader.pages[page_number].extract_text() or '' for page_number in range(first_page, stop)]

    text = ''.join(chunks)
    return text.replace('\n', ' ')

try:
    user_manual_text = extract_text_from_pdf(url)
finally:
    # Remove the downloaded copy even when extraction fails part-way
    if os.path.exists('user_manual.pdf'):
        os.remove('user_manual.pdf')

print(user_manual_text)

In [None]:
""" Create directory """
# Folder named after the search terms
try:
    os.makedirs("support/%s" % search_terms)
except FileExistsError:
    print("Directory", search_terms, "exists")
else:
    print("Directory", search_terms, "created")

# Folder holding the files of the current search for the next stage
try:
    os.makedirs("support/_current_")
except FileExistsError:
    print("Directory _current_ exists")
else:
    print("Directory _current_ created")

In [None]:
""" Save files for future use """
import pickle      # imported here so the cell does not depend on an earlier cell's import

out_dir = "support/%s" % search_terms

# Plain-text dump wrapped in <<< >>> delimiters
with open("%s/user_manual.txt" % out_dir, "w+", encoding="utf-8") as f:
    f.write("<<<" + user_manual_text + ">>>")

# Pickled copies for later reuse; `with` closes every handle
for file_name, obj in (("searchTerms.pkl", search_terms), ("user_manual.pkl", user_manual_text)):
    with open("%s/%s" % (out_dir, file_name), "wb") as fh:
        pickle.dump(obj, fh)

In [None]:
""" Save files for next step """
import pickle
import shutil

# Copy the manual text into the shared _current_ folder for the next stage
source = "support/%s/user_manual.txt" % search_terms
destination = "support/_current_/user_manual.txt"
shutil.copyfile(source, destination)

# Record which search the _current_ folder belongs to; `with` closes the handle
with open("support/_current_/searchTerms.pkl", "wb") as fh:
    pickle.dump(search_terms, fh)