# Capstone: Topic Modelling on AMD vs Nvidia GPU

## Contents
- Data Extraction
- Data Cleaning
- EDA
- Model creation
- Model Evaluation

In [1]:
# Common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Set the max rows and columns for Pandas
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

# Data Extraction from Amazon GPU using Selenium 

In [3]:
# Imports the webdriver and the Keys, Keys are used for Return, F1
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
import time 

In [7]:
# Create a Dataframe to hold the data
GPU_df = pd.DataFrame(columns=['GPU Name','Manufacturer', 'Chipset Brand', 'Price', 
                       'Customer Rating', 'Customer Review Title', 
                       'Customer Review'])
GPU_df

Unnamed: 0,GPU Name,Manufacturer,Chipset Brand,Price,Customer Rating,Customer Review Title,Customer Review


In [37]:
j = 1
# Create the Chrome Driver object
driver = webdriver.Chrome()

# To keep track of the last entry appended for the review titles and body
idx_title = 0
idx_body = 0
idx_star = 0


for page in range(1,3):
    
    # Gets the first page of the website
    driver.get(f"https://www.amazon.com/s?k=Computer+Graphics+Cards&i=computers&rh=n:284822&page={page}&_encoding=UTF8&c=ts&qid=1608032958&ts_id=284822")
    main_url = driver.current_url

    # # Check for sponsored post
    sponsored_posts = driver.find_elements_by_xpath('//div[@data-component-type="sp-sponsored-result"]/../../../..')
    lst_index_sponsored = []
    # Loop through the sponsored posts to find the index
    for post in sponsored_posts:
        lst_index_sponsored.append(post.get_attribute('data-index'))

    for i in range(13): # For the first n GPUs

        # Check if the post is sponsored
        while str(i) in lst_index_sponsored:
            i += 1

        # Click the link for the first GPU
        driver.find_element_by_xpath(f'//div[@data-index={str(i)}]//a[@class="a-link-normal a-text-normal"]').click()

        # Gets the url of the main page of the GPU
        gpu_url = driver.current_url

        # Click on the "See all reviews"
        driver.find_element_by_xpath('//a[@data-hook="see-all-reviews-link-foot"]').click()
        # Wait for 1 seconds
        time.sleep(1)


        """
        Loop through the review page and obtain the review title, review body, ratings
        """

        # Number of review pages to loop through for each GPU
        for _ in range(2):

            # Gets the title of the reviews for each page
            title_comment = driver.find_elements_by_xpath('//*[@data-hook = "review-title"]/span')
            # Gets the customer reviews for each page
            review_body = driver.find_elements_by_xpath('//*[@data-hook = "review-body"]/span')

            # Gets the customer ratings
            star_ratings = driver.find_elements_by_xpath('//div[@data-hook="review"]//a[@title]')

            # Loop through the customer ratings and store them in the Customer Rating column 
            for star in star_ratings:
                GPU_df.loc[idx_star, 'Customer Rating'] = star.get_attribute('title')
                idx_star += 1

            # Loop through the title comments and append it to the Customer Review Title
            for title in title_comment:
                GPU_df.loc[idx_title, 'Customer Review Title'] = title.text
                idx_title += 1

            # Loop through the review comments and append it to the Customer Review

            # Count the number of Spans in the "Review Body" class, if more than 1, put it in a list and use "".join()
            # before appending it to the dataframe
            more_than_3 = driver.find_elements_by_xpath('//*[@data-hook = "review-body"]//*[count(*)=3]//../span')
            review_body = driver.find_elements_by_xpath('//*[@data-hook = "review-body"]/span')

            # If there is a review that is split into multiples span
            if more_than_3: 
                # Holds the list of reviews
                review_list = []
                for review in more_than_3:
                    review_list.append(review.text) # Append it into a list
                GPU_df.loc[idx_body, 'Customer Review'] = "".join(review_list)
                idx_body += 1
                for review in review_body[len(more_than_3):]: # Reviews with one span
                    GPU_df.loc[idx_body, 'Customer Review'] = review.text
                    idx_body += 1
            else:
                # Loop through the review comments and append it to the Customer Review
                for review in review_body:
                    GPU_df.loc[idx_body, 'Customer Review'] = review.text
                    idx_body += 1

            # Goes to the next review page    
            driver.find_element_by_xpath('//li[@class="a-last"]/a').click()
            # Sleep
            time.sleep(3)

        # Go back to the GPU review page
        driver.get(gpu_url)

        # Wait for 2 seconds
        time.sleep(2)

        """
        Fill up the null values with their respective attributes
        """

        # Fill up the null values with the GPU name
        GPU_df['GPU Name'].fillna(driver.find_element_by_xpath('//*[@id="productTitle"]').text, inplace=True)
        # Fill up the null values with the Chipset Brand
        try:
            chipset = driver.find_element_by_xpath('//*[@id="productDetails_techSpec_section_1"]/tbody//text()[contains(.,"Chipset Brand")]/../../td').text
            GPU_df['Chipset Brand'].fillna(chipset,inplace=True)
        except NoSuchElementException:
            GPU_df['Chipset Brand'].fillna('NA',inplace=True)

        # Fill up the null values with the manufacturer name
        try:
            manufacturer = driver.find_element_by_xpath('//*[@id="productDetails_techSpec_section_2"]/tbody//th[contains(text(),"Manufacturer")]/../td').text
            GPU_df['Manufacturer'].fillna(manufacturer, inplace=True)
        except NoSuchElementException:
            GPU_df['Manufacturer'].fillna('NA',inplace=True)

        # Fill up the null values with the Price
        try:
            GPU_df['Price'].fillna(driver.find_element_by_xpath('//*[@id="price_inside_buybox"]').text, inplace=True)
        except NoSuchElementException:
             GPU_df['Price'].fillna('NA', inplace=True)

        # Go back to the main page
        driver.get(main_url)


    WebDriverWait(driver, timeout=5)
    driver.quit()

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54064): Max retries exceeded with url: /session/eba28dfc2252c137050fcf21d3105312/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000021A680DBCD0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [38]:
lst_index_sponsored

['4', '9', '14', '19', '20', '21']

In [41]:
GPU_df.isnull().sum()

GPU Name                 0
Manufacturer             0
Chipset Brand            0
Price                    0
Customer Rating          7
Customer Review Title    0
Customer Review          0
dtype: int64

In [None]:
# The Last 6 review titles are displaying NaN
# For iloc[20], the reviews are not captured properly, part of it is captured and the other half is pushed to the next entry
# Count the number of Spans in the "Review Body" class, if more than 1, use "".join() and put it in a list 
# before appending it to the dataframe


# GPU_df[['GPU Name','Customer Review','Customer Review Title']].iloc[-8:]
GPU_df[['GPU Name','Customer Review Title', 'Customer Review']].iloc[51].values

In [None]:
len(GPU_df[GPU_df['GPU Name'] == 'XFX Radeon RX 580 GTS XXX Edition 1386MHz OC+, 8GB GDDR5, VR Ready, Dual BIOS, 3xDP HDMI DVI, AMD Graphics Card (RX-580P8DFD6)'])

In [None]:
len(GPU_df[GPU_df['GPU Name'] == 'Gigabyte Geforce GTX 1050 Ti OC Low Profile 4GB GDDR5 128 Bit PCI-E Graphic Card (GV-N105TOC-4GL)'])

In [42]:
# Save to csv file
GPU_df.to_csv('./amazon dataset/gpu_df.csv',index=False)

# Testing the review body for the page

In [None]:
# Testing the review body for the page


# Create the Chrome Driver object
driver = webdriver.Chrome() 
driver.get('https://www.amazon.com/MSI-GT-710-2GD3-LP/product-reviews/B01DOFD0G8/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews')

# Count the number of Spans in the "Review Body" class, if more than 1, put it in a list and use "".join()
# before appending it to the dataframe
more_than_3 = driver.find_elements_by_xpath('//*[@data-hook = "review-body"]//*[count(*)=3]//../span')
review_body = driver.find_elements_by_xpath('//*[@data-hook = "review-body"]/span')

# If there is a review that is split into multiples span
if more_than_3: 
    # Holds the list of reviews
    review_list = []
    for review in more_than_3:
        review_list.append(review.text) # Append it into a list
    GPU_df.loc[idx_body, 'Customer Review'] = "".join(list_of_review)
    idx_body += 1
    for review in review_body[len(more_than_3)+1:]: # Reviews with one span
        GPU_df.loc[idx_body, 'Customer Review'] = review.text
        idx_body += 1
else:
    # Loop through the review comments and append it to the Customer Review
    for review in review_body:
        GPU_df.loc[idx_body, 'Customer Review'] = review.text
        idx_body += 1
    
        
# Wait for 2 seconds
time.sleep(2)
driver.quit()

In [None]:
len(more_than_3)

In [None]:
GPU_df['Customer Review'].loc[41]

# Testing the star ratings of the comment for the page

In [None]:
# Testing the star ratings of the comment for the page

# Holds the list of reviews
list_of_stars = []

# Create the Chrome Driver object
driver = webdriver.Chrome() 
driver.get('https://www.amazon.com/XFX-Radeon-1386MHz-Graphics-RX-580P8DFD6/product-reviews/B06Y66K3XD/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews')

#data = driver.find_elements_by_css_selector('#cm_cr-review_list')
star_ratings = driver.find_elements_by_xpath('//div[@data-hook="review"]//a[@title]')

# Get the Profile name 
# star_ratings = driver.find_elements_by_xpath('//div[@data-hook="review"]//span[@class= "a-profile-name"]')


for star in star_ratings:
    list_of_stars.append(star.get_attribute('title'))
    
# Wait for 2 seconds
time.sleep(2)
driver.quit()

In [None]:
list_of_stars

In [None]:
GPU_df.loc[6] = ['','','','','','']
GPU_df.loc[7]