#### Extract the links of technical reviews

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import Firefox
import pandas as pd

filename = 'processors.txt'

# Read processor names from file, one name per line.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped so they do not turn into empty search queries further down.
with open(filename, 'r') as file:
    processor_names = [name for name in (line.strip() for line in file) if name]

# Start the Firefox session that the rest of this cell reuses
driver = Firefox()
driver.maximize_window()

# Smoke test: run one throwaway Google search before the real work starts
test_query = 'test query'
test_url = f'https://www.google.com/search?q={test_query}'
heading_locator = (By.CSS_SELECTOR, 'h3')
try:
    driver.get(test_url)

    # The results page counts as loaded once an <h3> is present (10 s limit)
    results_loaded = EC.presence_of_element_located(heading_locator)
    WebDriverWait(driver, 10).until(results_loaded)

    print("Google search test successful.")
except Exception as ex:
    # Searching does not work: report it, close the browser and stop here
    print(f"Error during Google search test: {ex}")
    driver.quit()
    exit()

from urllib.parse import quote_plus

results = []

# Define websites to search: source label -> Google `site:` filter
websites = {
    'tomshardware': 'site:tomshardware.com',
    'pcmag': 'site:pcmag.com'
}


def _build_search_url(processor_name, site_keyword):
    """Return the Google search URL for a review of *processor_name* on one site.

    The whole query is percent-encoded with ``quote_plus`` so that characters
    such as ``+``, ``&`` or ``#`` inside a processor name cannot alter the
    URL's query string. Spaces still become ``+``.
    """
    query = quote_plus(f'{processor_name} {site_keyword} review')
    return f'https://www.google.com/search?q={query}'


try:
    for processor_name in processor_names:
        # An empty name would search for "<site> review" and store a bogus link
        if not processor_name:
            continue

        for site, site_keyword in websites.items():
            try:
                driver.get(_build_search_url(processor_name, site_keyword))

                # Wait for search results to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'h3'))
                )

                # Take the first <h3> on the page and read the href of the
                # <a> that is a child of its grandparent element
                search_results = driver.find_elements(By.CSS_SELECTOR, 'h3')
                first_result = search_results[0]
                link = first_result.find_element(By.XPATH, './../../a').get_attribute('href')

                results.append({
                    'Processor': processor_name,
                    'Source': site,
                    'URL': link
                })

                print(f"Found {site} review for {processor_name}")
            except Exception as ex:
                # Best effort: a failed lookup is reported and skipped
                print(f"Error processing {site} review for {processor_name}: {ex}")
finally:
    # Runs on normal completion and on interruption alike, so the links
    # collected so far are kept and the browser is always closed.
    try:
        # Save results to CSV. Explicit columns keep the header row even when
        # nothing was found, so the file can still be read back with read_csv.
        df = pd.DataFrame(results, columns=['Processor', 'Source', 'URL'])
        df.to_csv('processors_links.csv', index=False)
    finally:
        driver.quit()

print("Finished")


In [None]:
%pip install webdriver-manager


#### Extraction of technical reviews

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import Firefox
import time

# Open the Firefox browser session shared by the scraping functions below
driver = webdriver.Firefox()
driver.maximize_window()

# Function to scrape content from tomshardware.com
def scrape_tomshardware(url):
    """Load *url* in the shared ``driver`` and return the article text.

    Only ``<p>`` elements inside the element with id ``article-body`` are
    used. Paragraphs nested in ``<figure>`` or ``<aside>``, and paragraphs
    that contain a link, are left out. The remaining paragraph texts are
    joined with single spaces.
    """
    driver.get(url)
    time.sleep(15)  # Fixed pause to give the page time to load completely
    article = driver.find_element(By.ID, 'article-body')
    body_paragraphs = article.find_elements(
        By.XPATH,
        './/p[not(ancestor::figure) and not(ancestor::aside) and not(descendant::a)]',
    )
    return ' '.join(paragraph.text for paragraph in body_paragraphs)

# Function to scrape content from pcmag.com
def scrape_pcmag(url):
    """Load *url* in the shared ``driver`` and return the article text.

    Collects the text of every ``<p>`` element inside the element with id
    ``article`` and joins the pieces with single spaces.
    """
    driver.get(url)
    time.sleep(5)  # Fixed pause to give the page time to load completely
    article = driver.find_element(By.ID, 'article')  # Looked up by element id
    return ' '.join(
        paragraph.text for paragraph in article.find_elements(By.TAG_NAME, 'p')
    )

# Read input CSV file (columns used below: Processor, Source, URL)
input_csv = 'processors_links.csv'
df = pd.read_csv(input_csv)

# Source label -> function that scrapes an article from that site
scrapers = {
    'tomshardware': scrape_tomshardware,
    'pcmag': scrape_pcmag,
}

# Prepare list to store results
results = []

output_csv = 'scraped_processors.csv'

try:
    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        processor = row['Processor']
        source = row['Source']
        url = row['URL']

        # Determine which scraping function to call based on the source
        scraper = scrapers.get(source)
        try:
            if scraper is None:
                content = f"Scraping function not defined for source: {source}"
            else:
                content = scraper(url)
        except Exception as ex:
            # One unreachable page or missing element must not abort the whole
            # run: report it and move on to the next row.
            print(f"Error scraping {source} review for {processor}: {ex}")
            continue

        results.append({'Processor': processor, 'Source': source, 'Content': content})
finally:
    # Runs on normal completion and on interruption alike, so the articles
    # scraped so far are kept and the browser is always closed.
    try:
        # Explicit columns keep the header row even when nothing was scraped
        results_df = pd.DataFrame(results, columns=['Processor', 'Source', 'Content'])

        # Save the results to a new CSV file
        results_df.to_csv(output_csv, index=False)
    finally:
        # Close the Selenium WebDriver
        driver.quit()

print(f"Scraping completed. Data saved to {output_csv}")
