In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Setup WebDriver
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open Trustpilot page
url = "https://www.trustpilot.com/review/www.zendesk.com"
driver.get(url)

# Initialize reviews list
all_reviews = []

while True:
    try:
        # Wait for reviews to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//p[contains(@class, 'typography_body')]"))
        )

        # Extract reviews on this page
        review_elements = driver.find_elements(By.XPATH, "//p[contains(@class, 'typography_body')]")
        page_reviews = [elem.text for elem in review_elements]
        all_reviews.extend(page_reviews)

        # Try clicking the next page button
        next_button = driver.find_element(By.CSS_SELECTOR, 'a[data-pagination-button-next-link]')
        if 'disabled' in next_button.get_attribute('class'):
            break  # No more pages
        next_button.click()
        time.sleep(2)  # Give time for page transition

    except Exception as e:
        print("No more pages or error:", e)
        break

# Convert to DataFrame
df_reviews = pd.DataFrame(all_reviews, columns=["review"])
print(df_reviews.head())
print(f"✅ Total reviews scraped: {len(df_reviews)}")

driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Setup WebDriver
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open Trustpilot page
url = "https://www.trustpilot.com/review/www.zendesk.com"
driver.get(url)

# Initialize reviews list
all_reviews = []

while True:
    try:
        # Wait for reviews to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//p[contains(@class, 'typography_body')]"))
        )

        # Extract reviews on this page
        review_elements = driver.find_elements(By.XPATH, "//p[contains(@class, 'typography_body')]")
        page_reviews = [elem.text for elem in review_elements]
        all_reviews.extend(page_reviews)

        # Try clicking the next page button
        next_button = driver.find_element(By.CSS_SELECTOR, 'a[data-pagination-button-next-link]')
        if 'disabled' in next_button.get_attribute('class'):
            break  # No more pages
        next_button.click()
        time.sleep(2)  # Give time for page transition

    except Exception as e:
        print("No more pages or error:", e)
        break

# Convert to DataFrame
df_reviews = pd.DataFrame(all_reviews, columns=["review"])
print(df_reviews.head())
print(f"✅ Total reviews scraped: {len(df_reviews)}")

driver.quit()

In [None]:
from sqlalchemy import create_engine

# PostgreSQL Connection Details
user = 'postgres'
password = 'isba_4715'
host = 'isba-dev-02.cxue0o466fzu.us-east-1.rds.amazonaws.com'
port = 5432
database = 'postgres'  # Change this if using a different DB name

# Create engine string
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}')

# Upload DataFrame
df_reviews.to_sql(name='trustpilot_reviews', con=engine, if_exists='replace', index=False)

print("✅ Data uploaded to PostgreSQL.")

In [None]:
df_reviews.to_sql(
    name='trustpilot_reviews',
    con=engine,
    schema='sql_project',  # 👈 this ensures it lands in the correct schema
    if_exists='replace',
    index=False
)

In [None]:
from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine

load_dotenv()

pg_conn_str = f"postgresql+psycopg2://{os.getenv('PG_USER')}:{os.getenv('PG_PASSWORD')}@{os.getenv('PG_HOST')}/{os.getenv('PG_DB')}"
pg_engine = create_engine(pg_conn_str)

# Extract
query = "SELECT * FROM sql_project.trustpilot_reviews"
df = pd.read_sql(query, pg_engine)

# Load
df.to_sql(
    name='trustpilot_reviews_raw',
    con=pg_engine,
    schema='raw',
    if_exists='replace',
    index=False
)

print("✅ Successfully extracted and loaded Trustpilot Reviews!")


In [None]:
# Quick checks
print(df.info())
print(df.isnull().sum())
print(df.describe())
print(df.head())

# Light cleaning examples
df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]  # Clean column names
df = df.drop_duplicates()  # Remove duplicate rows
df = df.dropna(subset=['review'])  # Remove rows missing important fields



Testing New Format

In [7]:
# --- Imports ---
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

In [None]:
# --- Load .env variables ---
load_dotenv()
pg_conn_str = f"postgresql+psycopg2://{os.getenv('PG_USER')}:{os.getenv('PG_PASSWORD')}@{os.getenv('PG_HOST')}/{os.getenv('PG_DB')}"
pg_engine = create_engine(pg_conn_str)

In [8]:
# --- Static HTML request (no Selenium) ---
url = "https://www.trustpilot.com/review/www.zendesk.com"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

In [9]:
# --- Extract reviews ---
reviews = []
for p in soup.find_all("p", class_="typography_body-l__KUYFJ typography_appearance-default__AAY17"):
    text = p.text.strip()
    if len(text) > 20:
        reviews.append(text)

In [10]:
# --- Create DataFrame ---
df_reviews = pd.DataFrame(reviews, columns=["review"])
df_reviews = df_reviews.drop_duplicates().dropna()
df_reviews.reset_index(inplace=True)
df_reviews.rename(columns={"index": "review_id"}, inplace=True)

In [12]:
# --- Upload to PostgreSQL ---
df_reviews.to_sql(
    name='trustpilot_reviews_cleaned',
    con=pg_engine,
    schema='sql_project',
    if_exists='replace',
    index=False
)

print("✅ Uploaded Trustpilot reviews without using Selenium.")
print(df_reviews.head())

✅ Uploaded Trustpilot reviews without using Selenium.
Empty DataFrame
Columns: [review_id, review]
Index: []


In [13]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

load_dotenv()

# Setup connection
pg_conn_str = f"postgresql+psycopg2://{os.getenv('PG_USER')}:{os.getenv('PG_PASSWORD')}@{os.getenv('PG_HOST')}/{os.getenv('PG_DB')}"
pg_engine = create_engine(pg_conn_str)

# Load tables
df_reviews = pd.read_sql("SELECT * FROM sql_project.trustpilot_reviews_cleaned", pg_engine)
df_tickets = pd.read_sql("SELECT * FROM sql_project.customer_support_raw", pg_engine)

# Basic text matching: assign ticket_id if customer_name appears in the review
def match_ticket_id(review_text):
    for _, row in df_tickets.iterrows():
        if row['customer_name'].lower() in review_text.lower():
            return row['ticket_id']
    return None

df_reviews['ticket_id'] = df_reviews['review'].apply(match_ticket_id)

# Drop unmatched if desired
df_reviews = df_reviews.dropna(subset=['ticket_id'])

# Save the updated table back to the database
df_reviews.to_sql(
    name='trustpilot_reviews_cleaned',
    con=pg_engine,
    schema='sql_project',
    if_exists='replace',
    index=False
)

print("✅ Updated table with ticket_id and uploaded to database.")


✅ Updated table with ticket_id and uploaded to database.
