In [15]:
import os
import pandas as pd
import googleapiclient.discovery
import googleapiclient.errors
import time
import re
from datetime import datetime

# -------- SETUP YOUTUBE API -------- #
def setup_youtube_api():
    """Build a YouTube Data API v3 client.

    The API key is read from the ``YOUTUBE_API_KEY`` environment variable so
    that no credential has to be stored in the source file.

    Returns:
        The service object produced by ``googleapiclient.discovery.build``.

    Raises:
        RuntimeError: If ``YOUTUBE_API_KEY`` is unset or blank.
    """
    api_service_name = "youtube"
    api_version = "v3"

    # Never hard-code the key: anything committed here is effectively public.
    api_key = os.environ.get("YOUTUBE_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError(
            "YouTube API key not configured: set the YOUTUBE_API_KEY "
            "environment variable before running."
        )

    return googleapiclient.discovery.build(
        api_service_name, api_version, developerKey=api_key)

# -------- VIDEO SEARCH -------- #
def search_videos(youtube, query, max_results=20):
    """Search YouTube for videos matching *query*.

    Args:
        youtube: Client object exposing ``search().list(...).execute()``.
        query: Free-text search string.
        max_results: Value passed to the API as ``maxResults``.

    Returns:
        A list of ``{"id": <video id>, "title": <title>}`` dicts in the order
        the API returned them. Titles are HTML-unescaped. Results that carry
        no ``videoId`` are skipped.
    """
    import html  # local import so the block does not depend on file-level imports

    response = youtube.search().list(
        part="snippet",
        q=query,
        type="video",
        maxResults=max_results,
        relevanceLanguage="en",
        order="relevance",
    ).execute()

    videos = []
    for item in response.get("items", []):
        video_id = item.get("id", {}).get("videoId")
        if not video_id:
            # Not a video result (or malformed); nothing to scrape comments from.
            continue
        # Snippet titles arrive HTML-escaped (e.g. "&amp;"); decode for display.
        title = html.unescape(item.get("snippet", {}).get("title", ""))
        videos.append({"id": video_id, "title": title})

    return videos

# -------- FEATURE EXTRACTION -------- #
def extract_features(text):
    """Tag *text* with the product features it mentions.

    The text is lower-cased and checked against a fixed catalogue of regular
    expressions grouped by feature. For each feature, only the first pattern
    that matches is recorded.

    Args:
        text: The comment text to scan.

    Returns:
        A ``(features, matched)`` tuple: ``features`` is the list of feature
        names that matched, in catalogue order; ``matched`` maps each of
        those names to a one-element list holding the pattern that matched.
    """
    catalogue = {
        'design': [r'\bdesign\b', r'\bappearance\b', r'\bstyling\b', r'\blook[s]?\b', r'\bexterior\b', r'\binterior\b'],
        'range': [r'\brange\b', r'\b[0-9]+ miles\b', r'\bcharge range\b', r'\bdriving range\b'],
        'performance': [r'\bperformance\b', r'\bacceleration\b', r'\btowing\b', r'\bspeed\b', r'\bhorsepower\b'],
        'price': [r'\bprice\b', r'\bcost\b', r'\bexpensive\b', r'\baffordable\b', r'\boverpriced\b'],
        'battery': [r'\bbattery\b', r'\bcharging\b', r'\bcharge time\b', r'\bsupercharg(e|er)\b', r'\bfast[- ]charging\b'],
        'tech': [r'\btech\b', r'\bautopilot\b', r'\bself[- ]driving\b', r'\binfotainment\b', r'\bscreen\b', r'\bdashboard\b'],
        'comfort': [r'\bcomfort\b', r'\bseat\b', r'\blegroom\b', r'\bsuspension\b'],
        'delivery': [r'\bdelivery\b', r'\bdelay\b', r'\bpreorder\b', r'\bbacklog\b'],
        'build_quality': [r'\bbuild\b', r'\bquality\b', r'\bfit and finish\b', r'\bpanel gap[s]?\b', r'\bmaterials\b']
    }

    haystack = text.lower()
    hits = {}

    for name, regexes in catalogue.items():
        # Stop at the first pattern that fires; later ones are not evaluated.
        first_hit = next((rx for rx in regexes if re.search(rx, haystack)), None)
        if first_hit is not None:
            hits[name] = [first_hit]

    # Dict insertion order mirrors catalogue order, so the keys are the feature list.
    return list(hits), hits

# -------- COMMENT SCRAPER -------- #
def get_video_comments(youtube, video_id, label, max_comments=100):
    """Collect top-level comments for one video, following pagination.

    Pages are requested until ``max_comments`` rows have been gathered, the
    API stops returning a ``nextPageToken``, or an ``HttpError`` occurs (the
    error is printed and whatever was gathered so far is returned). The
    function pauses 0.5 s between consecutive page requests.

    Args:
        youtube: Client object exposing ``commentThreads().list(...).execute()``.
        video_id: ID of the video whose comment threads are fetched.
        label: Value stored in the ``"Product"`` field of every row.
        max_comments: Upper bound on the number of rows returned.

    Returns:
        A list of dicts with keys ``Product``, ``Review Text``,
        ``Review Date``, ``source``, ``features`` and ``like_count``.
    """
    rows = []
    page_token = None

    while len(rows) < max_comments:
        # Ask only for what is still needed, at most 100 per request.
        page_size = min(100, max_comments - len(rows))
        try:
            threads = youtube.commentThreads()
            response = threads.list(
                part="snippet",
                videoId=video_id,
                maxResults=page_size,
                pageToken=page_token
            ).execute()

            for thread in response["items"]:
                top = thread["snippet"]["topLevelComment"]["snippet"]
                body = top["textDisplay"]
                tags, _ = extract_features(body)
                tag_string = ', '.join(tags) if tags else "none"

                rows.append({
                    "Product": label,
                    "Review Text": body,
                    "Review Date": top["publishedAt"],
                    "source": "YouTube",
                    "features": tag_string,
                    "like_count": top["likeCount"]
                })

            has_more = "nextPageToken" in response
            if not has_more or len(rows) >= max_comments:
                break
            page_token = response["nextPageToken"]

        except googleapiclient.errors.HttpError as e:
            print(f"HTTP error: {e}")
            break

        # Only reached when another page will be requested.
        time.sleep(0.5)

    return rows

# -------- MAIN RUNNER -------- #
def main():
    """Scrape YouTube review comments for a fixed list of EV trucks.

    For each vehicle, the top 5 search results are fetched and up to 200
    top-level comments per video are collected. All rows are written to
    ``output/amazon_api_data.csv`` and summary counts are printed. When no
    comments were retrieved, the CSV is still written (header only) and the
    summary is skipped instead of failing on missing columns.
    """
    youtube = setup_youtube_api()

    vehicles = [
        {"query": "Tesla Cybertruck review", "label": "Tesla Cybertruck"},
        {"query": "Ford F-150 Lightning review", "label": "Ford F-150 Lightning"},
        {"query": "Rivian R1T review", "label": "Rivian R1T"},
        {"query": "Chevy Silverado EV review", "label": "Chevy Silverado EV"},
        {"query": "GMC Hummer EV review", "label": "GMC Hummer EV"}
    ]

    all_comments = []

    for vehicle in vehicles:
        print(f"\n🔍 Searching videos for: {vehicle['label']}")
        videos = search_videos(youtube, vehicle["query"], max_results=5)

        for video in videos:
            print(f"📺 Scraping comments for: {video['title']}")
            comments = get_video_comments(youtube, video["id"], vehicle["label"], max_comments=200)
            all_comments.extend(comments)
            print(f"✅ Retrieved {len(comments)} comments.")

    # Save results. Explicit columns keep the schema stable even with zero rows.
    columns = ["Product", "Review Text", "Review Date", "source", "features", "like_count"]
    df = pd.DataFrame(all_comments, columns=columns)
    os.makedirs("output", exist_ok=True)
    # NOTE(review): the file name says "amazon" although the data comes from
    # YouTube; kept unchanged in case other code reads this path — confirm
    # and rename if nothing depends on it.
    file_path = "output/amazon_api_data.csv"
    df.to_csv(file_path, index=False, encoding="utf-8")
    print(f"\n📁 Saved to: {file_path}")

    if df.empty:
        print("\n⚠️ No comments were retrieved; skipping summary.")
        return

    # Summary
    print("\n📊 Comments per vehicle:")
    print(df["Product"].value_counts())

    print("\n🔧 Top feature mentions:")
    feature_counts = df["features"].str.split(", ").explode().value_counts()
    print(feature_counts)

    print("\n🚘 Feature breakdown by vehicle:")
    for product in df["Product"].unique():
        print(f"\n{product}:")
        product_df = df[df["Product"] == product]
        features = product_df["features"].str.split(", ").explode()
        # "none" marks comments with no detected feature; exclude it here.
        features = features[features != "none"]
        print(features.value_counts().head(5))

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()



🔍 Searching videos for: Tesla Cybertruck
📺 Scraping comments for: Tesla Cybertruck Review: Already Iconic?
✅ Retrieved 200 comments.
📺 Scraping comments for: 2024 Tesla Cybertruck Review: This Is Just Plain Cool
✅ Retrieved 200 comments.
📺 Scraping comments for: James May finally drives the Tesla Cybertruck
✅ Retrieved 200 comments.
📺 Scraping comments for: C’mon, the Tesla Cybertruck isn’t THAT bad. 😏
✅ Retrieved 200 comments.
📺 Scraping comments for: Cybertruck BEST and WORST features #cybertruck #tesla #automobile
✅ Retrieved 200 comments.

🔍 Searching videos for: Ford F-150 Lightning
📺 Scraping comments for: F-150 Lightning 1 Year Review: The Good, Bad, &amp; Ugly
✅ Retrieved 200 comments.
📺 Scraping comments for: 2022 Ford F-150 Lightning Review // Almost A Game Changer
✅ Retrieved 200 comments.
📺 Scraping comments for: New Ford F150 Lightning REVIEW with 0-60mph test!
✅ Retrieved 200 comments.
📺 Scraping comments for: The 580HP Ford F-150 Lightning is an impressive electric truc