In [None]:
import os
import pandas as pd
import requests
from serpapi import GoogleSearch
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
from fpdf import FPDF

import time
import os
import base64
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import logging

In [None]:
# SerpAPI and YouTube API keys.
# Read from the environment so real credentials never have to be typed into
# (and saved with) the notebook. The fallback is an empty string, exactly the
# value that was hard-coded here before.
SERP_API_KEY = os.environ.get("SERP_API_KEY", "")
YT_KEY = os.environ.get("YT_KEY", "")

# YouTube Data API v3 client used by youtube_search().
youtube = build('youtube', 'v3', developerKey=YT_KEY)

# Disable the logger named "youtube_transcript_api" so it emits nothing.
logging.getLogger("youtube_transcript_api").disabled = True

In [None]:
def youtube_search(query, max_results=5):
    """
    Run a YouTube video search for ``query`` through the module-level client.

    The request asks for video results only, ordered by view count, and at
    most ``max_results`` of them.

    Returns a list with one dict per hit, holding the keys ``video_id``,
    ``title``, ``channel``, ``published_at`` and ``description``.
    """
    def _to_record(hit):
        # Flatten one raw search hit into the fields used downstream.
        video_id = hit['id']['videoId']
        snippet = hit['snippet']
        return {
            'video_id': video_id,
            'title': snippet['title'],
            'channel': snippet['channelTitle'],
            'published_at': snippet['publishedAt'],
            'description': snippet['description'],
        }

    request = youtube.search().list(
        q=query,
        part='snippet',
        type='video',
        order='viewCount',  # most viewed videos first
        maxResults=max_results
    )
    hits = request.execute()['items']

    print(f"# of Search Results from YT for {query}: {len(hits)}")

    return [_to_record(hit) for hit in hits]

def get_transcript(video_id, languages=('en',)):
    """
    Fetch the transcript of a YouTube video by ID if available.

    Args:
        video_id: ID of the YouTube video.
        languages: Language codes to try, in priority order. The default
            ``('en',)`` is the set of codes requested when no languages are
            passed, so existing callers behave as before. Pass e.g.
            ``('en', 'hi')`` to also accept Hindi transcripts.

    Returns:
        The transcript segments joined with single spaces, or ``None`` when
        transcripts are disabled for the video or none exists in any of the
        requested languages.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    except (TranscriptsDisabled, NoTranscriptFound):
        print(f"Could not fetch transcript for video {video_id}")
        return None  # Transcript not available

    return ' '.join(entry['text'] for entry in transcript)
    
def save_transcript_as_pdf(name, video_id, transcript, folder_path):
    """
    Write ``transcript`` to ``<folder_path>/<name>_Transcript_<video_id>.pdf``.

    The page starts with a bold heading naming the video ID, followed by the
    transcript text. Characters that cannot be encoded as latin-1 are
    substituted with "?" before the text is handed to the PDF writer.
    """
    doc = FPDF()
    doc.set_auto_page_break(auto=True, margin=10)
    doc.add_page()
    doc.set_font("Arial", size=12)

    # Heading line
    heading = f"Transcript for {video_id}"
    doc.set_font("Arial", style="B", size=14)
    doc.cell(0, 10, heading, ln=True, align='C')

    # Round-trip through latin-1 so unsupported characters become "?"
    latin1_bytes = transcript.encode('latin-1', 'replace')
    body_text = latin1_bytes.decode('latin-1')

    # Transcript body in the regular font
    doc.set_font("Arial", size=12)
    doc.multi_cell(0, 10, body_text)

    # Write the file
    file_name = f"{name}_Transcript_{video_id}.pdf"
    doc.output(os.path.join(folder_path, file_name))

def process_videos(country, startup_name, max_results=5):
    """
    Search YouTube for a startup and collect the hits that have a transcript.

    Args:
        country: Country name, added to the search query as a quoted term.
        startup_name: Startup name, added to the search query as a quoted term.
        max_results: Maximum number of search hits to request and inspect.

    Returns:
        A list of dicts, one per video for which a non-empty transcript was
        fetched, with the keys ``search_query``, ``video_id``,
        ``meta_data_dict`` (the search hit as returned by youtube_search) and
        ``transcript``. Videos without a transcript are left out.
    """
    # Keep the query on one line so no stray newlines or indentation are sent
    # to the API or echoed in the progress output.
    query = f'"{startup_name}" "startup" "{country}"'
    videos = youtube_search(query, max_results=max_results)

    data = []
    for video in videos:
        transcript = get_transcript(video['video_id'])

        if transcript:
            data.append({
                "search_query": query,
                "video_id": video['video_id'],
                "meta_data_dict": video,
                "transcript": transcript
            })

    return data

In [62]:
# Function to fetch article URLs using SerpAPI
def fetch_article_urls(country, startup_name, max_articles=10):
    """
    Look up news articles about a startup through SerpAPI's Google News engine.

    Args:
        country: Country name, added to the search query as a quoted term.
        startup_name: Startup name, added to the search query as a quoted term.
        max_articles: How many of the top news results to consider.

    Returns:
        A list of article URLs (possibly empty) taken from the first
        ``max_articles`` entries of ``news_results``.
    """
    # Keep the query on one line so no stray newlines or indentation are sent.
    search_query = f'"{country}" "{startup_name}" "startup"'
    params = {
        "engine": "google_news",
        "q": search_query,
        "api_key": SERP_API_KEY,
    }
    results = GoogleSearch(params).get_dict()

    # A response without "news_results" yields no URLs. Entries that have no
    # "link" are skipped rather than raising KeyError.
    top_results = results.get("news_results", [])[:max_articles]
    urls = [news["link"] for news in top_results if news.get("link")]
    print(f"# of URLs for {startup_name} in {country}: {len(urls)}")

    return urls

def save_as_pdf(driver, output_path):
    """
    Print the page currently loaded in ``driver`` to a PDF at ``output_path``.

    Issues the DevTools ``Page.printToPDF`` command (half scale, no
    background graphics) and writes the base64-decoded ``data`` field of the
    response to disk.
    """
    print_options = {
        "scale": 0.5,
        "printBackground": False,
    }
    response = driver.execute_cdp_cmd("Page.printToPDF", print_options)

    with open(output_path, "wb") as sink:
        pdf_bytes = base64.b64decode(response['data'])
        sink.write(pdf_bytes)

def print_to_pdf(url, pdf_file_path, wait_seconds=5):
    """
    Open ``url`` in headless Chrome and save the rendered page as a PDF.

    Args:
        url: Page to load.
        pdf_file_path: Where the PDF is written (passed to save_as_pdf).
        wait_seconds: Fixed pause after navigation, giving the page time to
            finish rendering before it is printed.

    The browser is always shut down, even if loading or printing fails;
    the original exception still propagates to the caller.
    """
    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--kiosk-printing")
    chrome_options.add_argument("--headless")

    # Set up the ChromeDriver
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        save_as_pdf(driver, pdf_file_path)
    finally:
        # Without this, a failed page load leaves a Chrome process running.
        driver.quit()

In [None]:
# Function to create folder structure
def create_folder_structure(country, startup_name):
    """
    Ensure the output folder ``<country>/<startup_name>`` exists.

    Returns:
        The folder path as a string, relative to the current working
        directory unless ``country`` is itself an absolute path.
    """
    folder_path = f"{country}/{startup_name}"
    # exist_ok avoids the check-then-create race and makes re-runs a no-op.
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

# Main function to process the dataframe
def process_dataframe(df):
    """
    Download news articles and YouTube transcripts for every startup in ``df``.

    Args:
        df: DataFrame with a ``country`` and a ``name`` column, one row per
            startup.

    For each row, the output folder is created, every article URL found is
    printed to ``<name>_article_<n>.pdf`` (n starting at 1), and every video
    transcript found is saved as a PDF in the same folder. Returns nothing.
    """
    for _, row in df.iterrows():
        country = row["country"]
        startup_name = row["name"]

        folder_path = create_folder_structure(country, startup_name)

        # Fetch article URLs and print each page to its own PDF
        urls = fetch_article_urls(country, startup_name)
        for idx, url in enumerate(urls, start=1):
            output_file = f"{startup_name}_article_{idx}.pdf"
            pdf_file_path = os.path.join(folder_path, output_file)
            print_to_pdf(url, pdf_file_path)

        print(f"Searching YT for {startup_name} / {country}...")
        # Iterate the records directly; this avoids rebinding `df` and `row`
        # while the outer loop is still iterating over them.
        for video in process_videos(country, startup_name):
            save_transcript_as_pdf(
                startup_name, video["video_id"], video["transcript"], folder_path
            )

In [32]:
# Load the startups to process from startups.csv.
# process_dataframe() reads the "country" and "name" columns of each row.
df = pd.read_csv('startups.csv')

In [44]:
# Relabel rows whose country is recorded as "USA" to "US" (in place).
df.loc[df["country"].eq("USA"), "country"] = "US"

In [None]:
# Run the main function: for every row of df, save the news articles and
# YouTube transcripts found for that startup as PDFs in its output folder.
process_dataframe(df)

Searching YT for Zepto / India...
# of Search Results from YT for 
    "Zepto" "startup" "India"
    : 5
Could not fetch transcript for video F1Rrsu_wPmA: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=F1Rrsu_wPmA! This is most likely caused by:

No transcripts were found for any of the requested language codes: ('en',)

For this video (F1Rrsu_wPmA) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - hi ("Hindi (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - ab ("Abkhazian")
 - aa ("Afar")
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - ba ("Bashkir")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - br ("Breton")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant (