In [4]:
import re
from urllib.parse import parse_qs, quote_plus, urlparse

from bs4 import BeautifulSoup
from selenium import webdriver

# Search phrase used to build the YouTube results URL further down.
# Uncomment the input() line to read the phrase from the user instead.
search_query = 'discrete random variables'
#search_query = input()

def parse_views(views_text):
    """Convert a view-count label such as '1.7M views' into an integer.

    The label must start with a number, optionally comma-grouped and/or
    decimal, optionally followed directly by a 'K', 'M' or 'B' suffix.

    Args:
        views_text: Label text, e.g. '453K views', '1,234 views', 'No views'.

    Returns:
        The view count as an int. Labels that do not start with a number
        ('No views', 'Views not found.', empty or None) yield 0, so the
        function is safe to use as a sort key.
    """
    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    # Only the leading token carries the count; the rest ("views") is ignored.
    match = re.match(r'(\d[\d,]*(?:\.\d+)?)([KMB])?', (views_text or '').strip())
    if match is None:
        return 0

    number = float(match.group(1).replace(',', ''))
    multiplier = multipliers.get(match.group(2), 1)
    # round() guards against float artefacts such as x.999999 before truncation.
    return int(round(number * multiplier))

def get_likes(driver, video_url):
    """Load a video page and pull the like count out of its HTML.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (the script passes a Selenium Chrome webdriver).
        video_url: URL of the video page to open.

    Returns:
        The like count exactly as it appears before the word ' likes' in the
        page source (e.g. '13K', '9.9K', '1,234'), or the string
        'Likes not found' when no such text is present.
    """
    driver.get(video_url)
    page_html = driver.page_source or ''

    # Accept comma-grouped digits ("1,234") as well as abbreviated counts
    # ("9.9K", "2M", "1.1B"); without the comma the match would start mid-number.
    likes_match = re.search(r'(\d[\d,]*(?:\.\d+)?[KMB]?) likes', page_html)
    if likes_match:
        return likes_match.group(1)
    return "Likes not found"

def _views_sort_key(video_data):
    """Sort key: numeric view count, or 0 when the label cannot be parsed."""
    try:
        return parse_views(video_data['views'])
    except (ValueError, IndexError):
        # Placeholder or empty labels must not abort the whole sort.
        return 0


# initialize webdriver
driver = webdriver.Chrome()

# try/finally so the browser is closed even if scraping raises part-way through
try:
    # build the search URL; quote_plus escapes spaces and reserved characters
    # (&, #, +, ...) so the phrase cannot corrupt the query string
    search_url = f'https://www.youtube.com/results?search_query={quote_plus(search_query)}'
    driver.get(search_url)

    # BeautifulSoup
    content = driver.page_source.encode('utf-8').strip()
    soup = BeautifulSoup(content, 'html.parser')

    # video elements
    video_elements = soup.find_all('ytd-video-renderer')

    # initialize a list to store video data
    videos_data = []

    # extract title, views, and URL
    for video in video_elements:
        # title
        title_element = video.find('yt-formatted-string', class_='style-scope ytd-video-renderer')
        title = title_element.text.strip() if title_element else "Title not found."

        # views
        views_element = video.find('span', class_='inline-metadata-item style-scope ytd-video-meta-block')
        views = views_element.text.strip() if views_element else "Views not found."

        # URL: skip entries without a usable link instead of failing on None['href']
        link_element = video.find('a', class_='yt-simple-endpoint')
        href = link_element.get('href') if link_element is not None else None
        if not href:
            continue
        video_url = f'https://www.youtube.com{href}'

        # Append data to the list
        videos_data.append({'title': title, 'views': views, 'url': video_url})

    # Sort based on views (most viewed first)
    sorted_videos_data = sorted(videos_data, key=_views_sort_key, reverse=True)

    # number of likes for each of the top five videos (one page load per video)
    for i, video_data in enumerate(sorted_videos_data[:5], start=1):
        likes = get_likes(driver, video_data['url'])
        print(f"{i}. Title: {video_data['title']}")
        print(f"   Views: {video_data['views']}")
        print(f"   Likes: {likes}")
        print(f"   URL: {video_data['url']}")
        print("=" * 50)
finally:
    # Close the webdriver
    driver.quit()


1. Title: Discrete and continuous random variables | Probability and Statistics | Khan Academy
   Views: 2M views
   Likes: 13K
   URL: https://www.youtube.com/watch?v=dOr0NKyD31Q&pp=ygUZZGlzY3JldGUgcmFuZG9tIHZhcmlhYmxlcw%3D%3D
2. Title: Random variables | Probability and Statistics | Khan Academy
   Views: 1.7M views
   Likes: 9.9K
   URL: https://www.youtube.com/watch?v=3v9w79NhsfI&pp=ygUZZGlzY3JldGUgcmFuZG9tIHZhcmlhYmxlcw%3D%3D
3. Title: 02 - Random Variables and Discrete Probability Distributions
   Views: 1.6M views
   Likes: 23K
   URL: https://www.youtube.com/watch?v=UnzbuqgU2LE&pp=ygUZZGlzY3JldGUgcmFuZG9tIHZhcmlhYmxlcw%3D%3D
4. Title: Expected Value and Variance of Discrete Random Variables
   Views: 1.1M views
   Likes: 12K
   URL: https://www.youtube.com/watch?v=OvTEhNL96v0&pp=ygUZZGlzY3JldGUgcmFuZG9tIHZhcmlhYmxlcw%3D%3D
5. Title: How To Calculate Expected Value
   Views: 453K views
   Likes: 4.3K
   URL: https://www.youtube.com/watch?v=b6VK2VPMXNI&pp=ygUZZGlzY3JldGUgcmFuZG9t

In [23]:
# Take the second entry of the view-sorted search results.
ytubevid = sorted_videos_data[1]
ytubevidurl = ytubevid['url']

# Extract the video id from the `v` query parameter. Parsing the query string
# (instead of splitting on 'v=') drops trailing parameters such as '&pp=...'.
desired_part = parse_qs(urlparse(ytubevidurl).query)['v'][0]

print("Desired part:", desired_part)

Desired part: 3v9w79NhsfI


In [69]:
from youtube_transcript_api import YouTubeTranscriptApi 
  
# Fetch the transcript for the video id; the result is iterated below as a
# sequence of dict-like entries that each carry a 'text' field.
# NOTE(review): newer youtube-transcript-api releases reportedly replace the
# static get_transcript() with an instance-level fetch() - confirm against the
# installed version.
srt = YouTubeTranscriptApi.get_transcript(desired_part)

# Keep only the caption text of every transcript entry.
text_list = [item['text'] for item in srt]

# Remove line breaks inside captions and join everything into one string.
combined_text = ' '.join(text.replace('\n', ' ') for text in text_list)

# Write the combined text to disk. The explicit UTF-8 encoding keeps non-ASCII
# characters intact regardless of the platform's default encoding.
file_name = 'combined_text.txt'
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(combined_text)

print(f'The combined text has been saved to {file_name}')

The combined text has been saved to combined_text.txt
