In [5]:
import requests
import pandas as pd

def get_channel_details(api_key, channel_id):
    url = f"https://www.googleapis.com/youtube/v3/channels?part=statistics&id={channel_id}&key={api_key}"
    response = requests.get(url).json()
    if 'items' in response and len(response['items']) > 0:
        stats = response['items'][0]['statistics']
        return {
            'subscriber_count': stats.get('subscriberCount', '0')
        }
    return {'subscriber_count': '0'}

def get_video_details(api_key, video_id):
    url = f"https://www.googleapis.com/youtube/v3/videos?part=statistics,snippet,contentDetails&id={video_id}&key={api_key}"
    response = requests.get(url).json()
    details = {}
    if 'items' in response and len(response['items']) > 0:
        stats = response['items'][0]['statistics']
        snippet = response['items'][0]['snippet']
        content_details = response['items'][0]['contentDetails']
        duration = parse_duration(content_details['duration'])
        details = {
            'description': snippet.get('description', ''),
            'channel_id': snippet['channelId'],
            'likes': stats.get('likeCount', '0'),
            'views': stats.get('viewCount', '0'),
            'comments': stats.get('commentCount', '0'),
            'stages': snippet.get('liveBroadcastContent', ''),
            'category': snippet.get('categoryId', ''),
            'licensed_content': content_details.get('licensedContent', False),
            'duration': duration,
            'comments_enabled': 'commentCount' in stats
        }
    return details

def parse_duration(duration):
    import isodate
    duration = isodate.parse_duration(duration)
    return duration.total_seconds() / 60  # Convert to minutes

def get_top_comments(api_key, video_id):
    url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}&order=relevance&maxResults=3"
    response = requests.get(url).json()
    top_comments = []
    if 'items' in response:
        for item in response['items']:
            top_comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
    return top_comments

def collect_videos(api_key, search_query, order, max_videos):
    collected_data = []
    next_page_token = None
    while len(collected_data) < max_videos:
        url = f"https://www.googleapis.com/youtube/v3/search?key={api_key}&q={search_query}&part=snippet&type=video&maxResults={min(50, max_videos-len(collected_data))}&order={order}&pageToken={next_page_token or ''}"
        response = requests.get(url).json()
        if "items" not in response:
            break
        
        for item in response["items"]:
            video_id = item["id"]["videoId"]
            video_details = get_video_details(api_key, video_id)
            duration = video_details.get('duration', 0)
            if 4 <= duration <= 20:
                channel_details = get_channel_details(api_key, video_details['channel_id'])
                top_comments = get_top_comments(api_key, video_id)
                
                collected_data.append({
                    "search_query": search_query,
                    "video_id": video_id,
                    "title": item["snippet"]["title"],
                    "video_url": f"https://www.youtube.com/watch?v={video_id}",
                    "description": video_details['description'],
                    "channel_id": video_details['channel_id'],
                    "subscriber_count": channel_details['subscriber_count'],
                    "likes": video_details['likes'],
                    "views": video_details['views'],
                    "comments": video_details['comments'],
                    "top_comments": top_comments,
                    "stages": video_details['stages'],
                    "category": video_details['category'],
                    "licensed_content": video_details['licensed_content'],
                    "duration": video_details['duration'],
                    "comments_enabled": video_details['comments_enabled'],
                    "order": order
                })
                
                if len(collected_data) >= max_videos:
                    break
        
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
    
    return collected_data

# Your API Key
api_key = "AIzaSyDUx6ksTMOX5PdEJccfUdRj5MsG_AcySYI"

# Define a list of search queries
search_queries = ["metoo my story"]
videos_per_query = 200  # Total videos per category
videos_per_type = 100  # Number of videos per relevance and recent type

# Placeholder for collected video data
video_data = []

for search_query in search_queries:
    # Collect relevance videos
    video_data += collect_videos(api_key, search_query, 'relevance', videos_per_type)
    # Collect recent videos
    video_data += collect_videos(api_key, search_query, 'date', videos_per_type)

# Save video data to a CSV file
df = pd.DataFrame(video_data)
df.to_csv('metoo_my_story_videos.csv', index=False)

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

def scrape_tiktok(search_query, max_videos):
    driver = webdriver.Chrome()
    driver.get(f"https://www.tiktok.com/search?q={search_query}")
    time.sleep(5)  # Adjust as necessary

    videos = []
    while len(videos) < max_videos:
        # Find video elements
        video_elements = driver.find_elements(By.XPATH, "//div[@class='video-feed-item-wrapper']")
        for elem in video_elements:
            if len(videos) >= max_videos:
                break
            title = elem.find_element(By.XPATH, ".//a").text
            video_url = elem.find_element(By.XPATH, ".//a").get_attribute("href")
            likes = elem.find_element(By.XPATH, ".//span[@class='like-count']").text
            views = elem.find_element(By.XPATH, ".//span[@class='play-count']").text
            videos.append({
                "title": title,
                "video_url": video_url,
                "likes": likes,
                "views": views,
                "search_query": search_query,
                "platform": "TikTok"
            })
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Adjust as necessary

    driver.quit()
    return videos

def scrape_instagram(search_query, max_videos):
    driver = webdriver.Chrome()
    driver.get(f"https://www.instagram.com/explore/tags/{search_query}/")
    time.sleep(5)  # Adjust as necessary

    videos = []
    while len(videos) < max_videos:
        # Find reel elements
        reel_elements = driver.find_elements(By.XPATH, "//div[@class='v1Nh3 kIKUG  _bz0w']")
        for elem in reel_elements:
            if len(videos) >= max_videos:
                break
            video_url = elem.find_element(By.XPATH, ".//a").get_attribute("href")
            elem.click()
            time.sleep(3)  # Adjust as necessary
            likes = driver.find_element(By.XPATH, "//div[@class='Nm9Fw']/button/span").text
            views = driver.find_element(By.XPATH, "//span[@class='vcOH2']").text
            title = driver.find_element(By.XPATH, "//div[@class='C4VMK']/span").text
            videos.append({
                "title": title,
                "video_url": video_url,
                "likes": likes,
                "views": views,
                "search_query": search_query,
                "platform": "Instagram"
            })
            driver.back()
            time.sleep(2)  # Adjust as necessary
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Adjust as necessary

    driver.quit()
    return videos

# Define search query and video count
search_query = "black women disparities medicine"
videos_per_platform = 50  # 100 videos total, 50 from TikTok and 50 from Instagram

# Scrape TikTok videos
tiktok_videos = scrape_tiktok(search_query, videos_per_platform)

# Scrape Instagram Reels
instagram_videos = scrape_instagram(search_query, videos_per_platform)

# Combine data and save to CSV
all_videos = tiktok_videos + instagram_videos
df = pd.DataFrame(all_videos)
df.to_csv('black_women_disparities_medicine_videos.csv', index=False)


ModuleNotFoundError: No module named 'selenium'

In [4]:
import requests
import pandas as pd

def get_tiktok_data(search_query, max_videos):
    api_url = "https://www.tiktok.com/api/search"  # Example URL, might not be accurate
    params = {
        'query': search_query,
        'limit': max_videos
    }
    response = requests.get(api_url, params=params)
    data = response.json()
    videos = []
    for item in data['results']:
        video = {
            "title": item.get('title', ''),
            "video_url": item.get('video_url', ''),
            "likes": item.get('likes', 0),
            "views": item.get('views', 0),
            "search_query": search_query,
            "platform": "TikTok"
        }
        videos.append(video)
    return videos

def get_instagram_data(search_query, max_videos):
    access_token = "YOUR_INSTAGRAM_ACCESS_TOKEN"  # You need to get this token from Instagram
    api_url = f"https://graph.instagram.com/v11.0/search"
    params = {
        'q': search_query,
        'access_token': access_token,
        'limit': max_videos
    }
    response = requests.get(api_url, params=params)
    data = response.json()
    videos = []
    for item in data['data']:
        video = {
            "title": item.get('caption', ''),
            "video_url": item.get('permalink', ''),
            "likes": item.get('like_count', 0),
            "views": item.get('view_count', 0),
            "search_query": search_query,
            "platform": "Instagram"
        }
        videos.append(video)
    return videos

# Define search query and video count
search_query = "black women disparities medicine"
videos_per_platform = 50  # 100 videos total, 50 from TikTok and 50 from Instagram

# Get TikTok videos
tiktok_videos = get_tiktok_data(search_query, videos_per_platform)

# Get Instagram Reels
instagram_videos = get_instagram_data(search_query, videos_per_platform)

# Combine data and save to CSV
all_videos = tiktok_videos + instagram_videos
df = pd.DataFrame(all_videos)
df.to_csv('black_women_disparities_medicine_videos.csv', index=False)


KeyError: 'results'

In [6]:
import requests
import pandas as pd

def get_channel_details(api_key, channel_id):
    url = f"https://www.googleapis.com/youtube/v3/channels?part=statistics&id={channel_id}&key={api_key}"
    response = requests.get(url).json()
    if 'items' in response and len(response['items']) > 0:
        stats = response['items'][0]['statistics']
        return {
            'subscriber_count': stats.get('subscriberCount', '0')
        }
    return {'subscriber_count': '0'}

def get_video_details(api_key, video_id):
    url = f"https://www.googleapis.com/youtube/v3/videos?part=statistics,snippet,contentDetails&id={video_id}&key={api_key}"
    response = requests.get(url).json()
    details = {}
    if 'items' in response and len(response['items']) > 0:
        stats = response['items'][0]['statistics']
        snippet = response['items'][0]['snippet']
        content_details = response['items'][0]['contentDetails']
        duration = parse_duration(content_details['duration'])
        details = {
            'description': snippet.get('description', ''),
            'channel_id': snippet['channelId'],
            'likes': stats.get('likeCount', '0'),
            'views': stats.get('viewCount', '0'),
            'comments': stats.get('commentCount', '0'),
            'stages': snippet.get('liveBroadcastContent', ''),
            'category': snippet.get('categoryId', ''),
            'licensed_content': content_details.get('licensedContent', False),
            'duration': duration,
            'comments_enabled': 'commentCount' in stats
        }
    return details

def parse_duration(duration):
    import isodate
    duration = isodate.parse_duration(duration)
    return duration.total_seconds() / 60  # Convert to minutes

def get_top_comments(api_key, video_id):
    url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}&order=relevance&maxResults=3"
    response = requests.get(url).json()
    top_comments = []
    if 'items' in response:
        for item in response['items']:
            top_comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
    return top_comments

def collect_videos(api_key, search_query, order, max_videos):
    collected_data = []
    next_page_token = None
    while len(collected_data) < max_videos:
        url = f"https://www.googleapis.com/youtube/v3/search?key={api_key}&q={search_query}&part=snippet&type=video&maxResults={min(50, max_videos-len(collected_data))}&order={order}&pageToken={next_page_token or ''}"
        response = requests.get(url).json()
        if "items" not in response:
            break
        
        for item in response["items"]:
            video_id = item["id"]["videoId"]
            video_details = get_video_details(api_key, video_id)
            duration = video_details.get('duration', 0)
            if 4 <= duration <= 20:  # Example duration filter in minutes
                channel_details = get_channel_details(api_key, video_details['channel_id'])
                top_comments = get_top_comments(api_key, video_id)
                
                collected_data.append({
                    "search_query": search_query,
                    "video_id": video_id,
                    "title": item["snippet"]["title"],
                    "video_url": f"https://www.youtube.com/watch?v={video_id}",
                    "description": video_details['description'],
                    "channel_id": video_details['channel_id'],
                    "subscriber_count": channel_details['subscriber_count'],
                    "likes": video_details['likes'],
                    "views": video_details['views'],
                    "comments": video_details['comments'],
                    "top_comments": top_comments,
                    "stages": video_details['stages'],
                    "category": video_details['category'],
                    "licensed_content": video_details['licensed_content'],
                    "duration": video_details['duration'],
                    "comments_enabled": video_details['comments_enabled'],
                    "order": order
                })
                
                if len(collected_data) >= max_videos:
                    break
        
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
    
    return collected_data

# Your API Key
api_key = "AIzaSyCGxK6OOPUk7DCODbiFy2jZ27bF3Mlxgu4"

# Define the search query and video count
search_query = "black women disparities medicine"
videos_per_type = 50  # Number of videos per relevance and recent type (total 100)

# Placeholder for collected video data
video_data = []

# Collect relevance videos
video_data += collect_videos(api_key, search_query, 'relevance', videos_per_type)
# Collect recent videos
video_data += collect_videos(api_key, search_query, 'date', videos_per_type)

# Save video data to a CSV file
df = pd.DataFrame(video_data)
df.to_csv('black_women_disparities_medicine_videos.csv', index=False)


ModuleNotFoundError: No module named 'isodate'

In [8]:
pip install requests isodate pandas

Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
   ---------------------------------------- 0.0/41.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/41.7 kB ? eta -:--:--
   --------- ------------------------------ 10.2/41.7 kB ? eta -:--:--
   ----------------------------- ---------- 30.7/41.7 kB 325.1 kB/s eta 0:00:01
   ---------------------------------------- 41.7/41.7 kB 334.9 kB/s eta 0:00:00
Installing collected packages: isodate
Successfully installed isodate-0.6.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install requests isodate pandas


Note: you may need to restart the kernel to use updated packages.


In [12]:
import requests
import pandas as pd
import isodate

def get_channel_details(api_key, channel_id):
    url = f"https://www.googleapis.com/youtube/v3/channels?part=statistics&id={channel_id}&key={api_key}"
    response = requests.get(url).json()
    if 'items' in response and len(response['items']) > 0:
        stats = response['items'][0]['statistics']
        return {
            'subscriber_count': stats.get('subscriberCount', '0')
        }
    return {'subscriber_count': '0'}

def get_video_details(api_key, video_id):
    url = f"https://www.googleapis.com/youtube/v3/videos?part=statistics,snippet,contentDetails&id={video_id}&key={api_key}"
    response = requests.get(url).json()
    details = {}
    if 'items' in response and len(response['items']) > 0:
        stats = response['items'][0]['statistics']
        snippet = response['items'][0]['snippet']
        content_details = response['items'][0]['contentDetails']
        duration = parse_duration(content_details['duration'])
        details = {
            'description': snippet.get('description', ''),
            'channel_id': snippet['channelId'],
            'likes': stats.get('likeCount', '0'),
            'views': stats.get('viewCount', '0'),
            'comments': stats.get('commentCount', '0'),
            'stages': snippet.get('liveBroadcastContent', ''),
            'category': snippet.get('categoryId', ''),
            'licensed_content': content_details.get('licensedContent', False),
            'duration': duration,
            'comments_enabled': 'commentCount' in stats
        }
    return details

def parse_duration(duration):
    duration = isodate.parse_duration(duration)
    return duration.total_seconds() / 60  # Convert to minutes

def get_top_comments(api_key, video_id):
    url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}&order=relevance&maxResults=3"
    response = requests.get(url).json()
    top_comments = []
    if 'items' in response:
        for item in response['items']:
            top_comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
    return top_comments

def collect_videos(api_key, search_query, order, max_videos):
    collected_data = []
    next_page_token = None
    while len(collected_data) < max_videos:
        url = f"https://www.googleapis.com/youtube/v3/search?key={api_key}&q={search_query}&part=snippet&type=video&maxResults={min(50, max_videos-len(collected_data))}&order={order}&pageToken={next_page_token or ''}"
        response = requests.get(url).json()
        if "items" not in response:
            break
        
        for item in response["items"]:
            video_id = item["id"]["videoId"]
            video_details = get_video_details(api_key, video_id)
            duration = video_details.get('duration', 0)
            if 4 <= duration <= 20:  # Example duration filter in minutes
                channel_details = get_channel_details(api_key, video_details['channel_id'])
                top_comments = get_top_comments(api_key, video_id)
                
                collected_data.append({
                    "search_query": search_query,
                    "video_id": video_id,
                    "title": item["snippet"]["title"],
                    "video_url": f"https://www.youtube.com/watch?v={video_id}",
                    "description": video_details['description'],
                    "channel_id": video_details['channel_id'],
                    "subscriber_count": channel_details['subscriber_count'],
                    "likes": video_details['likes'],
                    "views": video_details['views'],
                    "comments": video_details['comments'],
                    "top_comments": top_comments,
                    "stages": video_details['stages'],
                    "category": video_details['category'],
                    "licensed_content": video_details['licensed_content'],
                    "duration": video_details['duration'],
                    "comments_enabled": video_details['comments_enabled'],
                    "order": order
                })
                
                if len(collected_data) >= max_videos:
                    break
        
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
    
    return collected_data

# Your API Key
api_key = "AIzaSyCGxK6OOPUk7DCODbiFy2jZ27bF3Mlxgu4"

# Define the search query and video count
search_query = "black women disparities medicine"
videos_per_type = 50  # Number of videos per relevance and recent type (total 100)

# Placeholder for collected video data
video_data = []

# Collect relevance videos
video_data += collect_videos(api_key, search_query, 'relevance', videos_per_type)
# Collect recent videos
video_data += collect_videos(api_key, search_query, 'date', videos_per_type)

# Save video data to a CSV file
df = pd.DataFrame(video_data)
df.to_csv('black_women_disparities_medicine_videos.csv', index=False)
