# XAI FOR YOUTUBE ENGAGEMENT

StudentID|Full Name
-|-
21127050|Tran Nguyen Huan 
21127240|Nguyen Phat Dat

## Part 1: Crawl the necessary `YouTube` data for the videos listed in `Entube.csv`

## Import necessary libraries

In [None]:
import os

import pandas as pd
import requests
import yt_dlp

### Crawl features `Title`, `Tags`, and `Thumbnail`

In [None]:
# API key sent as the "key" query parameter of every metadata request.
# NOTE(review): "*" looks like a redacted placeholder — presumably a real
# YouTube Data API key must be supplied before running; confirm.
API_KEY = "*"
# Endpoint that get_video_metadata queries for per-video "snippet" data.
YOUTUBE_API_URL = "https://www.googleapis.com/youtube/v3/videos"

In [None]:
def get_video_metadata(video_id, timeout=10):
    """Fetch the title, tags and thumbnail URL of one video from the YouTube API.

    Args:
        video_id: ID of the video to look up.
        timeout: Seconds to wait for the API before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        A dict with the keys ``"Title"``, ``"Tags"`` and ``"Thumbnail"``.
        ``"Tags"`` is a list (empty when the video has no tags) and
        ``"Thumbnail"`` is ``None`` when no thumbnail URL is available.
        All three values are ``None`` when the request fails or the API
        returns no item for ``video_id``; the problem is printed, not raised.
    """
    no_metadata = {"Title": None, "Tags": None, "Thumbnail": None}
    params = {
        "part": "snippet",
        "id": video_id,
        "key": API_KEY,
    }

    # Only the network call and JSON decoding are guarded: this is a
    # best-effort crawl, so one failing video must not abort the others.
    try:
        response = requests.get(YOUTUBE_API_URL, params=params, timeout=timeout)
        # Surface HTTP errors (bad key, exhausted quota, ...) as errors
        # instead of reporting them as "No data found".
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error processing video ID {video_id}: {e}")
        return no_metadata

    items = data.get("items") if isinstance(data, dict) else None
    if not items:
        print(f"No data found for video ID: {video_id}")
        return no_metadata

    snippet = items[0].get("snippet") or {}

    # Prefer the "high" thumbnail but fall back to a smaller one, so that a
    # missing size does not discard the title and tags as well.
    thumbnails = snippet.get("thumbnails") or {}
    thumbnail_url = None
    for size in ("high", "medium", "default"):
        if size in thumbnails:
            thumbnail_url = thumbnails[size].get("url")
            break

    return {
        "Title": snippet.get("title"),
        "Tags": snippet.get("tags", []),
        "Thumbnail": thumbnail_url,
    }

In [None]:
# Load the dataset that lists the videos to crawl.
data = pd.read_csv("data/Entube.csv")

# Look up every video once, in dataset order, and tabulate the results.
video_data = list(map(get_video_metadata, data["video_id"]))
crawled_df = pd.DataFrame(video_data)

# Put the crawled columns next to the original ones and write the result.
result_df = pd.concat([data, crawled_df], axis=1)
result_df.to_csv("entube_with_metadata.csv", index=False)
print("Crawling completed and saved to entube_with_metadata.csv")

## Crawl feature `audio`

In [None]:
def download_audio(youtube_url, output_path='data/audio/'):
    """Download the audio track of a YouTube video as an MP3 file using yt-dlp.

    Args:
        youtube_url: URL of the video whose audio is wanted.
        output_path: Directory the file is written to. The file is named
            after the video ID.

    Returns:
        None. Success or failure is reported on stdout; errors are not
        re-raised so that a batch run continues with the next video.
    """
    ydl_opts = {
        # os.path.join keeps the template valid even when output_path has no
        # trailing separator.
        'outtmpl': os.path.join(output_path, '%(id)s.%(ext)s'),  # Save with video ID as filename
        # Only the audio stream is needed; fetching the video stream as well
        # would waste bandwidth because it is discarded after extraction.
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',  # Extract audio in mp3 format
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])
        print(f"Downloaded: {youtube_url}")
    except Exception as e:
        # Deliberately broad: this is the per-video boundary of a best-effort
        # batch download, so any failure is reported and skipped.
        print(f"Error downloading {youtube_url}: {e}")

def download_audios_from_csv(csv_file, output_path='data/audio/'):
    """Download the audio of every YouTube video listed in a dataset.

    Args:
        csv_file: Path to a CSV file with a ``video_link`` column, or an
            already-loaded ``pandas.DataFrame`` that has that column.
        output_path: Directory handed to ``download_audio`` for each video.
    """
    # Accept a DataFrame as well as a path: pd.read_csv cannot read a
    # DataFrame, and this notebook loads the dataset before calling.
    if isinstance(csv_file, pd.DataFrame):
        dataset = csv_file
    else:
        dataset = pd.read_csv(csv_file)

    # Only the URL column is needed, so iterate it directly.
    for youtube_url in dataset['video_link']:
        download_audio(youtube_url, output_path)

In [None]:
# download_audios_from_csv reads the CSV itself, so it must be given the file
# path; passing the loaded DataFrame makes pd.read_csv fail.
audio_csv_path = "data/Entube.csv"
data = pd.read_csv(audio_csv_path)  # `data` stays the loaded dataset, as before

# Download audio for all videos in the dataset
download_audios_from_csv(audio_csv_path)


## Crawl feature `video content`

In [None]:
def download_video(youtube_url, output_path='data/video/'):
    """Download a YouTube video as an MP4 file using yt-dlp.

    Args:
        youtube_url: URL of the video to download.
        output_path: Directory the file is written to. The file is named
            after the video ID.

    Returns:
        The path of the downloaded ``.mp4`` file, or ``None`` when the
        download fails (the error is printed, not raised).
    """
    ydl_opts = {
        'format': 'mp4',
        # os.path.join keeps the template valid even when output_path has no
        # trailing separator.
        'outtmpl': os.path.join(output_path, '%(id)s.%(ext)s'),
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=True)
            # Built the same way as the output template so the returned path
            # matches the file written for the requested 'mp4' format.
            video_path = os.path.join(output_path, f"{info['id']}.mp4")
            print(f"Video đã tải về: {video_path}")
            return video_path
    except Exception as e:
        # Deliberately broad: this is the per-video boundary of a best-effort
        # batch download, so any failure is reported and skipped.
        print(f"Error when downloading video: {e}")
        return None

def download_videos_from_csv(csv_file, output_path='data/video/'):
    """Download every YouTube video listed in a dataset.

    Args:
        csv_file: Path to a CSV file with a ``video_link`` column, or an
            already-loaded ``pandas.DataFrame`` that has that column.
        output_path: Directory handed to ``download_video`` for each video.
    """
    # Accept a DataFrame as well as a path, since pd.read_csv cannot read a
    # DataFrame that the caller has already loaded.
    if isinstance(csv_file, pd.DataFrame):
        dataset = csv_file
    else:
        dataset = pd.read_csv(csv_file)

    # Only the URL column is needed, so iterate it directly.
    for youtube_url in dataset['video_link']:
        download_video(youtube_url, output_path)

In [None]:
# Path of the CSV listing the videos. Spelled like the other cells
# ("Entube.csv"): the previous 'EnTube.csv' differs in case and would not be
# found on a case-sensitive filesystem.
# NOTE(review): confirm the actual file name on disk.
data = 'data/Entube.csv'

download_videos_from_csv(data)