In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('data/SnapUGC/train_out.txt', sep='\t') # remember to remove the quotation at line 342 for train_out.txt
test_df = pd.read_csv('data/SnapUGC/train_out.txt', sep='\t')

len(train_df), len(test_df)

(113810, 113810)

In [3]:
train_df.dtypes, test_df.dtypes

(Id                object
 Video_len        float64
 order of ECR       int64
 order of NAWP      int64
 Title             object
 Description       object
 Link              object
 dtype: object,
 Id                object
 Video_len        float64
 order of ECR       int64
 order of NAWP      int64
 Title             object
 Description       object
 Link              object
 dtype: object)

In [4]:
import asyncio
import aiohttp
from aiohttp import ClientSession
from tqdm import tqdm
import os

# Download a single video
async def download_video(session: ClientSession, url: str, save_dir: str, name: str):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                # Save the video with a unique filename
                filename = os.path.join(save_dir, f"{name}.mp4")
                with open(filename, "wb") as f:
                    f.write(await response.read())
                return None  # No error
            else:
                return f"Failed to download {url}, status: {response.status}"
    except Exception as e:
        return f"Error downloading {url}: {e}"

# Main function to download all videos
async def download_all_videos(urls, ids, save_dir, error_log_file):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url, id in zip(urls, ids):
            tasks.append(download_video(session, url, save_dir, id))
        
        # Use tqdm for progress visualization
        with open(error_log_file, "w") as log_file:
            for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Downloading videos"):
                result = await task
                if result:  # Log only errors
                    log_file.write(result + "\n")

In [None]:
# Directory to save videos
TRAIN_VIDEOS_DIR = "/mnt/d/Thesis/Data/Video/Train"
TEST_VIDEOS_DIR = "/mnt/d/Thesis/Data/Video/Test"
os.makedirs(TRAIN_VIDEOS_DIR, exist_ok=False)
os.makedirs(TEST_VIDEOS_DIR, exist_ok=False)

await download_all_videos(train_df['Link'], train_df['Id'], TRAIN_VIDEOS_DIR, 'data/SnapUGC/train_download_log.txt')
await download_all_videos(test_df['Link'], train_df['Id'], TEST_VIDEOS_DIR, 'data/SnapUGC/test_download_log.txt')

Downloading videos: 100%|██████████| 113810/113810 [05:27<00:00, 347.88it/s]  
Downloading videos: 100%|██████████| 113810/113810 [05:28<00:00, 346.50it/s]  
