In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('data/SnapUGC/train_out.txt', sep='\t') # remember to remove the quotation at line 342 for train_out.txt
test_df = pd.read_csv('data/SnapUGC/train_out.txt', sep='\t')

len(train_df), len(test_df)

(113810, 113810)

In [3]:
train_df.dtypes, test_df.dtypes

(Id                object
 Video_len        float64
 order of ECR       int64
 order of NAWP      int64
 Title             object
 Description       object
 Link              object
 dtype: object,
 Id                object
 Video_len        float64
 order of ECR       int64
 order of NAWP      int64
 Title             object
 Description       object
 Link              object
 dtype: object)

In [4]:
train_df[train_df['Link'].isna()]

Unnamed: 0,Id,Video_len,order of ECR,order of NAWP,Title,Description,Link


In [5]:
test_df[test_df['Link'].isna()]

Unnamed: 0,Id,Video_len,order of ECR,order of NAWP,Title,Description,Link


In [6]:
import asyncio
import aiohttp
from aiohttp import ClientSession, TCPConnector
from tqdm import tqdm
import os
from urllib.parse import urlparse
import nest_asyncio

nest_asyncio.apply()

# Semaphore to limit concurrency
semaphore = asyncio.Semaphore(10)  # Adjust concurrency limit as needed

# Download a single video with retry, exponential backoff, and persistent connection
async def download_video(session: ClientSession, url: str, save_dir: str, name: str, retries=3):
    temp_filename = os.path.join(save_dir, f"{name}.mp4.part")
    final_filename = os.path.join(save_dir, f"{name}.mp4")
    
    if os.path.exists(final_filename):
        return None

    async with semaphore:  # Enforce concurrency limit
        for attempt in range(retries):
            try:
                async with session.get(url) as response:
                    if response.status == 200:
                        # Save the video to a temporary file
                        with open(temp_filename, "wb") as f:
                            f.write(await response.read())
                        # Rename to final filename
                        os.rename(temp_filename, final_filename)
                        return None  # Success
                    else:
                        error_message = f"Failed to download {url}, status: {response.status}"
                        if attempt < retries - 1:
                            await asyncio.sleep(2 ** attempt)  # Exponential backoff
                        else:
                            return error_message
            except Exception as e:
                if attempt < retries - 1:
                    await asyncio.sleep(2 ** attempt)  # Exponential backoff
                else:
                    return f"Error downloading {url}: {e}"

# Validate URLs
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

# Main function to download all videos with a persistent connection
async def download_all_videos(urls, ids, save_dir, error_log_file):
    # Create a connector for persistent connections
    connector = TCPConnector(limit_per_host=10)  # Adjust as needed
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = []
        for url, id in zip(urls, ids):
            if is_valid_url(url):
                tasks.append(download_video(session, url, save_dir, id))
            else:
                print(f"Invalid URL: {url}")
        
        # Process tasks with tqdm for progress tracking
        with open(error_log_file, "w") as log_file:
            for task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Downloading videos"):
                result = await task
                if result:  # Log only errors
                    log_file.write(result + "\n")


In [None]:
# Directory to save videos
TRAIN_VIDEOS_DIR = "/mnt/dat/thes/Train"
TEST_VIDEOS_DIR = "/mnt/dat/thes/Test"
os.makedirs(TRAIN_VIDEOS_DIR, exist_ok=True)
os.makedirs(TEST_VIDEOS_DIR, exist_ok=True)

await download_all_videos(train_df['Link'], train_df['Id'], TRAIN_VIDEOS_DIR, 'data/SnapUGC/train_download_log.txt')
await download_all_videos(test_df['Link'], test_df['Id'], TEST_VIDEOS_DIR, 'data/SnapUGC/test_download_log.txt')

Downloading videos:  95%|█████████▍| 107597/113810 [00:41<00:05, 1219.39it/s] 