In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import glob

In [2]:
# Directories to save videos. The storage root defaults to the original
# location; set the SNAPUGC_VIDEO_ROOT environment variable before starting
# the kernel to redirect it on another machine.
VIDEO_ROOT = os.environ.get("SNAPUGC_VIDEO_ROOT", "/mnt/dat/thes")
TRAIN_VIDEOS_DIR = os.path.join(VIDEO_ROOT, "Train")
TEST_VIDEOS_DIR = os.path.join(VIDEO_ROOT, "Test")
os.makedirs(TRAIN_VIDEOS_DIR, exist_ok=True)
os.makedirs(TEST_VIDEOS_DIR, exist_ok=True)

In [3]:
import csv

# NOTE: per the original author's note, train_out.txt contains an unclosed
# quotation mark at line 342. Either remove it from the file by hand before
# running this cell, or pass quoting=csv.QUOTE_NONE to pd.read_csv instead.
train_df = pd.read_csv('data/SnapUGC/train_out.txt', sep='\t')

test_df = pd.read_csv('data/SnapUGC/test_out.txt', sep='\t')

# Row counts of the two splits, displayed as a (train, test) tuple.
len(train_df), len(test_df)

(113810, 15640)

In [4]:
# Display the column dtypes of both splits as a (train, test) tuple.
train_df.dtypes, test_df.dtypes

(Id                object
 Video_len        float64
 order of ECR       int64
 order of NAWP      int64
 Title             object
 Description       object
 Link              object
 dtype: object,
 Id                object
 Video_len        float64
 order of ECR       int64
 order of NAWP      int64
 Title             object
 Description       object
 Link              object
 dtype: object)

In [5]:
# Show the training rows whose 'Link' value is missing.
train_df[train_df['Link'].isna()]

Unnamed: 0,Id,Video_len,order of ECR,order of NAWP,Title,Description,Link


In [6]:
# Show the test rows whose 'Link' value is missing.
test_df[test_df['Link'].isna()]

Unnamed: 0,Id,Video_len,order of ECR,order of NAWP,Title,Description,Link


In [7]:
# List the .mp4 files currently present in the training video directory.
# NOTE(review): this displays every matching path; consider a count or a short slice.
glob.glob(os.path.join(TRAIN_VIDEOS_DIR, '*.mp4'))

['/mnt/dat/thes/Train/e0d0c5003ace0db8441f126acca23763.mp4',
 '/mnt/dat/thes/Train/d5cf30e9cde207914c63ee418c44f9da.mp4',
 '/mnt/dat/thes/Train/59758d35d60f814d28fa96d838f26ff8.mp4',
 '/mnt/dat/thes/Train/8de7a01eb31b520e27fd100afb4ed26c.mp4',
 '/mnt/dat/thes/Train/b846bfc3a8c5520f29b5d0d6b1f09028.mp4',
 '/mnt/dat/thes/Train/9765149f67b556cb6c34d11f16c6858f.mp4',
 '/mnt/dat/thes/Train/c1d82c701f58d2c4b34cfc797637598b.mp4',
 '/mnt/dat/thes/Train/0f26d736921d31ce624fe447c32c39e3.mp4',
 '/mnt/dat/thes/Train/1d4cda9bb101d18b7196e6d027ec030f.mp4',
 '/mnt/dat/thes/Train/81c67e2caf2f781c32a412d71e57d455.mp4',
 '/mnt/dat/thes/Train/382db6fb1acb5c49eb1e3c13ba66a8bc.mp4',
 '/mnt/dat/thes/Train/8629353280d2a654148f4c2fbfb7933a.mp4',
 '/mnt/dat/thes/Train/b8215c5531bbb777bd6b8c2124699aab.mp4',
 '/mnt/dat/thes/Train/c6fd1af44a764cc9e0f2bafba25cbbdc.mp4',
 '/mnt/dat/thes/Train/7879fe041f2656b57bc486b6439fcca3.mp4',
 '/mnt/dat/thes/Train/712c674a8014e7f49c2c3a131c96190b.mp4',
 '/mnt/dat/thes/Train/cb

In [None]:
import aiohttp
from aiohttp import ClientSession, TCPConnector
import asyncio
from tqdm import tqdm
import os
from urllib.parse import urlparse
import nest_asyncio

nest_asyncio.apply()

# Semaphore to limit concurrency
semaphore = asyncio.Semaphore(50)  # Adjust concurrency limit as needed

# Download a single video with retry, exponential backoff, and persistent connection
async def download_video(session: "ClientSession", url: str, save_dir: str, name: str, retries=3):
    """Download ``url`` to ``<save_dir>/<name>.mp4`` unless that file already exists.

    The response body is streamed in chunks into ``<name>.mp4.part`` and the
    file is moved to its final name only after the whole body was written, so
    a ``.mp4`` file on disk always comes from a completed transfer. A partial
    file left behind by a failed attempt is deleted.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and ``content.iter_chunked``
            (an ``aiohttp.ClientSession`` in this notebook).
        url: Address to fetch.
        save_dir: Existing directory that receives the file.
        name: File stem; ``.mp4`` is appended.
        retries: Total number of attempts. Attempt ``k`` (0-based) is followed
            by a ``2 ** k`` second pause unless it was the last attempt.

    Returns:
        ``None`` when the file already existed or was downloaded; otherwise
        the error message of the last failed attempt.
    """
    temp_filename = os.path.join(save_dir, f"{name}.mp4.part")
    final_filename = os.path.join(save_dir, f"{name}.mp4")

    if os.path.exists(final_filename):
        return None

    error_message = None
    async with semaphore:  # Enforce concurrency limit
        for attempt in range(retries):
            try:
                async with session.get(url) as response:
                    if response.status == 200:
                        # Stream to disk instead of buffering the whole video in memory.
                        with open(temp_filename, "wb") as f:
                            async for chunk in response.content.iter_chunked(64 * 1024):
                                f.write(chunk)
                        # os.replace overwrites an existing target on every platform.
                        os.replace(temp_filename, final_filename)
                        return None  # Success
                    error_message = f"Failed to download {url}, status: {response.status}"
            except Exception as e:
                error_message = f"Error downloading {url}: {e}"
                # Drop the truncated file so it is not left lying next to real videos.
                try:
                    os.remove(temp_filename)
                except OSError:
                    pass  # nothing was written yet, or the file is already gone

            if attempt < retries - 1:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff
    return error_message

# Validate URLs
def is_valid_url(url):
    """Return True when ``url`` parses with both a scheme and a network location.

    Values that cannot be parsed at all return False instead of raising. This
    includes non-string cells such as the float NaN pandas uses for a missing
    link, which make ``urlparse`` fail with AttributeError rather than ValueError.
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except (ValueError, TypeError, AttributeError):
        return False

# Main function to download all videos with a persistent connection
async def download_all_videos(urls, ids, save_dir, error_log_file):
    """Download every (url, id) pair into ``save_dir`` and append failures to a log.

    ``urls`` and ``ids`` are walked in parallel. A URL rejected by
    ``is_valid_url`` is reported with ``print`` and skipped; every other pair
    becomes one ``download_video`` call sharing a single client session. Each
    non-empty result returned by ``download_video`` is appended as one line to
    ``error_log_file``.
    """
    # One shared connector so connections are reused across downloads
    pooled_connector = TCPConnector(limit_per_host=50)  # Adjust as needed
    async with aiohttp.ClientSession(connector=pooled_connector) as session:
        pending = []
        for link, video_id in zip(urls, ids):
            if not is_valid_url(link):
                print(f"Invalid URL: {link}")
                continue
            pending.append(download_video(session, link, save_dir, video_id))

        # Consume downloads in completion order; tqdm tracks overall progress
        with open(error_log_file, "a") as log_file:
            progress = tqdm(
                asyncio.as_completed(pending),
                total=len(pending),
                desc="Downloading videos",
            )
            for finished in progress:
                outcome = await finished
                if not outcome:
                    continue  # success or already on disk
                log_file.write(outcome + "\n")


# Ignore 404 links

In [18]:
def get_404_links(log_file):
    """Return the URLs that a download log records as failing with HTTP 404.

    Args:
        log_file: Path to a log holding one error message per line.

    Returns:
        List of URLs, in file order, taken from lines of the form
        ``Failed to download <url>, status: 404``. Empty when the file is
        missing or contains no entries.

    Raises:
        ValueError: If any non-blank line is not a 404 failure entry.
    """
    if not os.path.exists(log_file):
        return []

    pattern = re.compile(r"Failed to download (https?://\S+), status: 404")
    links = []
    with open(log_file, 'r') as file:
        for line in file:
            entry = line.strip()
            if not entry:
                # The downloader opens its log in append mode, so an error-free
                # run leaves an empty file; blank content is not a bad entry.
                continue
            # Check each line on its own so one malformed line cannot be
            # masked by another line that happens to match twice.
            match = pattern.fullmatch(entry)
            if match is None:
                raise ValueError("Encountered a line that does not contain 'status: 404'.")
            links.append(match.group(1))

    return links

In [19]:
# Per-split error logs; the downloader appends one line per failed download.
train_logs = 'data/SnapUGC/train_download_log.txt'
test_logs = 'data/SnapUGC/test_download_log.txt'

# URLs already recorded as HTTP 404 in earlier runs (empty list when a log does not exist yet).
train_fails = get_404_links(train_logs)
test_fails = get_404_links(test_logs)

In [20]:
# Drop rows whose link is already known to return 404 so they are not requested again.
sub_train_df = train_df[~train_df['Link'].isin(train_fails)]
sub_test_df = test_df[~test_df['Link'].isin(test_fails)]

print(f"Remaining: {len(sub_train_df)} for train, {len(sub_test_df)} for test")
# Top-level await: relies on the notebook's running event loop.
# Files that already exist in the target directory are skipped by download_video.
await download_all_videos(sub_train_df['Link'], sub_train_df['Id'], TRAIN_VIDEOS_DIR, train_logs)
await download_all_videos(sub_test_df['Link'], sub_test_df['Id'], TEST_VIDEOS_DIR, test_logs)

Remaining: 108792 for train, 15640 for test


Downloading videos: 100%|██████████| 108792/108792 [01:48<00:00, 1001.71it/s] 
Downloading videos: 100%|██████████| 15640/15640 [13:02<00:00, 19.99it/s]
