In [1]:
import pandas as pd
import requests
from requests.exceptions import RequestException
import csv
import time

# CSV files to scan, processed in the order listed.
input_files = [
    "football.csv",
    "tennis.csv",
    "other.csv",
]
# Destination CSV that receives one status row per input row.
output_file = "url_status_output.csv"

def check_url(url):
    """Probe a URL and classify whether it actually serves data.

    Args:
        url: The value of a CSV cell. May be a string, NaN/None, or any
            other scalar read by pandas.

    Returns:
        "EMPTY" if the cell is missing or contains only whitespace;
        "WORKING" if the server answers 200/206 and yields a non-empty
        first chunk; "NO_DATA" if it answers 200/206 but the first chunk
        is empty or absent; the integer HTTP status code for any other
        response; "ERROR: <ExceptionClassName>" if the request raises.
    """
    if pd.isna(url) or str(url).strip() == "":
        return "EMPTY"

    if isinstance(url, str):
        # CSV cells often carry stray padding/newlines; send the same
        # stripped value that the emptiness check above looked at so a
        # valid URL is not misreported because of surrounding whitespace.
        url = url.strip()

    # Identify as a media player; the User-Agent string is VLC's.
    headers = {
        "User-Agent": "VLC/3.0.18 LibVLC/3.0.18"
    }

    try:
        # stream=True defers the body download so only the first chunk is
        # fetched; timeout is (connect, read) seconds; verify=False skips
        # TLS certificate verification.
        with requests.get(
            url,
            headers=headers,
            timeout=(5, 20),
            stream=True,
            allow_redirects=True,
            verify=False,
        ) as response:
            if response.status_code in [200, 206]:
                # Pull at most one 1 KiB chunk to prove data is flowing.
                chunk = next(response.iter_content(1024), None)
                if chunk:
                    return "WORKING"
                else:
                    return "NO_DATA"
            return response.status_code

    except Exception as e:
        # Deliberately broad: any failure is recorded as a status string
        # for this cell instead of aborting the whole run.
        return f"ERROR: {type(e).__name__}"

# --------------------------------------------------
# STEP 1: Collect union of all columns
# --------------------------------------------------
# Seed the set with "channel" so it can always be taken out again below,
# even if no input file declares that column.
all_columns = {"channel"}

for file in input_files:
    # Only the header is needed here, so read a single data row.
    df = pd.read_csv(file, nrows=1)
    all_columns |= set(df.columns)

# "channel" gets a fixed position in the header, so drop it from the pool
# of remaining columns.
all_columns = [name for name in all_columns if name != "channel"]

# Final header: the two fixed columns first, then the rest alphabetically.
final_columns = ["source_file", "channel", *sorted(all_columns)]


# --------------------------------------------------
# STEP 2: Stream processing
# --------------------------------------------------
# Each result row is written and flushed as soon as it is ready, so the
# output file is usable even if the run is interrupted part-way.
with open(output_file, "w", newline="", encoding="utf-8") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=final_columns)
    writer.writeheader()
    f_out.flush()

    for file in input_files:
        print(f"Processing {file}...")
        df = pd.read_csv(file)

        # Every column except "channel" is passed to check_url.
        url_columns = [name for name in df.columns if name != "channel"]

        for _, row in df.iterrows():
            # Start from a blank row so columns absent from this file
            # stay empty in the output.
            output_row = dict.fromkeys(final_columns, "")
            output_row["source_file"] = file
            output_row["channel"] = row["channel"]

            for col in url_columns:
                output_row[col] = check_url(row[col])

            writer.writerow(output_row)
            f_out.flush()  # push the row to disk immediately

            time.sleep(0.05)  # small pause between rows

print("Done.")

Processing football.csv...




Processing tennis.csv...




Processing other.csv...




Done.
