# Redirect Script
Run the cell below this one\
Change the **index_range = slice(0,300000)** line to restrict processing to a specific range of rows (for example, `slice(0,500)` processes only the first 500 rows)

In [5]:
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
from urllib.parse import urlparse
import validators
from IPython.display import clear_output
import time

# Wall-clock start; the script's final print reports the time elapsed since here.
start_time = time.time()
    
# Patch asyncio via nest_asyncio so the event loop can be re-entered
# (presumably needed because the notebook kernel already runs a loop — confirm).
nest_asyncio.apply()
# Size of the semaphore in process_urls: at most this many HEAD requests
# are in flight at the same time.
MAX_CONCURRENT_REQUESTS = 10
# Change this line for more precise ranges.
# The same slice selects the 'Website' rows to read and the
# 'Website Redirect' rows to write back.
index_range = slice(0,300000)

def initial_processing(url):
    """Convert one raw cell value into a sanitized URL string.

    Falsy values (``None``, ``''``) and missing values (NaN, or anything
    else ``pd.isna`` reports as missing) yield ``''``. Every other value is
    passed to ``sanitize_url`` and that function's result is returned.
    """
    # `url != url` is only true for NaN-like values; the checks run in the
    # same short-circuit order as a chained `or` would.
    usable = bool(url) and not (url != url) and not pd.isna(url)
    return sanitize_url(url) if usable else ''

# Function to sanitize/correct URLs missing pieces
def sanitize_url(url):
    """Return *url* as an absolute URL, assuming ``http`` when no scheme is given.

    Surrounding whitespace is removed first. If the text already carries a
    scheme it is returned in the form ``urlparse(...).geturl()`` produces.
    Otherwise ``http`` is prepended to the original text, so the path,
    params, query and fragment keep their ``;``, ``?`` and ``#`` delimiters.

    Args:
        url: URL text, with or without a scheme (``example.com/a?b=1``,
            ``//example.com/a`` and ``https://example.com/a`` are all accepted).

    Returns:
        The corrected URL string.
    """
    cleaned = url.strip()
    parsed_url = urlparse(cleaned)

    if parsed_url.scheme:
        return parsed_url.geturl()

    # Prefix the text itself instead of gluing the parsed components back
    # together: concatenating path + params + query + fragment would drop
    # the delimiters between them and corrupt any URL that has a query
    # string or fragment.
    if cleaned.startswith('//'):
        # Scheme-relative form ("//host/path") only lacks the "http:" part.
        return 'http:' + cleaned
    return 'http://' + cleaned

async def check_url(session, url, semaphore):
    """Issue a HEAD request for *url* and report where it finally lands.

    The request is made while holding *semaphore*, follows redirects and
    uses a timeout of 10.

    Args:
        session: object whose ``head(url, allow_redirects=..., timeout=...)``
            returns an async context manager yielding a response with a
            ``url`` attribute (an ``aiohttp.ClientSession`` in this script).
        url: address to request.
        semaphore: async context manager limiting concurrent requests.

    Returns:
        ``str(response.url)`` on success, otherwise one of the marker
        strings ``'Timeout Error'``, ``'Client Error'`` or ``'Value Error'``.
    """
    async with semaphore:
        try:
            request = session.head(url, allow_redirects=True, timeout=10)
            async with request as reply:
                final_location = str(reply.url)
        except asyncio.TimeoutError:
            return 'Timeout Error'
        except aiohttp.ClientError:
            return 'Client Error'
        except ValueError:
            return 'Value Error'
        else:
            return final_location

async def process_urls(urls):
    """Run ``check_url`` over every entry of *urls* with bounded concurrency.

    One ``aiohttp.ClientSession`` is shared by all requests, and a semaphore
    of size ``MAX_CONCURRENT_REQUESTS`` limits how many run at once.

    Args:
        urls: sequence of URL strings to check.

    Returns:
        List of ``check_url`` results, in the same order as *urls*.
    """
    print(f"processing {len(urls)} urls")
    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession() as session:
        pending = (check_url(session, target, limiter) for target in urls)
        # gather returns results in the order the awaitables were passed.
        return await asyncio.gather(*pending)

def update_redirect_urls(file_path, urls, redirect_urls, rows=None):
    """Write resolved URLs into the CSV's 'Website Redirect' column.

    The CSV at *file_path* is re-read, the selected rows of the
    'Website Redirect' column are overwritten with *redirect_urls*, and the
    whole file is written back in place (without the index).

    Args:
        file_path: path of the CSV file to update.
        urls: unused; kept so existing callers keep working.
        redirect_urls: values to store, one per selected row.
        rows: positional row selector (normally a ``slice``). Defaults to
            the module-level ``index_range``.

    Raises:
        ValueError: if the number of values does not match the selected rows.
    """
    if rows is None:
        rows = index_range

    column = 'Website Redirect'
    df = pd.read_csv(file_path, low_memory=False)

    # The column may not exist yet on a first run; create it empty.
    if column not in df.columns:
        df[column] = None
    # An all-empty column is read back as float NaN; make it object dtype so
    # string results can be stored without a dtype conflict.
    df[column] = df[column].astype(object)

    # Single positional assignment on the frame itself. Chained indexing
    # (df[col][rows] = ...) may write to a temporary copy and leave the
    # frame unchanged.
    df.iloc[rows, df.columns.get_loc(column)] = redirect_urls
    df.to_csv(file_path, index=False)

# Driver: read the CSV, resolve the final (post-redirect) URL for every row
# selected by index_range, and write the results back to the same file.
file_path = './Website_Redirects_230919.csv'
df = pd.read_csv(file_path, low_memory=False)

if 'Website' not in df.columns:
    print("The CSV file must have a 'Website' column containing the URLs.")
else:
    raw_urls = df['Website'][index_range].tolist()
    # Falls back to an empty list when the column does not exist yet.
    redirect_urls = df.get('Website Redirect', pd.Series(dtype=str)).tolist()[index_range]

    # Reuse results from an earlier run: where 'Website Redirect' already
    # holds a valid URL, start from it instead of the raw 'Website' value.
    # Unpopulated cells are read as NaN (a truthy float), so require a str
    # before handing the value to validators.
    for i, redirect_url in enumerate(redirect_urls):
        if isinstance(redirect_url, str) and redirect_url and validators.url(redirect_url):
            raw_urls[i] = redirect_url

    # Sanitize each URL; anything that still fails validation becomes ''
    sanitized_urls = [initial_processing(url) for url in raw_urls]
    valid_urls = [url if validators.url(url) else '' for url in sanitized_urls]

    # Resolve all URLs concurrently on the current event loop
    # (re-entry is made possible by nest_asyncio.apply() earlier in this cell).
    loop = asyncio.get_event_loop()
    final_urls = loop.run_until_complete(process_urls(valid_urls))

    # Update 'Website Redirect' column in the CSV file with final URLs
    update_redirect_urls(file_path, valid_urls, final_urls)

    print(f"'Website Redirect' column updated in {time.time()-start_time} seconds.")

processing 500 urls
'Website Redirect' column updated in 146.62903261184692 seconds.
