# Redirect Script
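Run the cell below this one.\
To process a different range of rows, edit the **index_range = slice(start, stop)** line inside `main()` (for example, **index_range = slice(0,300000)**); narrower ranges give more precise control over which rows are checked.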

In [1]:
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
from urllib.parse import urlparse
import validators
import time
    
nest_asyncio.apply()

def initial_processing(url):
    """Turn one raw 'Website' cell into a URL string ready for validation.

    Args:
        url: Raw cell value. May be a string, or a non-string placeholder
            such as None, a float NaN, pd.NA or a number.

    Returns:
        '' when the cell is empty or not a string; otherwise the result of
        sanitize_url(url).
    """
    # Check the type first: only strings can be parsed as URLs, and
    # truth-testing pd.NA (as `not url` would) raises TypeError.
    if not isinstance(url, str) or not url:
        return ''

    # Sanitize URL
    return sanitize_url(url)

# Function to sanitize/correct URLs missing pieces
def sanitize_url(url):
    """Return url with an explicit scheme, defaulting to http.

    Args:
        url: URL string, possibly without a scheme ('example.com/page') or
            scheme-relative ('//example.com/page').

    Returns:
        The URL unchanged (as re-serialised by urlparse) when it already has
        a scheme; otherwise the same URL prefixed with the http scheme. Query
        strings, params and fragments are preserved together with their
        '?', ';' and '#' delimiters.
    """
    parsed_url = urlparse(url)

    if parsed_url.scheme:
        return parsed_url.geturl()

    if parsed_url.netloc:
        # Scheme-relative URL: the host was parsed correctly, so only the
        # scheme needs filling in.
        return parsed_url._replace(scheme='http').geturl()

    # Without a scheme or leading '//', urlparse puts the host into `path`.
    # Prefix the original text instead of re-joining the parsed components,
    # which would drop the delimiters between path, query and fragment.
    return 'http://' + url

async def check_url(session, url, semaphore):
    """Follow redirects for url with a HEAD request and report the final URL.

    Args:
        session: Open aiohttp.ClientSession used to send the request.
        url: URL string to check.
        semaphore: asyncio.Semaphore limiting how many checks run at once.

    Returns:
        The final URL (after redirects) as a string, or one of the marker
        strings 'Timeout Error', 'Client Error' or 'Value Error' when the
        request fails with the corresponding exception type.
    """
    # Total time budget, in seconds, for the whole request. Passing a bare
    # number is only kept by aiohttp for backward compatibility.
    timeout = aiohttp.ClientTimeout(total=100)

    async with semaphore:
        try:
            async with session.head(url, allow_redirects=True, timeout=timeout) as response:
                return str(response.url)  # Return final URL as string
        # Failures become marker strings so the batch keeps one entry per URL
        except asyncio.TimeoutError:
            return 'Timeout Error'
        except aiohttp.ClientError:
            return 'Client Error'
        except ValueError:
            return 'Value Error'

async def process_urls(urls, MAX_CONCURRENT_REQUESTS):
    """Check every URL concurrently and return the outcomes in input order.

    Args:
        urls: List of URL strings to check.
        MAX_CONCURRENT_REQUESTS: Upper bound on checks in flight at once
            (size of the semaphore handed to check_url).

    Returns:
        A list with one string per input URL: whatever check_url returned,
        or 'Unknown Error' if the check raised an exception that check_url
        does not handle itself.
    """
    print(f"processing {len(urls)} urls")
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    # NOTE(review): aiohttp's default connector allows 100 simultaneous
    # connections per session, so a larger semaphore does not by itself raise
    # the real concurrency -- confirm whether a custom connector limit is wanted.
    async with aiohttp.ClientSession() as session:
        tasks = [check_url(session, url, semaphore) for url in urls]
        # return_exceptions=True keeps one unexpected failure from aborting
        # the whole batch and discarding every other result.
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)

    results = [
        'Unknown Error' if isinstance(outcome, BaseException) else outcome
        for outcome in outcomes
    ]
    print('re', results)
    return results

def update_redirect_urls(file_path, index_range, redirect_urls):
    """Write redirect_urls into the 'Website Redirect' column of a CSV file.

    The file is re-read, the rows selected by index_range are overwritten
    with redirect_urls, and the file is saved back in place.

    Args:
        file_path: Path of the CSV file to update.
        index_range: slice of row positions that redirect_urls belongs to.
        redirect_urls: List of values for those rows, in row order.

    Raises:
        ValueError: If redirect_urls does not fit the rows selected by
            index_range (the column length would no longer match the frame).
    """
    column = 'Website Redirect'
    df = pd.read_csv(file_path, low_memory=False)

    if column in df.columns:
        values = df[column].tolist()
    else:
        # First run on a file that has no redirect column yet: start empty.
        values = [float('nan')] * len(df)

    # Slice assignment also handles open-ended slices such as slice(None, n),
    # which manual prefix/suffix concatenation does not.
    values[index_range] = redirect_urls

    df[column] = values
    df.to_csv(file_path, index=False)

def main():
    """Resolve redirects for one slice of the CSV and write the results back.

    Reads './Website_Redirects_230919.csv', takes the rows selected by
    index_range, prefers an already stored valid redirect over the raw
    'Website' value, resolves every URL concurrently and stores the outcome
    in the 'Website Redirect' column of the same file.
    """
    start_time = time.time()
    MAX_CONCURRENT_REQUESTS = 1000

    file_path = './Website_Redirects_230919.csv'
    df = pd.read_csv(file_path, low_memory=False)

    # Change this line for more precise ranges
    index_range = slice(160000,170000)

    if 'Website' not in df.columns:
        print("The CSV file must have a 'Website' column containing the URLs.")
        return

    # .iloc makes it explicit that index_range holds row positions.
    raw_urls = df['Website'].iloc[index_range].tolist()
    redirect_urls = df.get('Website Redirect', pd.Series(dtype=str)).tolist()[index_range]

    # Reuse a previously stored redirect when it is a valid URL. Cells that
    # are not strings (e.g. NaN for empty cells) are skipped so they never
    # reach the validator.
    for i, redirect_url in enumerate(redirect_urls):
        if isinstance(redirect_url, str) and validators.url(redirect_url):
            raw_urls[i] = redirect_url

    # Normalise the URLs; anything that still fails validation becomes ''
    # so the result list keeps one entry per row.
    sanitized_urls = [initial_processing(url) for url in raw_urls]
    valid_urls = [url if validators.url(url) else '' for url in sanitized_urls]

    # Run the asynchronous function using asyncio.run()
    final_urls = asyncio.run(process_urls(valid_urls, MAX_CONCURRENT_REQUESTS))

    # Update 'Website Redirect' column in the CSV file with final URLs
    update_redirect_urls(file_path, index_range, final_urls)

    print(f"'Website Redirect' column updated in {time.time()-start_time} seconds.")

# Run the batch when this code is executed directly (notebook cell or script),
# but not when it is imported from another module.
if __name__ == "__main__":
    main()

processing 10000 urls


Can not load response cookies: Illegal key 'o2GQoupQ4a6yP1dff/CfG3OtpYnQChInJHu4bVAUi4g'
Can not load response cookies: Illegal key '/_ProfileCookie'


re ['Client Error', 'https://ascent360.com', 'https://www.novel.com', 'http://www.gichner.us', 'https://www.ifoodds.com', 'https://www.quantasy.com/', 'http://www.shellnetworks.com', 'https://www.newwaypro.com', 'https://www.gravityusa.com', 'https://www.novarad.net', 'https://snitechnology.net', 'https://www.fullpicture.com', 'Client Error', 'https://www.premierwalking.com', 'https://www.uniqueinfotec.com/', 'https://www.multimatic.com', 'https://cheero.net', 'Client Error', 'https://www.bartonchicago.com', 'https://seikoinstruments.com', 'https://www.groundwatersoftware.com', 'http://www.basspro.com', 'https://www.autosuccessonline.com', 'https://www.kansasgirlscouts.org/', 'https://www.ati-online.com', 'https://www.bassett.org', 'https://www.blue-berry.com', 'https://www.nsc.org', 'https://dan.com/buy-domain/matchgo.com?redirected=true', 'Client Error', 'https://canadaprimemarketing.com/', 'https://www.asa.org', 'Client Error', 'https://beelinedelivery.com', 'https://www.ringringmar

In [38]:
# Share of processed rows whose stored redirect is an error marker.
file_path = './Website_Redirects_230919.csv'
df = pd.read_csv(file_path, low_memory=False)

# Only string cells are counted; non-string cells (e.g. NaN for empty cells)
# are skipped.
processed = [rd for rd in df['Website Redirect'] if isinstance(rd, str)]
count = len(processed)
# Every failure marker written by the redirect script contains 'Error'.
errno = sum('Error' in rd for rd in processed)

# Guard against a file with no processed rows instead of dividing by zero.
errno / count if count else 0.0

0.33894117647058825