In [34]:
import math
import time
import pandas as pd
import requests
import validators
from urllib.parse import urlparse, urlunparse
from concurrent.futures import ThreadPoolExecutor

# Function to check if a URL is valid
def is_valid_url(url):
    if not url or url != url:
        return True
    return validators.url(url)

# Function to sanitize/correct URLs missing pieces
def sanitize_url(url):
    # Parse url to correct any issues then reconstruct
    parsed_url = urlparse(url)
    
    if not parsed_url.scheme:
        # Assume http scheme
#         corrected_url = f"http://{parsed_url.netloc}{parsed_url.path}"
        corrected_url = 'http://'+parsed_url.netloc + parsed_url.path + parsed_url.params + parsed_url.query + parsed_url.fragment
    else:
        corrected_url = parsed_url.geturl()
        
    return corrected_url

# Function to get the redirected URL
def get_final_redirect_url(url): 
    try:
        # This handles a nonexistent url and 'url != url' handles the NA -> nan case
        if not url or url != url:
            return ''
        
        # Sanitize url
        corrected_url = sanitize_url(url)
        
        # Redirects set to true by default
#         response = requests.get(corrected_url, allow_redirects = True, stream = True, timeout = 4, headers={"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"})
        # DEBUG - requests.head is a slightly shorter call as it doesn't return entire html body of request
        response = requests.head(corrected_url, allow_redirects = True, stream = True, timeout = 5, headers={"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"})
        final_url = response.url
        return final_url
    except requests.exceptions.RequestException as re:
        # DEBUG
#         print(f"An error occured:{e}")
        return ''
    except requests.exception.Timeout as te:
        return 'Timeout Exception'

# Start script
start_time = time.time()
max_workers = 0

# Load csv
file_path = 'Website_Redirects_230919.csv'
# low_memory=False to avoid dtype error message 
df = pd.read_csv(file_path, low_memory=False)

#Ensure source file has 'Websites' column
if 'Website' not in df.columns:
    print('Original csv file must contain a \'Website\' column')
else:
    # Create ThreadPoolExecutor
    max_workers = 100
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Define parallel URL processing function
        def process_url(url):
            # DEBUG
#             print(url)
            if is_valid_url(url):
                return get_final_redirect_url(url)
            else:
                return get_final_redirect_url(sanitize_url(url))
            
        # Use executor to map processing function to URLs
#         df['Website Redirect'] = list(executor.map(process_url, df['Website'][:20]))
        redirects = list(executor.map(process_url, df['Website'][0:1000]))
        # DEBUG
        print(redirects)
    # Save the modified Dataframe back to the CSV file
#     df.to_csv(file_path, index=False)

    
f'Done in {time.time() - start_time}, {redirects.count("")} broken urls and {redirects.count("Timeout Exception")} timeouts with {max_workers} max workers.'

['https://sunstonepartners.com/', 'https://www.appliedlearning.com/', 'https://www.forsalebyowner.com/', 'http://comsatmedia-en.tumblr.com/', 'https://www.kdanmobile.com/', 'https://www.sunchlorellausa.com/', 'https://www.papemh.com/', 'https://everstream.net/', 'https://wackerbrewing.com/', 'https://21stsoft.com/', 'https://marketingbysos.com/', 'https://www.wittmann-group.com/en', 'https://www.xelapack.com/', 'https://epicio.com/', 'https://mammachia.com/', 'https://cambli.com/', '', 'https://www.river-run.com/', 'https://www.re-soft.com/', 'https://www.pavestone.com/', 'https://www.framedisplays.com/', 'https://www.virtually-anywhere.com/', 'https://abak.hopem.com/', 'https://jswsteel.us/', 'https://www.glr.qc.ca/', 'http://www.woodrock.com/', 'https://tracegenomics.com/', 'https://avada.com/', '', 'https://www.cthedge.org/', 'https://greenecowalls.com/', 'https://www.ripoffreportremovalhelp.com/', 'https://www.kumi-na.com/', 'https://libertynet.com', 'https://www.ptechnosoft.com/',

'Done in 204.8988757133484, 115 broken urls and 0 timeouts with 100 max workers.'

In [10]:
time2 = time.time()
t_u = 'www.glcom.net'
t_u = sanitize_url(t_u)

# t_u = requests.head(t_u, allow_redirects=True)
t_u = requests.get(t_u, allow_redirects=True)

t_u.url, time.time() - time2

('https://everstream.net/', 2.3883795738220215)

In [15]:
time3 = time.time()

t_u2 = 'www.re-soft.com'
t_u2 = sanitize_url(t_u2)

try:
    t_u2 = requests.get(t_u2, allow_redirects = True, stream = True, timeout = 5, headers={"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"})
except (requests.exceptions.RequestException,requests.exceptions.Timeout) as (e1,e2):
        # DEBUG
#         print(f"An error occured:{e}")
        print(e1,'\n',e2)
        return ''

t_u2.url, time.time() - time3

SyntaxError: invalid syntax (<ipython-input-15-522fbf3b402e>, line 8)

In [None]:
    # Iterate through the URLs in 'Websites' column and capture the final redirected URLs
#     for index, row in df.head().iterrows():
#     for index, row in df.iloc[:100].iterrows():
#     original_url = row['Website']

    # Applies get_final_redirect_url to entire 'Website' column
    # then creates new column with redirects      
#     df['Website Redirects'] = df['Website'][:1000].apply(get_final_redirect_url)
#         df['Website Redirects'] = df['Website'].apply(get_final_redirect_url)

    # Save the modified Dataframe back to the CSV file
#     df.to_csv(file_path, index=False)

    # Count for counting errors
#     count = 0
#     for index, row in df[:301].iterrows():
#         original_url = row['Website']
#         print(original_url)
#         final_url = get_final_redirect_url(original_url)
#         if final_url == '':
#             count+=1
#             continue
#         if final_url != original_url+'/':
#             print(original_url, '->',final_url)
            
#         if final_url == original_url:
#             print(original_url)

# df.head()['Website']

In [44]:
type(df['Website'])

pandas.core.series.Series

In [36]:
is_valid_url('https://sunstonepartners.com/')

thisguy = urlparse('www.glcom.net')
f"http://{thisguy.netloc}{thisguy.path}"
# print(thisguy)
# thisguy._replace(scheme='http')
# print(thisguy)
# thisguy.geturl()
thisguy

ParseResult(scheme='', netloc='', path='www.glcom.net', params='', query='', fragment='')

In [30]:
get_final_redirect_url('https://xelapack.com')

'https://www.xelapack.com/'

In [5]:
for index, row in df.iloc[:2].iterrows():
    print(row)

Company ID (18 Char)              0018X00002wGCv0QAG
Company Record Type            Sponsor / Lender / IB
Company Name                       Sunstone Partners
Website                 https://sunstonepartners.com
Website Redirects                                NaN
Name: 0, dtype: object
Company ID (18 Char)                 0018X000037J7rtQAC
Company Record Type                            Prospect
Company Name                      Applied Learning Labs
Website                 https://www.appliedlearning.com
Website Redirects                                   NaN
Name: 1, dtype: object


In [17]:
import requests

# responser = requests.get('http://www.glcom.net', allow_redirects=True)
# responser = requests.head('http://www.glcom.net', allow_redirects=True)
responser = requests.head('https://hawksoftinc.com', allow_redirects=True)
responser.url
# responser

ConnectionError: HTTPSConnectionPool(host='hawksoftinc.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000236257C2DF0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [6]:
from bs4 import BeautifulSoup,SoupStrainer
import urllib.request
import colorama,re,queue,threading
from colorama import Fore
from urllib.parse import *

class check_link():
    def __init__(self,address):
        self.address=address        
    def check(self,address):   
        try:
            req=urllib.request.Request(url=address)
            resp=urllib.request.urlopen(req)
            if resp.status in [400,404,403,408,409,501,502,503]:
                print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)          
            else: print (Fore.GREEN+"no problem in-->"+address)
                              
        except Exception as e:
            print (Fore.YELLOW+"{}-{}".format(e,address))
            pass   
def pattern_adjust(a):  
    try:
        if re.match('^#' ,a):return 0 
        r=urlsplit(a)
        if r.scheme=='' and (r.netloc!='' or r.path!=''):
            d=urlunsplit(r)
            if re.match('^//' ,d):
                m= re.search('(?<=//)\S+', d)
                d=m.group(0)  
                m="https://"+d
                return m
        elif r.scheme=='' and r.netloc=='':
            return address+a
        else:return a
    except Exception as e:
        pass
def extract_link(address):
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    for key,value in iter(tags.items()):    
        try:
            headers={"User-Agent": "Mozilla/5.0"}
            res=urllib.request.urlopen(urllib.request.Request(url=address, headers=headers))
            response=res.read().decode('utf-8') #needs improvement
            for link in BeautifulSoup(response,"html.parser",parse_only=SoupStrainer(key)): 
                if link.has_attr(value) and address in link[value]: # address in link[value] to keep testing the target site only
                    p=pattern_adjust(link[value])
                    if p!=0 and str(p)!='None':        
                        newcheck=check_link(p)
                        newcheck.check(p)
                        if p not in hyperlinks:
                            hyperlinks.add(p)
                            if website.split('.')[1] in p:#needs improvement
                                if not website.endswith(('.png','.jpeg','.js','jpg')):
                                    q.put(p)                    
        except Exception as e:
            print (e,address)                                
def threader():
    while True:
        value=q.get()  
        result=extract_link(value)
        q.task_done()

if __name__=="__main__":
    colorama.init()
    q=queue.Queue()
    global hyperlinks,website
    hyperlinks=set()
#     website= 'https://www.sozcu.com.tr/' #Target website 
#     website = 'http://www.hawksoftinc.com'
    website = 'http://www.glcom.net'
    for x in range(30):
        t=threading.Thread(target=threader)
        t.deamon=True
        t.start()   
    q.put(website.strip())
    q.join()

In [None]:
import math
import time
import pandas as pd
import requests
import validators
from urllib.parse import urlparse, urlunparse
from concurrent.futures import ThreadPoolExecutor

# Function to check if a URL is valid
def is_valid_url(url):
    if not url or url != url:
        return True
    return validators.url(url)

# Function to sanitize/correct URLs missing pieces
def sanitize_url(url):
    # Parse url to correct any issues then reconstruct
    parsed_url = urlparse(url)
    
    if not parsed_url.scheme:
        # Assume http scheme
#         corrected_url = f"http://{parsed_url.netloc}{parsed_url.path}"
        corrected_url = 'http://'+parsed_url.netloc + parsed_url.path + parsed_url.params + parsed_url.query + parsed_url.fragment
    else:
        corrected_url = parsed_url.geturl()
        
    return corrected_url

# Function to get the redirected URL
def get_final_redirect_url(url): 
    try:
        # This handles a nonexistent url and 'url != url' handles the NA -> nan case
        if not url or url != url:
            return ''
        
        # Sanitize url
        corrected_url = sanitize_url(url)
        
        # Redirects set to true by default
#         response = requests.get(corrected_url, allow_redirects = True, stream = True, timeout = 4, headers={"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"})
        # DEBUG - requests.head is a slightly shorter call as it doesn't return entire html body of request
        response = requests.head(corrected_url, allow_redirects = True, stream = True, timeout = 5, headers={"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"})
        final_url = response.url
        return final_url
    except requests.exceptions.RequestException as re:
        # DEBUG
#         print(f"An error occured:{e}")
        return ''
    except requests.exception.Timeout as te:
        return 'Timeout Exception'

# Start script
start_time = time.time()
max_workers = 0

# Load csv
file_path = 'Website_Redirects_230919.csv'
# low_memory=False to avoid dtype error message 
df = pd.read_csv(file_path, low_memory=False)

#Ensure source file has 'Websites' column
if 'Website' not in df.columns:
    print('Original csv file must contain a \'Website\' column')
else:
    # Create ThreadPoolExecutor
    max_workers = 100
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Define parallel URL processing function
        def process_url(url):
            # DEBUG
#             print(url)
            if is_valid_url(url):
                return get_final_redirect_url(url)
            else:
                return get_final_redirect_url(sanitize_url(url))
            
        # Use executor to map processing function to URLs
#         df['Website Redirect'] = list(executor.map(process_url, df['Website'][:20]))
        redirects = list(executor.map(process_url, df['Website'][0:1000]))
        # DEBUG
        print(redirects)
    # Save the modified Dataframe back to the CSV file
#     df.to_csv(file_path, index=False)

    
f'Done in {time.time() - start_time}, {redirects.count("")} broken urls and {redirects.count("Timeout Exception")} timeouts with {max_workers} max workers.'

In [33]:
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
from urllib.parse import urlparse
import validators

# Note: cannot have excel sheet open while doing this for permission.
# TOOD: (1) do with semaphores instead of threads (?)
        #(2) research why this is better
        #(3) install and use aiodns
        #(4) cache bad urls in excel column

nest_asyncio.apply()
MAX_CONCURRENT_REQUESTS = 10

async def check_url(session, url, semaphore):
    # DEBUG
    async with semaphore:
        try:
            async with session.head(url, allow_redirects=True, timeout=10) as response:
                return str(response.url) # Return final URL as string
        except asyncio.TimeoutError as te:
            print(f"Timed out for url: {url}, {te}")
            return 'Timeout Error'
        except aiohttp.ClientError as ce:
            print(f"Client error for url: {url}, {ce}")
            return 'Client Error'

async def process_urls(urls):
    print('processing',urls)
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession() as session:
        tasks = [check_url(session, url, semaphore) for url in urls]
        results = await asyncio.gather(*tasks)
        return results
        
        # Process responses
#         for final_url in results:
#             if final_url:
#                 print(f"URL: {final_url}")
#             else:
#                 print("Request failed or timed out")
                
def initial_processing(url):
    if not url or url != url or pd.isna(url):
            return ''
        
    # Sanitize url -> TODO change name so not same as in sanitize?
    corrected_url = sanitize_url(url)
    return corrected_url

def update_redirect_urls(file_path, urls, redirect_urls):
    df = pd.read_csv(file_path, low_memory=False)
    print(redirect_urls)
    df['Website Redirect'][:100] = redirect_urls
    df.to_csv(file_path, index=False)

# Function to check if a URL is valid
async def is_valid_url(url):
    if not url or url != url:
        return True
    return validators.url(url)

# Function to sanitize/correct URLs missing pieces
def sanitize_url(url):
    # Parse url to correct any issues then reconstruct
    parsed_url = urlparse(url)
    
    if not parsed_url.scheme:
        # Assume http scheme
#         corrected_url = f"http://{parsed_url.netloc}{parsed_url.path}"
        corrected_url = 'http://'+parsed_url.netloc + parsed_url.path + parsed_url.params + parsed_url.query + parsed_url.fragment
    else:
        corrected_url = parsed_url.geturl()
        
    return corrected_url

file_path = 'Website_Redirects_230919.csv'
df = pd.read_csv(file_path, low_memory=False)

if 'Website' not in df.columns:
    print("The CSV file must have a 'Websites' column containing the URLs.")
else:
    raw_urls = df['Website'][:100].tolist()
    redirect_urls = df.get('Website Redirect', pd.Series(dtype=str)).tolist()
    
    # Check if 'Website Redirect' column is already populated
    for i, redirect_url in enumerate(redirect_urls):
        if redirect_url and validators.url(redirect_url):
            raw_urls[i] = redirect_url
#         elif redirect_url == 'Timeout Error'
            
    # Process the URLs asynchronously
    sanitized_urls = [initial_processing(url) for url in raw_urls]
    valid_urls = [url for url in sanitized_urls if validators.url(url)]

    # Run the asynchronous function using asyncio.run()
    loop = asyncio.get_event_loop()
    final_urls = loop.run_until_complete(process_urls(valid_urls))

    # Update 'Website Redirect' column in the CSV file with final URLs
    update_redirect_urls(file_path, valid_urls, final_urls)
    
    # Sanitize URLs and remove NaN values
#     sanitized_urls = [initial_processing(url) for url in raw_urls]
#     # DEBUG
# #     sanitized_urls = raw_urls.apply(sanitize_url)
#     valid_urls = [url for url in sanitized_urls if validators.url(url)]
    
#     # Check if an event loop is already running
#     if asyncio.get_event_loop().is_running():
#         loop = asyncio.get_event_loop()
#     else:
#         loop = asyncio.new_event_loop()
#         asyncio.set_event_loop(loop)

#     # Asynchronously check URLs
#     loop = asyncio.get_event_loop()
#     print('jere')
#     results = loop.run_until_complete(process_urls(valid_urls))
#     print('results', results)

#     # Update DataFrame with results
#     df['Website Redirect'] = results

    # Save the modified DataFrame back to the CSV file
#     df.to_csv(file_path, index=False)
    print("Redirects have been added to the CSV file.")

processing ['https://sunstonepartners.com/', 'https://www.appliedlearning.com/', 'https://www.forsalebyowner.com/', 'http://comsatmedia-en.tumblr.com/', 'https://www.kdanmobile.com/', 'https://www.sunchlorellausa.com', 'https://www.papemh.com', 'https://everstream.net/', 'https://wackerbrewing.com/', 'https://21stsoft.com', 'https://marketingbysos.com', 'https://www.wittmann-group.com/en', 'https://www.xelapack.com/', 'https://epicio.com', 'https://mammachia.com', 'https://cambli.com', 'http://hawksoftinc.com', 'https://www.river-run.com', 'https://www.re-soft.com/', 'https://www.pavestone.com', 'https://www.framedisplays.com', 'https://www.virtually-anywhere.com', 'https://abak.hopem.com', 'https://www.jswsteel.us', 'https://www.glr.qc.ca', 'http://www.woodrock.com', 'https://tracegenomics.com', 'https://avada.com', 'https://contactind.com', 'https://www.cthedge.org', 'http://greenecowalls.com', 'https://www.ripoffreportremovalhelp.com', 'https://www.kumi-na.com', 'https://www.cwnetwo

In [23]:
# without semaphore

import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
from urllib.parse import urlparse
import validators

# TODO: Add no response and last fail/run through only fails

async def check_url(session, url):
# async def check_url(session, url, semaphore):
    # DEBUG
#     async with semaphore:
        try:
            async with session.head(url, allow_redirects=True, timeout=10) as response:
                return str(response.url) # Return final URL as string
        except asyncio.TimeoutError as te:
            print(f"Timed out for url: {url}, {te}")
            return None
        except aiohttp.ClientError as ce:
            print(f"Client error for url: {url}, {ce}")
            return None

async def process_urls(urls):
# async def process_urls2(urls):
    print('processing',urls)
#     semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
#     async with aiohttp.ClientSession() as session:
#         tasks = [check_url(session, url, semaphore) for url in urls]
#         results = await asyncio.gather(*tasks)
#         return results
    async with aiohttp.ClientSession() as session:
        tasks = [check_url(session, url) for url in urls]
        return await asyncio.gather(*tasks)

def initial_processing(url):
    if not url or url != url or pd.isna(url):
            return ''
        
    # Sanitize url -> TODO change name so not same as in sanitize?
    corrected_url = sanitize_url(url)
    return corrected_url

def update_redirect_urls(file_path, urls, redirect_urls):
    df = pd.read_csv(file_path, low_memory=False)
    print(redirect_urls)
#     df['Website Redirect'] = redirect_urls
#     df.to_csv(file_path, index=False)

# Function to check if a URL is valid
async def is_valid_url(url):
    if not url or url != url:
        return True
    return validators.url(url)

# Function to sanitize/correct URLs missing pieces
def sanitize_url(url):
    # Parse url to correct any issues then reconstruct
    parsed_url = urlparse(url)
    
    if not parsed_url.scheme:
        # Assume http scheme
#         corrected_url = f"http://{parsed_url.netloc}{parsed_url.path}"
        sanitized_url = 'http://'+parsed_url.netloc + parsed_url.path + parsed_url.params + parsed_url.query + parsed_url.fragment
    else:
        sanitized_url = parsed_url.geturl()
        
    return sanitized_url

file_path = 'Website_Redirects_230919.csv'
df = pd.read_csv(file_path, low_memory=False)

if 'Website' not in df.columns:
    print("The CSV file must have a 'Websites' column containing the URLs.")
else:
    raw_urls = df['Website'][:20].tolist()
    redirect_urls = df['Website Redirect'][:20].tolist()
    input_urls = []
    
    # Check if 'Website Redirect' column is already populated
    for i, redirect_url in enumerate(redirect_urls):
        if redirect_url and validators.url(redirect_url):
            input_urls.append(redirect_url)
        else:
            input_urls.append(raw_urls[i])
            
    # Process the URLs asynchronously
    sanitized_urls = [initial_processing(url) for url in input_urls]
    valid_urls = [url for url in sanitized_urls if validators.url(url)]
    
    print(valid_urls)
    
    final_urls = process_urls(valid_urls)
# final_urls

['https://sunstonepartners.com/', 'https://www.appliedlearning.com/', 'https://www.forsalebyowner.com/', 'http://comsatmedia-en.tumblr.com/', 'https://www.kdanmobile.com/', 'https://www.sunchlorellausa.com', 'https://www.papemh.com', 'http://www.glcom.net', 'http://wackerbrewing.com', 'https://21stsoft.com', 'https://marketingbysos.com', 'https://www.wittmann-group.com', 'https://xelapack.com', 'https://epicio.com', 'https://mammachia.com', 'https://cambli.com', 'http://hawksoftinc.com', 'https://www.river-run.com', 'http://www.re-soft.com', 'https://www.pavestone.com']


In [18]:
import aiohttp

url = 'http://www.glcom.net'

session = aiohttp.ClientSession()
async with session.head(url, timeout=10) as ans:
    print(ans.url)

await asyncio.gather(*tasks)

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000001CF32ACE0A0>


http://www.glcom.net


NameError: name 'tasks' is not defined

## Sent script
With comments/TODOs

In [None]:
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
from urllib.parse import urlparse
import validators
from IPython.display import clear_output
import time

# TODO
  # (1) Increase MAX_CONCURRENT_REQUESTS
  # (2) Figure out way to scan in one click.
    # (a) Don't test already redirected values?
    # (b) Go through all one time, see if works.

start_time = time.time()
    
nest_asyncio.apply()
MAX_CONCURRENT_REQUESTS = 10
# Slice for all websites: slice(0,300000)
index_range = slice(0,500)

def initial_processing(url):
    if not url or url != url or pd.isna(url):
        return ''
    
    # Sanitize URL
    corrected_url = sanitize_url(url)
    return corrected_url

# Function to sanitize/correct URLs missing pieces
def sanitize_url(url):
    # Parse URL to correct any issues then reconstruct
    parsed_url = urlparse(url)

    if not parsed_url.scheme:
    # Assume http scheme
        corrected_url = 'http://'+parsed_url.netloc + parsed_url.path + parsed_url.params + parsed_url.query + parsed_url.fragment
    else:
        corrected_url = parsed_url.geturl()

    return corrected_url

# DEBUG
# async def check_url(session, url, semaphore, i):
async def check_url(session, url, semaphore):
  # DEBUG
  # if i % 10:
    # clear_output()
    async with semaphore:
        try:
            async with session.head(url, allow_redirects=True, timeout=10) as response:
                return str(response.url) # Return final URL as string
        # Catch errors
        except asyncio.TimeoutError as te:
          # print(f"Timed out for url: {url}, {te}")
          return 'Timeout Error'
        except aiohttp.ClientError as ce:
          # print(f"Client error for url: {url}, {ce}")
          return 'Client Error'
        except ValueError as ve:
          # print(f"Value error for url: {url}, {ve}")
          return 'Value Error'

async def process_urls(urls):
    print(f"processing {len(urls)} urls")
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession() as session:
        # tasks = [check_url(session, url, semaphore, i) for i, url in enumerate(urls)]
        tasks = [check_url(session, url, semaphore) for url in urls]
        results = await asyncio.gather(*tasks)
        return results

def update_redirect_urls(file_path, urls, redirect_urls):
    df = pd.read_csv(file_path, low_memory=False)
    # print(redirect_urls)
    df['Website Redirect'][index_range] = redirect_urls
    df.to_csv(file_path, index=False)

file_path = './Website_Redirects_230919.csv'
df = pd.read_csv(file_path, low_memory=False)

if 'Website' not in df.columns:
    print("The CSV file must have a 'Website' column containing the URLs.")
else:
    raw_urls = df['Website'][index_range].tolist()
    redirect_urls = df.get('Website Redirect', pd.Series(dtype=str)).tolist()[index_range]

    # Check if 'Website Redirect' column is already populated (with valid URL)
    for i, redirect_url in enumerate(redirect_urls):
        if redirect_url and validators.url(redirect_url):
            raw_urls[i] = redirect_url

    # Process the URLs asynchronously
    sanitized_urls = [initial_processing(url) for url in raw_urls]
    valid_urls = [url if validators.url(url) else '' for url in sanitized_urls]

    # Run the asynchronous function using asyncio.run()
    loop = asyncio.get_event_loop()
    final_urls = loop.run_until_complete(process_urls(valid_urls))

    # Update 'Website Redirect' column in the CSV file with final URLs
    update_redirect_urls(file_path, valid_urls, final_urls)

    print(f"'Website Redirect' column updated in {time.time()-start_time} seconds.")

## Report

### Threads Times

#### Threads | URLs | Time (seconds)
10  | 100    | 16\
10  | 3000   | 813\
10  | 10000  | 134\
10  | 100000 | 39,000 (11 hours)\
10  | 250000 | 107,000 (29 hours)\
100 | 3000   | 326\
500 | 3000   | 68