# What it is

A script that takes an Excel .xlsx file containing pairs of original URLs and the URLs they are expected to redirect to, and reports whether each redirect is valid.

# How to Use it

1. Create an input file containing the URLs to check, modelled on the example sheet, and place it in the 'Inputs' folder. If you just want a feel for how to run the notebook, you can use the default example. Set REDIRECTS_WORKBOOK to the input file name, enclosed in quotes as shown and including the .xlsx file extension.
2. **WIP** Select whether the site to check is review or production by setting the variable __ENVIRONMENT__ to either 'review' or 'production'.
3. Run the check by going to 'Cell' in the top navigation and selecting 'Run All'.
4. View which URLs passed or failed the check by reading the output below, or by going to the 'Results' folder and opening the file with the timestamp of your last run.

FUCK

In [4]:
# User input data

# File name (including the .xlsx extension) of the workbook to check.
# It is appended to the input folder path, so the file must live in that folder.
REDIRECTS_WORKBOOK = 'Example.xlsx'
# Name of the sheet inside that workbook that is read for the url pairs.
REDIRECTS_WORKBOOK_SHEET = 'Sheet1'


In [6]:
# Imports and constants

import urllib.request as request
import pandas as pd
import re
import xlrd
from xlutils.copy import copy
from datetime import datetime
import requests

import matplotlib
# Folders the notebook reads from and writes to, relative to the working directory.
REDIRECTS_INPUT_FOLDER = 'Inputs/'
REDIRECTS_OUTPUT_FOLDER = 'Results/'

# Full relative path of the workbook chosen in the user input cell.
REDIRECTS_INPUT_WORKBOOK_PATH = f"{REDIRECTS_INPUT_FOLDER}{REDIRECTS_WORKBOOK}"

# Read-only view of the input workbook and of the sheet holding the url pairs.
# NOTE(review): xlrd 2.0 and later only read the legacy .xls format, so this
# call needs xlrd < 2.0 to open an .xlsx file - confirm the pinned version.
to_check = xlrd.open_workbook(REDIRECTS_INPUT_WORKBOOK_PATH)
to_check_sheet = to_check.sheet_by_name(REDIRECTS_WORKBOOK_SHEET)

# Writable copy of the same workbook (xlutils) and its matching sheet.
check_wb = copy(to_check)
check_sheet = check_wb.get_sheet(REDIRECTS_WORKBOOK_SHEET)

In [9]:
# Methods to parse data in file

def add_https_if_none(url):
    """Return *url* with an ``https`` scheme added when it has no scheme.

    Args:
        url: The url text to normalise.

    Returns:
        The url unchanged when it is empty or already starts with a scheme
        (``http://``, ``https://``, ...); otherwise the url prefixed so that
        it starts with ``https://``.
    """
    if not url:
        return url
    # Protocol-relative urls already carry the slashes; only the scheme is missing.
    if url.startswith("//"):
        return "https:" + url
    # Only a scheme at the very start counts, so a "://" that appears later
    # (for example inside a query string) does not suppress the prefix.
    if re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url):
        return url
    return "https://" + url

In [11]:
# Checking the redirects

cols = ["result", "status code", "url", "expected redirect", "actual redirect", "hops"]

# Seconds to wait on each request so one unresponsive host cannot stall the whole run.
REQUEST_TIMEOUT_SECONDS = 30


def _check_redirect(url_to_redirect, expected_redirect):
    """Request one url and describe how it redirected.

    Args:
        url_to_redirect: The url to request.
        expected_redirect: The url the request is expected to end up at.

    Returns:
        A list with one value per entry of ``cols``, in the same order.
    """
    # Start from blank values on every call so a url that never redirects
    # cannot report the redirect target or hop count of an earlier url.
    actual_redirect = ""
    hops = 0

    try:
        req = requests.get(url_to_redirect, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as error:
        # Record the failure and carry on instead of losing every result
        # gathered so far because of a single bad url.
        result = "Request failed: " + type(error).__name__
        return [result, None, url_to_redirect, expected_redirect, actual_redirect, hops]

    # This status code is reflective of the last code outputted and will not reflect redirects
    status_code = req.status_code

    # To check redirects, the history of the response must be parsed
    # If there is no history, then a redirect did not occur
    if not req.history:
        return ["View status code", status_code, url_to_redirect, expected_redirect, actual_redirect, hops]

    # Only the status code of the last hop in the chain is inspected.
    status_code = req.history[-1].status_code
    hops = len(req.history)
    actual_redirect = req.url

    if status_code != 301:
        result = "Wrong redirect response"
    elif actual_redirect != expected_redirect:
        result = 'Redirected to wrong url'
    elif hops >= 3:
        # NOTE(review): the check fires at exactly three hops while the message
        # says "greater than three" - confirm which threshold is intended.
        result = "Correct redirect, but hopped greater than three times"
    else:
        result = "OK"

    return [result, status_code, url_to_redirect, expected_redirect, actual_redirect, hops]


# For every row in the input data, check to see that the actual redirect is the same as the desired
# Row 0 is skipped (presumably the header row - confirm against the example sheet).
row_numbers = []
rows = []
for i in range(1, to_check_sheet.nrows):
    # str() keeps non-text cells (numbers, blanks) from raising on .strip().
    url_to_redirect = add_https_if_none(str(to_check_sheet.cell(i, 0).value).strip())
    expected_redirect = add_https_if_none(str(to_check_sheet.cell(i, 1).value).strip())

    row_numbers.append(i)
    rows.append(_check_redirect(url_to_redirect, expected_redirect))

# Build the dataframe once, indexed by sheet row number; dtype=object keeps
# status codes as plain integers even when a failed request has none.
list_of_results = pd.DataFrame(rows, columns=cols, index=row_numbers, dtype=object)

print(list_of_results)

                     result status code  \
1   Redirected to wrong url         301   
2   Redirected to wrong url         301   
3   Redirected to wrong url         301   
4                        OK         301   
5                        OK         301   
6   Redirected to wrong url         301   
7                        OK         301   
8                        OK         301   
9                        OK         301   
10                       OK         301   
11                       OK         301   
12  Redirected to wrong url         301   
13                       OK         301   
14                       OK         301   
15  Redirected to wrong url         301   
16  Redirected to wrong url         301   
17         View status code         404   
18  Redirected to wrong url         301   

                                                  url  \
1                https://www.masterlock.com/index.jsp   
2   https://www.masterlock.com/service-and-support...   
3   https:/

In [12]:
# Run to output the dataframe as an xlsx file in the 'Results' folder

# The file name carries a minute-resolution timestamp, so two runs within
# the same minute write to the same file.
OUTPUT_FILE = REDIRECTS_OUTPUT_FOLDER + 'redirect-results_' + datetime.now().strftime("%Y-%m-%d_%H-%M") + '.xlsx'

# The context manager saves and closes the workbook on exit, even if writing
# fails part-way; ExcelWriter.save() is no longer available in pandas 2.0+.
with pd.ExcelWriter(OUTPUT_FILE, engine='xlsxwriter') as writer:
    list_of_results.to_excel(writer, sheet_name='Sheet1', index=False)