# What it is

A script that takes an Excel (.xlsx) file containing pairs of original URLs and the URLs they are expected to redirect to, and reports whether each redirect is valid.

# How to Use it

1. Create an input file containing the URLs to check, modelled on the example sheet, and place it in the 'Inputs' folder. If you just want a feel for how to run the notebook, you can use the default example. The input file name should be enclosed in quotes as shown and must include the .xlsx file extension.
2. **WIP** If you are checking a Master Lock or SentrySafe site, choose whether to check the review or the production site by setting the variable __ENVIRONMENT__ to either 'review' or 'production'.
3. Run the check by going to 'Cell' in the top navigation and selecting 'Run All'.
4. See which URLs passed or failed the check by reading the output below, or by opening the file in the 'Results' folder that carries the timestamp of your last run.

In [22]:
# User input data
#
# REDIRECTS_WORKBOOK is the file name (including the .xlsx extension) of the
# workbook to check; it is joined onto the input folder path in the
# "Imports and constants" cell. REDIRECTS_WORKBOOK_SHEET is the name of the
# sheet inside that workbook that holds the URL pairs.

REDIRECTS_WORKBOOK = 'Example.xlsx'
REDIRECTS_WORKBOOK_SHEET = 'Sheet1'

# Which environment to test: the review site or the production site.
# Used for Master Lock and SentrySafe.
# May not be supported for other sites; to be safe, leave on 'production'.
ENVIRONMENT = 'production'  # must be 'production' or 'review'


In [23]:
# Imports and constants

import re
import sys
import urllib.request as request
from datetime import datetime

import matplotlib
import pandas as pd
import requests
import xlrd
from xlutils.copy import copy

# Folders (relative to the notebook's working directory) that hold the input
# workbook and receive the generated results file.
REDIRECTS_INPUT_FOLDER = 'Inputs/'
REDIRECTS_OUTPUT_FOLDER = 'Results/'

# Full relative path of the workbook named in the user-input cell.
REDIRECTS_INPUT_WORKBOOK_PATH = REDIRECTS_INPUT_FOLDER + REDIRECTS_WORKBOOK

# Open the input workbook for reading and select the sheet with the URL pairs.
# NOTE(review): xlrd 2.0+ reportedly no longer reads .xlsx files — confirm the
# installed xlrd version can open this workbook.
to_check = xlrd.open_workbook(REDIRECTS_INPUT_WORKBOOK_PATH)
to_check_sheet = to_check.sheet_by_name(REDIRECTS_WORKBOOK_SHEET)

# Copy of the workbook made with xlutils; the redirect-check loop uses
# len(check_sheet.rows) from this copy as its row count.
check_wb = copy(to_check) 
check_sheet = check_wb.get_sheet(REDIRECTS_WORKBOOK_SHEET)

# Environment names that change_env() accepts.
ENVIRONMENT_LIST = ['production', 'review']

In [43]:
# Methods to parse data in file

def change_to_prod_https(url):
    """Return ``url`` rewritten as an https production URL.

    A URL that already starts with ``https://`` is returned unchanged.
    Otherwise any leading scheme (e.g. ``http://``) or protocol-relative
    ``//`` prefix is dropped, a leading ``review.`` host label is replaced
    with ``www.``, and ``https://`` is prepended.

    Unlike the previous implementation, the first host label is only
    replaced when it is exactly ``review``; hosts without a subdomain
    (``http://nm.org``) or without any dot (``http://localhost``) are no
    longer mangled or rejected.

    Args:
        url: The URL or bare host/path to convert.

    Returns:
        The https production form of ``url``.
    """
    if url.startswith("https://"):
        return url

    # Strip a leading "<scheme>://" or protocol-relative "//" prefix. The
    # isalnum() check stops a "://" that appears later (e.g. inside a query
    # string) from being mistaken for the URL's own scheme.
    scheme, separator, remainder = url.partition("://")
    if separator and scheme.isalnum():
        bare = remainder
    elif url.startswith("//"):
        bare = url[2:]
    else:
        bare = url

    # Review sites live on the "review." subdomain; production uses "www.".
    review_prefix = "review."
    if bare.startswith(review_prefix):
        bare = "www." + bare[len(review_prefix):]

    return "https://" + bare

def change_to_review(url):
    """Return ``url`` with everything up to its first dot replaced by ``review``.

    The text before the first ``.`` (including any scheme) is discarded and
    ``review.`` is put in front of what remains, so the result carries no
    scheme. Raises ``IndexError`` when ``url`` contains no dot.
    """
    pieces = url.split('.', 1)
    return 'review.' + pieces[1]

def change_env(url, env):
    """Rewrite ``url`` for the requested environment.

    Args:
        url: The URL (or bare host/path) read from the input sheet.
        env: Target environment; must be one of ``ENVIRONMENT_LIST``.

    Returns:
        The result of ``change_to_prod_https(url)`` for ``'production'`` or
        ``change_to_review(url)`` for ``'review'``.

    Raises:
        SystemExit: If ``env`` is not in ``ENVIRONMENT_LIST``, or is listed
            there but has no conversion implemented here. Relies on the
            module-level ``import sys``.
    """
    if env not in ENVIRONMENT_LIST:
        sys.exit("Passed in environment is incorrect")

    if env == 'production':
        return change_to_prod_https(url)
    if env == 'review':
        return change_to_review(url)

    # Only reachable when ENVIRONMENT_LIST gains an entry without a matching
    # branch above.
    sys.exit("This environment has been declared usable, but has not been built out.")

def check_seo_hops(hops):
    """Return an SEO verdict for a redirect chain of ``hops`` redirects.

    Chains of three or more hops get a warning message; shorter chains
    return ``"OK"``.
    """
    too_many_hops = hops >= 3
    return "Correct redirect, but hopped three times or more." if too_many_hops else "OK"

# Testing

The cells below check that the tool is working correctly. If one of these checks fails and the canonical checker still runs, the output file may be incorrect. Reach out, or troubleshoot based on the error that is printed.

When you select 'Run All Cells' and one of these tests fails, the code will stop running at this cell. If you want to continue, you can select the 'Actual Check' cell and run from there, but this is strongly discouraged.

In [39]:
def test_change_to_env(url, env_url, env):
    """Check that ``change_env(url, env)`` produces ``env_url``.

    Prints ``True`` on success. On a mismatch, prints the actual URL, the
    expected URL and the environment, then stops execution via
    ``sys.exit()`` (this relies on the module-level ``import sys``).

    Args:
        url: Input URL handed to ``change_env``.
        env_url: The URL ``change_env`` is expected to return.
        env: Environment name handed to ``change_env``.
    """
    test_url = change_env(url, env)
    if test_url == env_url:
        print("True")
        return

    print("An error occurred. Test url: " + test_url)
    print("Expected url: " + env_url)
    print("Env: " + str(env))
    sys.exit()

# Smoke tests for change_env: each entry is (input url, expected url, environment).
for _case in (
    ("www.sentrysafe.com", "https://www.sentrysafe.com", "production"),
    ("www.sentrysafe.com", "review.sentrysafe.com", "review"),
    ("review.sentrysafe.com", "https://www.sentrysafe.com", "production"),
    ("nm.org", "https://nm.org", "production"),
):
    test_change_to_env(*_case)

True
True
True
True


In [None]:
# Checking the redirects
#
# For each data row of the input sheet, request the original URL (column 0),
# follow any redirects, and compare the final URL with the expected redirect
# (column 1). One result row per input row is collected in list_of_results.

cols = ["result", "status code", "url", "expected redirect", "actual redirect", "hops", "SEO Results"]
list_of_results = pd.DataFrame(columns=cols)

# Row 0 is skipped (presumably a header row — confirm against the input sheet).
for i in range(1, len(check_sheet.rows)):
    # Reset every per-row value so a row without a redirect neither fails on
    # an unbound name nor reports the previous row's redirect data.
    seo_check = "n/a"
    matched_result = ""
    actual_redirect = "n/a"
    hops = 0

    url_to_redirect = change_env(to_check_sheet.cell(i, 0).value.strip(), ENVIRONMENT)
    # NOTE(review): add_https_if_none is not defined in the visible cells —
    # confirm it is defined before this cell runs.
    expected_redirect = add_https_if_none(to_check_sheet.cell(i, 1).value.strip())

    req = requests.get(url_to_redirect)

    # This status code is that of the final response and does not reflect
    # any redirects that happened on the way.
    status_code = req.status_code

    # To check redirects, the history of the response must be inspected.
    # If there is no history, then a redirect did not occur.
    if req.history:
        status_code = req.history[-1].status_code
        hops = len(req.history)
        actual_redirect = req.url
        if status_code != 301:
            matched_result = "Wrong redirect response"
        elif actual_redirect == expected_redirect:
            matched_result = "OK"
            seo_check = check_seo_hops(hops)
        else:
            matched_result = 'Redirected to wrong url'
    else:
        matched_result = "View status code"

    # Append the result to the dataframe for output later
    list_of_results.loc[i] = [matched_result, status_code, url_to_redirect,
                              expected_redirect, actual_redirect, hops, seo_check]

print(list_of_results)

In [12]:
# Run to output the dataframe as an xlsx file in the 'Results' folder.
# The file name carries the current date and time (to the minute), so a
# second run within the same minute overwrites the first.

OUTPUT_FILE = REDIRECTS_OUTPUT_FOLDER + 'redirect-results_'+ datetime.now().strftime("%Y-%m-%d_%H-%M") + '.xlsx'

# Use the writer as a context manager: it saves and closes the file on exit,
# and works on pandas versions where ExcelWriter.save() no longer exists.
with pd.ExcelWriter(OUTPUT_FILE, engine='xlsxwriter') as writer:
    list_of_results.to_excel(writer, sheet_name='Sheet1', index=False)